This article collects some tools commonly used when writing crawlers in Java and wraps them into reusable utilities: HtmlUnit's WebClient for running JavaScript and collecting cookies, an HttpUtil helper for sending requests with custom headers, and htmlparser for extracting tags from HTML.
2. A wrapper method for fetching cookies
public List<org.apache.http.cookie.Cookie> getHtmlByWebClient(String url, String method) {
    @SuppressWarnings("resource")
    WebClient webClient = new WebClient();
    webClient.getOptions().setTimeout(100000);
    webClient.getOptions().setCssEnabled(false);        // disable CSS support
    webClient.getOptions().setJavaScriptEnabled(true);  // enable JavaScript support
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: enable AJAX support
    WebResponse response = null;
    try {
        // Request the site first so that it sets its cookies on the WebClient
        try {
            webClient.getPage(url);
        } catch (IOException | FailingHttpStatusCodeException e) {
            // ignore: we only need the cookies the site sets
        }
        // Wait 30 seconds to give the WebClient time to run the page's JS
        TimeUnit.SECONDS.sleep(30L);
        URL ur = new URL(url);
        WebRequest webRequest;
        if ("GET".equals(method)) {
            webRequest = new WebRequest(ur, HttpMethod.GET);
        } else {
            webRequest = new WebRequest(ur, HttpMethod.POST);
        }
        response = webClient.loadWebResponse(webRequest);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        webClient.close();
    }
    Set<Cookie> cookies = null;
    try {
        cookies = webClient.getCookies(new URL(url));
    } catch (MalformedURLException e) {
        // the URL was already parsed above, so this should not happen
    }
    // Convert HtmlUnit cookies to Apache HttpClient cookies
    return Cookie.toHttpClient(cookies);
}
@Test
void name888() {
    String url = "https://www.yuque.com/wanqi-1f4b0/vlgn2k/cnf0v6tw2v5viz5t";
    List<org.apache.http.cookie.Cookie> cookies = getHtmlByWebClient(url, "GET");
    // Build a Cookie request header from the cookies the WebClient collected
    StringBuilder sb = new StringBuilder();
    for (org.apache.http.cookie.Cookie cookie : cookies) {
        HttpCookie httpCookie = new HttpCookie(cookie.getName(), cookie.getValue());
        sb.append(httpCookie);
        sb.append("; ");
    }
    Map<String, String> headers = new HashMap<>();
    headers.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0");
    headers.put("Host", "pic.netbian.com");
    headers.put("Sec-Fetch-Dest", "document");
    headers.put("Sec-Fetch-Mode", "navigate");
    headers.put("Sec-Fetch-Site", "cross-site");
    headers.put("Upgrade-Insecure-Requests", "1");
    headers.put("Pragma", "no-cache");
    headers.put("Cookie", sb.toString());
    // Send the request with HttpUtil, carrying the cookies in the headers
    String s1 = HttpUtil.createGet(url)
            .addHeaders(headers)
            .execute().body();
    System.out.println(s1);
}
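Since getHtmlByWebClient returns Apache HttpClient Cookie objects, an alternative to hand-building the Cookie header is to hand them to an HttpClient cookie store and let the client attach them itself. Below is a minimal sketch, assuming Apache HttpClient 4.x is on the classpath (it already ships with HtmlUnit); the test name is arbitrary and not from the original article.

@Test
void cookieStoreDemo() throws Exception {
    String url = "https://www.yuque.com/wanqi-1f4b0/vlgn2k/cnf0v6tw2v5viz5t";
    // Feed the cookies collected by the WebClient into an HttpClient cookie store
    org.apache.http.client.CookieStore cookieStore = new org.apache.http.impl.client.BasicCookieStore();
    for (org.apache.http.cookie.Cookie cookie : getHtmlByWebClient(url, "GET")) {
        cookieStore.addCookie(cookie); // domain, path and expiry are preserved from the WebClient session
    }
    try (org.apache.http.impl.client.CloseableHttpClient client = org.apache.http.impl.client.HttpClients.custom()
            .setDefaultCookieStore(cookieStore)
            .build();
         org.apache.http.client.methods.CloseableHttpResponse resp =
                 client.execute(new org.apache.http.client.methods.HttpGet(url))) {
        // Cookies matching the request's domain and path are attached automatically
        System.out.println(org.apache.http.util.EntityUtils.toString(resp.getEntity()));
    }
}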
3. A wrapper around the htmlparser HTML parsing library
package com.wanqi.util;
import org.htmlparser.tags.CompositeTag;
/**
 * @Auther: wq
 * @Date: 2020/3/9 17:37
 * @Description: custom htmlparser tag that treats the tag names below as composite (container) tags
 * @Version: 1.0
 */
public class CustomizeTag extends CompositeTag {
    private static final String[] mIds = {
            "tbody", "b", "strong", "dd", "section", "big"
    };
    private static final String[] mEndTagEnders = {
            "tbody", "b", "strong", "dd", "section", "big"
    };
    public CustomizeTag() {
    }
    // expose the registered names; ParseUtil below relies on getIds()
    @Override public String[] getIds() { return mIds; }
    @Override public String[] getEndTagEnders() { return mEndTagEnders; }
}
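Before looking at the ParseUtil wrapper below, here is a quick, hypothetical check of what registering CustomizeTag does on its own: once it is registered with a PrototypicalNodeFactory, tags like <section> are parsed as composite tags with children instead of bare tag nodes. The HTML snippet and class name are illustrative only.

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class CustomizeTagDemo {
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser();
        parser.setInputHTML("<section id=\"s1\"><b>hello</b></section>");
        // Register the custom tag: <section>, <tbody>, <b>, ... are now parsed as composite tags
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new CustomizeTag());
        parser.setNodeFactory(factory);
        NodeList sections = parser.parse(new TagNameFilter("section"));
        // With the registration in place the <section> node has children, so this prints "hello"
        System.out.println(sections.elementAt(0).toPlainTextString());
    }
}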
package com.wanqi.util;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
public class ParseUtil {
    /**
     * Extract the list of tags that have a given attribute value.
     *
     * @param <T>            the tag type
     * @param html           the HTML text to parse
     * @param tagType        the tag class to match
     * @param attributeName  the name of the attribute to check (null matches any tag of the type)
     * @param attributeValue the value the attribute must equal
     * @return the matching tags, or null if parsing fails
     */
    @SuppressWarnings({ "serial", "unchecked" })
    public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        try {
            // Create an HTML parser
            Parser parser = new Parser();
            parser.setInputHTML(html);
            NodeList tagList = parser.parse((NodeFilter) node -> {
                if (node.getClass() == tagType) {
                    T tn = (T) node;
                    if (attributeName == null) {
                        return true;
                    }
                    String attrValue = tn.getAttribute(attributeName);
                    if (attrValue != null && attrValue.equals(attributeValue)) {
                        return true;
                    }
                }
                return false;
            });
            return getTs(tagList);
        } catch (ParserException e) {
            return null;
        }
    }
    @NotNull
    private static <T extends TagNode> List<T> getTs(NodeList tagList) {
        List<T> tags = new ArrayList<>();
        for (int i = 0; i < tagList.size(); i++) {
            T t = (T) tagList.elementAt(i);
            tags.add(t);
        }
        return tags;
    }
    @SuppressWarnings({ "serial", "unchecked" })
    public static <T extends TagNode> List<T> parseNodes(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        try {
            // Create an HTML parser
            Parser parser = new Parser();
            parser.setInputHTML(html);
            NodeList tagList = parser.parse((NodeFilter) node -> {
                if (node instanceof TagNode) {
                    T tn = (T) node;
                    if (attributeName == null) {
                        return true;
                    }
                    String attrValue = tn.getAttribute(attributeName);
                    if (attrValue != null && attrValue.equals(attributeValue)) {
                        return true;
                    }
                }
                return false;
            });
            return getTs(tagList);
        } catch (ParserException e) {
            return null;
        }
    }
    public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType) {
        return parseTags(html, tagType, null, null);
    }

    public static <T extends TagNode> T parseTag(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        List<T> tags = parseTags(html, tagType, attributeName, attributeValue);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }

    public static <T extends TagNode> T parseNode(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        List<T> tags = parseNodes(html, tagType, attributeName, attributeValue);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }

    public static <T extends TagNode> T parseTag(String html, final Class<T> tagType) {
        return parseTag(html, tagType, null, null);
    }
    @SuppressWarnings({ "serial", "unchecked" })
    public static <T extends TagNode> List<T> parseFuzzyTags(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        try {
            // Create an HTML parser
            Parser parser = new Parser();
            parser.setInputHTML(html);
            NodeList tagList = parser.parse((NodeFilter) node -> {
                if (node instanceof TagNode) {
                    T tn = (T) node;
                    if (attributeName == null) {
                        return true;
                    }
                    String attrValue = tn.getAttribute(attributeName);
                    // Fuzzy match: the attribute only needs to contain the given value
                    if (attrValue != null && attrValue.contains(attributeValue)) {
                        return true;
                    }
                }
                return false;
            });
            return getTs(tagList);
        } catch (ParserException e) {
            // e.printStackTrace();
            return null;
        }
    }
    public static <T extends TagNode> T parseFuzzyTag(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        List<T> tags = parseFuzzyTags(html, tagType, attributeName, attributeValue);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }
    /**
     * Parse custom HTML tags (the names registered by CustomizeTag).
     *
     * @param html    the HTML text to parse
     * @param tagType the tag name, e.g. "section"
     * @return the first matching tag, or null
     */
    public static CustomizeTag parseTagsByCustomize(String html, final String tagType) {
        List<CustomizeTag> tags = parseTagsByCustomizes(html, tagType, null, null);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }

    public static CustomizeTag parseTagsByCustomize(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        List<CustomizeTag> tags = parseTagsByCustomizes(html, tagType, attributeName, attributeValue);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }
    public static List<CustomizeTag> parseTagsByCustomizes(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        try {
            // Create an HTML parser and register the custom tag with the node factory
            CustomizeTag customizeTag = new CustomizeTag();
            Parser parser = new Parser();
            parser.setInputHTML(html);
            PrototypicalNodeFactory p = new PrototypicalNodeFactory();
            p.registerTag(customizeTag);
            parser.setNodeFactory(p);
            // Only accept tag names that CustomizeTag actually registers
            String tagName = null;
            String[] ids = customizeTag.getIds();
            for (String id : ids) {
                if (tagType.equals(id)) {
                    tagName = id;
                }
            }
            if (tagName == null) {
                return null;
            }
            NodeList tagList = parser.parse(new TagNameFilter(tagName));
            List<CustomizeTag> tags = new ArrayList<>();
            for (int i = 0; i < tagList.size(); i++) {
                CustomizeTag t = (CustomizeTag) tagList.elementAt(i);
                if (attributeName == null) {
                    tags.add(t);
                } else {
                    String attrValue = t.getAttribute(attributeName);
                    if (attrValue != null && attrValue.equals(attributeValue)) {
                        tags.add(t);
                    }
                }
            }
            return tags;
        } catch (ParserException e) {
            return null;
        }
    }
    /**
     * Parse custom HTML tags with fuzzy (contains) attribute matching.
     *
     * @param html    the HTML text to parse
     * @param tagType the tag name, e.g. "section"
     * @return the first matching tag, or null
     */
    public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType) {
        List<CustomizeTag> tags = parseFuzzyTagsByCustomizes(html, tagType, null, null);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }

    public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        List<CustomizeTag> tags = parseFuzzyTagsByCustomizes(html, tagType, attributeName, attributeValue);
        if (tags != null && tags.size() > 0) {
            return tags.get(0);
        }
        return null;
    }
    public static List<CustomizeTag> parseFuzzyTagsByCustomizes(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        try {
            // Create an HTML parser and register the custom tag with the node factory
            CustomizeTag customizeTag = new CustomizeTag();
            Parser parser = new Parser();
            parser.setInputHTML(html);
            PrototypicalNodeFactory p = new PrototypicalNodeFactory();
            p.registerTag(customizeTag);
            parser.setNodeFactory(p);
            // Only accept tag names that CustomizeTag actually registers
            String tagName = null;
            String[] ids = customizeTag.getIds();
            for (String id : ids) {
                if (tagType.equals(id)) {
                    tagName = id;
                }
            }
            if (tagName == null) {
                return null;
            }
            NodeList tagList = parser.parse(new TagNameFilter(tagName));
            List<CustomizeTag> tags = new ArrayList<>();
            for (int i = 0; i < tagList.size(); i++) {
                CustomizeTag t = (CustomizeTag) tagList.elementAt(i);
                if (attributeName == null) {
                    tags.add(t);
                } else {
                    String attrValue = t.getAttribute(attributeName);
                    // Fuzzy match: the attribute only needs to contain the given value
                    if (attrValue != null && attrValue.contains(attributeValue)) {
                        tags.add(t);
                    }
                }
            }
            return tags;
        } catch (ParserException e) {
            return null;
        }
    }
}
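A short usage sketch of ParseUtil follows; the HTML snippet, class name, and tag choices are illustrative and not from the original article.

import org.htmlparser.tags.Div;
import org.htmlparser.tags.LinkTag;
import java.util.List;

public class ParseUtilDemo {
    public static void main(String[] args) {
        String html = "<div class=\"list\"><a href=\"/p/1\">first</a></div>"
                + "<section id=\"main\"><b>bold text</b></section>";

        // All <a> tags whose href attribute equals "/p/1"
        List<LinkTag> links = ParseUtil.parseTags(html, LinkTag.class, "href", "/p/1");
        if (links != null) {
            for (LinkTag link : links) {
                System.out.println(link.getAttribute("href") + " -> " + link.getLinkText());
            }
        }

        // First <div> whose class attribute contains "lis" (fuzzy match)
        Div div = ParseUtil.parseFuzzyTag(html, Div.class, "class", "lis");
        System.out.println(div == null ? "no div" : div.toPlainTextString());

        // <section> is not in the stock tag set, so it goes through the CustomizeTag path
        CustomizeTag section = ParseUtil.parseTagsByCustomize(html, "section", "id", "main");
        System.out.println(section == null ? "no section" : section.toPlainTextString());
    }
}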
That covers the common Java crawler tools above: fetching cookies with WebClient, sending requests with HttpUtil, and parsing HTML with htmlparser.