Java爬虫常用工具使用（java爬虫工具类）

　　本篇文章为你整理了Java爬虫常用工具使用（java爬虫工具类）的详细内容，包含有java 爬虫工具 java爬虫工具类 java爬虫需要的基本知识 java爬虫入门教程 Java爬虫常用工具使用，希望能帮助你了解 Java爬虫常用工具使用。

　　2、封装获取Cookie的方法

public List org.apache.http.cookie.Cookie getHtmlByWebClient(String url, String method) {

　　 @SuppressWarnings("resource")

　　 WebClient webClient = new WebClient();

　　 webClient.getOptions().setTimeout(100000);

　　 webClient.getOptions().setCssEnabled(false);// 取消css支持

　　 webClient.getOptions().setJavaScriptEnabled(true); // 取消javascript支持

　　 webClient.getOptions().setThrowExceptionOnScriptError(false);

　　 webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要，设置支持AJAX

　　 WebResponse response = null;

　　 try {

　　 //先去请求网站，让他把网站的CookieSet进去

　　 try {

　　 webClient.getPage(url);

　　 } catch (IOException e) {

　　 } catch (FailingHttpStatusCodeException e) {

　　 //等待30秒让WebClient执行js脚本

　　 TimeUnit.SECONDS.sleep(30L);

　　 URL ur = new URL(url);

　　 WebRequest webequest;

　　 if ("GET".equals(method)) {

　　 webequest = new WebRequest(ur, HttpMethod.GET);

　　 } else {

　　 webequest = new WebRequest(ur, HttpMethod.POST);

　　 response = webClient.loadWebResponse(webequest);

　　 } catch (Exception e) {

　　 throw new RuntimeException(e);

　　 } finally {

　　 webClient.close();

　　 Set Cookie cookies = null;

　　 try {

　　 cookies = webClient.getCookies(new URL(url));

　　 } catch (MalformedURLException e) {

　　 return Cookie.toHttpClient(cookies);

 @Test

　　 void name888() {

　　 String url = "https://www.yuque.com/wanqi-1f4b0/vlgn2k/cnf0v6tw2v5viz5t"

　　 List org.apache.http.cookie.Cookie get = getHtmlByWebClient(url);

　　 StringBuilder sb = new StringBuilder();

　　 for (org.apache.http.cookie.Cookie cookie : get) {

　　 HttpCookie httpCookie = new HttpCookie( cookie.getName(), cookie.getValue());

　　 sb.append(httpCookie);

　　 sb.append("; ");

　　 Map String, String headers = new HashMap ();

　　 headers.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0");

　　 headers.put("Host", "pic.netbian.com");

　　 headers.put("Sec-Fetch-Dest", "document");

　　 headers.put("Sec-Fetch-Mode", "navigate");

　　 headers.put("Sec-Fetch-Site", "cross-site");

　　 headers.put("Upgrade-Insecure-Requests", "1");

　　 headers.put("Pragma", "no-cache");

　　 headers.put("Cookie", sb.toString());

　　 String s1 = HttpUtil.createGet(url)

　　 .addHeaders(headers)

　　 .execute().body();

　　 System.out.println(s1);

　　3、html解析工具htmlparser封装

package com.wanqi.util;

　　import org.htmlparser.tags.CompositeTag;

/**
 * @Author: wq
 * @Date: 2020/3/9 17:37
 * @Description: htmlparser定制标签
 * @Version: 1.0
 */

　　public class CustomizeTag extends CompositeTag {

　　 private static final String mIds[] = {

　　 "tbody", "b", "strong", "dd", "section", "big"

　　 private static final String mEndTagEnders[] = {

　　 "tbody", "b", "strong", "dd", "section", "big"

　　 public CustomizeTag() {

　　
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

　　public class ParseUtil {

　　 * 提取具有某个属性值的标签列表

　　 * @param T

　　 * @param html

　　 * 被提取的HTML文本

　　 * @param tagType

　　 * 标签类型

　　 * @param attributeName

　　 * 某个属性的名称

　　 * @param attributeValue

　　 * 属性应取的值

　　 * @return

　　 @SuppressWarnings({ "serial", "unchecked" })

　　 public static T extends TagNode List T parseTags(String html, final Class T tagType, final String attributeName,

　　 final String attributeValue) {

　　 try {

　　 // 创建一个HTML解释器

　　 Parser parser = new Parser();

　　 parser.setInputHTML(html);

　　 NodeList tagList = parser.parse((NodeFilter) node - {

　　 if (node.getClass() == tagType) {

　　 T tn = (T) node;

　　 if (attributeName == null) {

　　 return true;

　　 String attrValue = tn.getAttribute(attributeName);

　　 if (attrValue != null attrValue.equals(attributeValue)) {

　　 return true;

　　 return false;

　　 return getTs(tagList);

　　 } catch (ParserException e) {

　　 return null;

　　 @NotNull

　　 private static T extends TagNode List T getTs(NodeList tagList) {

　　 List T tags = new ArrayList T

　　 for (int i = 0; i tagList.size(); i++) {

　　 T t = (T) tagList.elementAt(i);

　　 tags.add(t);

　　 return tags;

　　 @SuppressWarnings({ "serial", "unchecked" })

　　 public static T extends TagNode List T parseNodes(String html, final Class T tagType,

　　 final String attributeName, final String attributeValue) {

　　 try {

　　 // 创建一个HTML解释器

　　 Parser parser = new Parser();

　　 parser.setInputHTML(html);

　　 NodeList tagList = parser.parse((NodeFilter) node - {

　　 if (node instanceof TagNode) {

　　 T tn = (T) node;

　　 if (attributeName == null) {

　　 return true;

　　 String attrValue = tn.getAttribute(attributeName);

　　 if (attrValue != null attrValue.equals(attributeValue)) {

　　 return true;

　　 return false;

　　 return getTs(tagList);

　　 } catch (ParserException e) {

　　 return null;

　　
public static T extends TagNode List T parseTags(String html, final Class T tagType) {

　　 return parseTags(html, tagType, null, null);

　　 public static T extends TagNode T parseTag(String html, final Class T tagType, final String attributeName,

　　 final String attributeValue) {

　　 List T tags = parseTags(html, tagType, attributeName, attributeValue);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　 public static T extends TagNode T parseNode(String html, final Class T tagType, final String attributeName,

　　 final String attributeValue) {

　　 List T tags = parseNodes(html, tagType, attributeName, attributeValue);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　 public static T extends TagNode T parseTag(String html, final Class T tagType) {

　　 return parseTag(html, tagType, null, null);

　　 @SuppressWarnings({ "serial", "unchecked" })

　　 public static T extends TagNode List T parseFuzzyTags(String html, final Class T tagType,

　　 final String attributeName, final String attributeValue) {

　　 try {

　　 // 创建一个HTML解释器

　　 Parser parser = new Parser();

　　 parser.setInputHTML(html);

　　 NodeList tagList = parser.parse((NodeFilter) node - {

　　 if (node instanceof TagNode) {

　　 T tn = (T) node;

　　 if (attributeName == null) {

　　 return true;

　　 String attrValue = tn.getAttribute(attributeName);

　　 if (attrValue != null attrValue.contains(attributeValue)) {

　　 return true;

　　 return false;

　　 return getTs(tagList);

　　 } catch (ParserException e) {

　　 // e.printStackTrace();

　　 return null;

　　 public static T extends TagNode T parseFuzzyTag(String html, final Class T tagType, final String attributeName,

　　 final String attributeValue) {

　　 List T tags = parseFuzzyTags(html, tagType, attributeName, attributeValue);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　 * 自定义html标签

　　 * @param html

　　 * @param tagType 标签名

　　 * @return

　　 public static CustomizeTag parseTagsByCustomize(String html, final String tagType) {

　　 List CustomizeTag tags = parseTagsByCustomizes(html, tagType, null, null);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　
public static CustomizeTag parseTagsByCustomize(String html, final String tagType, final String attributeName,

　　 final String attributeValue) {

　　 List CustomizeTag tags = parseTagsByCustomizes(html, tagType, attributeName, attributeValue);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　
public static List CustomizeTag parseTagsByCustomizes(String html, final String tagType, final String attributeName,

　　 final String attributeValue) {

　　 try {

　　 // 创建一个HTML解释器

　　 CustomizeTag customizeTag = new CustomizeTag();

　　 Parser parser = new Parser();

　　 parser.setInputHTML(html);

　　 PrototypicalNodeFactory p=new PrototypicalNodeFactory();

　　 p.registerTag(customizeTag);

　　 parser.setNodeFactory(p);

　　 String tagName = null;

　　 String[] ids = customizeTag.getIds();

　　 for (String id : ids) {

　　 if(tagType.equals(id)){

　　 tagName = id;

　　 if(tagName == null){

　　 return null;

　　 NodeList tagList = parser.parse(new TagNameFilter(tagName));

　　 List CustomizeTag tags = new ArrayList CustomizeTag

　　 for (int i = 0; i tagList.size(); i++) {

　　 CustomizeTag t = (CustomizeTag) tagList.elementAt(i);

　　 if (attributeName == null) {

　　 tags.add(t);

　　 } else {

　　 String attrValue = t.getAttribute(attributeName);

　　 if (attrValue != null attrValue.equals(attributeValue)) {

　　 tags.add(t);

　　 return tags;

　　 } catch (ParserException e) {

　　 return null;

　　 * 自定义html标签

　　 * @param html

　　 * @param tagType 标签名

　　 * @return

　　 public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType) {

　　 List CustomizeTag tags = parseFuzzyTagsByCustomizes(html, tagType, null, null);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　
public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType, final String attributeName,

　　 final String attributeValue) {

　　 List CustomizeTag tags = parseFuzzyTagsByCustomizes(html, tagType, attributeName, attributeValue);

　　 if (tags != null tags.size() 0) {

　　 return tags.get(0);

　　 return null;

　　
public static List CustomizeTag parseFuzzyTagsByCustomizes(String html, final String tagType, final String attributeName,

　　 final String attributeValue) {

　　 try {

　　 // 创建一个HTML解释器

　　 CustomizeTag customizeTag = new CustomizeTag();

　　 Parser parser = new Parser();

　　 parser.setInputHTML(html);

　　 PrototypicalNodeFactory p=new PrototypicalNodeFactory();

　　 p.registerTag(customizeTag);

　　 parser.setNodeFactory(p);

　　 String tagName = null;

　　 String[] ids = customizeTag.getIds();

　　 for (String id : ids) {

　　 if(tagType.equals(id)){

　　 tagName = id;

　　 if(tagName == null){

　　 return null;

　　 NodeList tagList = parser.parse(new TagNameFilter(tagName));

　　 List CustomizeTag tags = new ArrayList CustomizeTag

　　 for (int i = 0; i tagList.size(); i++) {

　　 CustomizeTag t = (CustomizeTag) tagList.elementAt(i);

　　 if (attributeName == null) {

　　 tags.add(t);

　　 } else {

　　 String attrValue = t.getAttribute(attributeName);

　　 if (attrValue != null attrValue.contains(attributeValue)) {

　　 tags.add(t);

　　 return tags;

　　 } catch (ParserException e) {

　　 return null;

　　以上就是Java爬虫常用工具使用（java爬虫工具类）的详细内容，想要了解更多 Java爬虫常用工具使用的内容，请持续关注盛行IT软件开发工作室。

郑重声明：本文由网友发布，不代表盛行IT的观点，版权归原作者所有，仅为传播更多信息之目的，如有侵权请联系，我们将第一时间修改或删除，多谢。

相关文章阅读