Java爬虫常用工具使用(java爬虫工具类)

  本文整理了 Java 爬虫常用工具类的用法,包括使用 HtmlUnit 的 WebClient 获取网站 Cookie,以及基于 htmlparser 封装的 HTML 解析工具,希望能帮助你了解 Java 爬虫常用工具的使用。

  1、封装获取Cookie的方法

  

public List org.apache.http.cookie.Cookie getHtmlByWebClient(String url, String method) {

 

   @SuppressWarnings("resource")

   WebClient webClient = new WebClient();

   webClient.getOptions().setTimeout(100000);

   webClient.getOptions().setCssEnabled(false);// 取消css支持

   webClient.getOptions().setJavaScriptEnabled(true); // 取消javascript支持

   webClient.getOptions().setThrowExceptionOnScriptError(false);

   webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAX

   WebResponse response = null;

   try {

   //先去请求网站,让他把网站的CookieSet进去

   try {

   webClient.getPage(url);

   } catch (IOException e) {

   } catch (FailingHttpStatusCodeException e) {

   //等待30秒让WebClient执行js脚本

   TimeUnit.SECONDS.sleep(30L);

   URL ur = new URL(url);

   WebRequest webequest;

   if ("GET".equals(method)) {

   webequest = new WebRequest(ur, HttpMethod.GET);

   } else {

   webequest = new WebRequest(ur, HttpMethod.POST);

   response = webClient.loadWebResponse(webequest);

   } catch (Exception e) {

   throw new RuntimeException(e);

   } finally {

   webClient.close();

   Set Cookie cookies = null;

   try {

   cookies = webClient.getCookies(new URL(url));

   } catch (MalformedURLException e) {

   return Cookie.toHttpClient(cookies);

  

 

  

 @Test

 

   void name888() {

   String url = "https://www.yuque.com/wanqi-1f4b0/vlgn2k/cnf0v6tw2v5viz5t"

   List org.apache.http.cookie.Cookie get = getHtmlByWebClient(url);

   StringBuilder sb = new StringBuilder();

   for (org.apache.http.cookie.Cookie cookie : get) {

   HttpCookie httpCookie = new HttpCookie( cookie.getName(), cookie.getValue());

   sb.append(httpCookie);

   sb.append("; ");

   Map String, String headers = new HashMap ();

   headers.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0");

   headers.put("Host", "pic.netbian.com");

   headers.put("Sec-Fetch-Dest", "document");

   headers.put("Sec-Fetch-Mode", "navigate");

   headers.put("Sec-Fetch-Site", "cross-site");

   headers.put("Upgrade-Insecure-Requests", "1");

   headers.put("Pragma", "no-cache");

   headers.put("Cookie", sb.toString());

   String s1 = HttpUtil.createGet(url)

   .addHeaders(headers)

   .execute().body();

   System.out.println(s1);

  

 

  2、html解析工具htmlparser封装

  

package com.wanqi.util;

 

  import org.htmlparser.tags.CompositeTag;

   * @Auther: wq

   * @Date: 2020/3/9 17:37

   * @Description: htmlparser定制标签

   * @Version: 1.0

  public class CustomizeTag extends CompositeTag {

   private static final String mIds[] = {

   "tbody", "b", "strong", "dd", "section", "big"

   private static final String mEndTagEnders[] = {

   "tbody", "b", "strong", "dd", "section", "big"

   public CustomizeTag() {

  
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

  public class ParseUtil {

   * 提取具有某个属性值的标签列表

   * @param T

   * @param html

   * 被提取的HTML文本

   * @param tagType

   * 标签类型

   * @param attributeName

   * 某个属性的名称

   * @param attributeValue

   * 属性应取的值

   * @return

   @SuppressWarnings({ "serial", "unchecked" })

   public static T extends TagNode List T parseTags(String html, final Class T tagType, final String attributeName,

   final String attributeValue) {

   try {

   // 创建一个HTML解释器

   Parser parser = new Parser();

   parser.setInputHTML(html);

   NodeList tagList = parser.parse((NodeFilter) node - {

   if (node.getClass() == tagType) {

   T tn = (T) node;

   if (attributeName == null) {

   return true;

   String attrValue = tn.getAttribute(attributeName);

   if (attrValue != null attrValue.equals(attributeValue)) {

   return true;

   return false;

   return getTs(tagList);

   } catch (ParserException e) {

   return null;

   @NotNull

   private static T extends TagNode List T getTs(NodeList tagList) {

   List T tags = new ArrayList T

   for (int i = 0; i tagList.size(); i++) {

   T t = (T) tagList.elementAt(i);

   tags.add(t);

   return tags;

   @SuppressWarnings({ "serial", "unchecked" })

   public static T extends TagNode List T parseNodes(String html, final Class T tagType,

   final String attributeName, final String attributeValue) {

   try {

   // 创建一个HTML解释器

   Parser parser = new Parser();

   parser.setInputHTML(html);

   NodeList tagList = parser.parse((NodeFilter) node - {

   if (node instanceof TagNode) {

   T tn = (T) node;

   if (attributeName == null) {

   return true;

   String attrValue = tn.getAttribute(attributeName);

   if (attrValue != null attrValue.equals(attributeValue)) {

   return true;

   return false;

   return getTs(tagList);

   } catch (ParserException e) {

   return null;

  
public static T extends TagNode List T parseTags(String html, final Class T tagType) {

   return parseTags(html, tagType, null, null);

   public static T extends TagNode T parseTag(String html, final Class T tagType, final String attributeName,

   final String attributeValue) {

   List T tags = parseTags(html, tagType, attributeName, attributeValue);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

   public static T extends TagNode T parseNode(String html, final Class T tagType, final String attributeName,

   final String attributeValue) {

   List T tags = parseNodes(html, tagType, attributeName, attributeValue);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

   public static T extends TagNode T parseTag(String html, final Class T tagType) {

   return parseTag(html, tagType, null, null);

   @SuppressWarnings({ "serial", "unchecked" })

   public static T extends TagNode List T parseFuzzyTags(String html, final Class T tagType,

   final String attributeName, final String attributeValue) {

   try {

   // 创建一个HTML解释器

   Parser parser = new Parser();

   parser.setInputHTML(html);

   NodeList tagList = parser.parse((NodeFilter) node - {

   if (node instanceof TagNode) {

   T tn = (T) node;

   if (attributeName == null) {

   return true;

   String attrValue = tn.getAttribute(attributeName);

   if (attrValue != null attrValue.contains(attributeValue)) {

   return true;

   return false;

   return getTs(tagList);

   } catch (ParserException e) {

   // e.printStackTrace();

   return null;

   public static T extends TagNode T parseFuzzyTag(String html, final Class T tagType, final String attributeName,

   final String attributeValue) {

   List T tags = parseFuzzyTags(html, tagType, attributeName, attributeValue);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

   * 自定义html标签

   * @param html

   * @param tagType 标签名

   * @return

   public static CustomizeTag parseTagsByCustomize(String html, final String tagType) {

   List CustomizeTag tags = parseTagsByCustomizes(html, tagType, null, null);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

  
public static CustomizeTag parseTagsByCustomize(String html, final String tagType, final String attributeName,

   final String attributeValue) {

   List CustomizeTag tags = parseTagsByCustomizes(html, tagType, attributeName, attributeValue);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

  
public static List CustomizeTag parseTagsByCustomizes(String html, final String tagType, final String attributeName,

   final String attributeValue) {

   try {

   // 创建一个HTML解释器

   CustomizeTag customizeTag = new CustomizeTag();

   Parser parser = new Parser();

   parser.setInputHTML(html);

   PrototypicalNodeFactory p=new PrototypicalNodeFactory();

   p.registerTag(customizeTag);

   parser.setNodeFactory(p);

   String tagName = null;

   String[] ids = customizeTag.getIds();

   for (String id : ids) {

   if(tagType.equals(id)){

   tagName = id;

   if(tagName == null){

   return null;

   NodeList tagList = parser.parse(new TagNameFilter(tagName));

   List CustomizeTag tags = new ArrayList CustomizeTag

   for (int i = 0; i tagList.size(); i++) {

   CustomizeTag t = (CustomizeTag) tagList.elementAt(i);

   if (attributeName == null) {

   tags.add(t);

   } else {

   String attrValue = t.getAttribute(attributeName);

   if (attrValue != null attrValue.equals(attributeValue)) {

   tags.add(t);

   return tags;

   } catch (ParserException e) {

   return null;

   * 自定义html标签

   * @param html

   * @param tagType 标签名

   * @return

   public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType) {

   List CustomizeTag tags = parseFuzzyTagsByCustomizes(html, tagType, null, null);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

  
public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType, final String attributeName,

   final String attributeValue) {

   List CustomizeTag tags = parseFuzzyTagsByCustomizes(html, tagType, attributeName, attributeValue);

   if (tags != null tags.size() 0) {

   return tags.get(0);

   return null;

  
public static List CustomizeTag parseFuzzyTagsByCustomizes(String html, final String tagType, final String attributeName,

   final String attributeValue) {

   try {

   // 创建一个HTML解释器

   CustomizeTag customizeTag = new CustomizeTag();

   Parser parser = new Parser();

   parser.setInputHTML(html);

   PrototypicalNodeFactory p=new PrototypicalNodeFactory();

   p.registerTag(customizeTag);

   parser.setNodeFactory(p);

   String tagName = null;

   String[] ids = customizeTag.getIds();

   for (String id : ids) {

   if(tagType.equals(id)){

   tagName = id;

   if(tagName == null){

   return null;

   NodeList tagList = parser.parse(new TagNameFilter(tagName));

   List CustomizeTag tags = new ArrayList CustomizeTag

   for (int i = 0; i tagList.size(); i++) {

   CustomizeTag t = (CustomizeTag) tagList.elementAt(i);

   if (attributeName == null) {

   tags.add(t);

   } else {

   String attrValue = t.getAttribute(attributeName);

   if (attrValue != null attrValue.contains(attributeValue)) {

   tags.add(t);

   return tags;

   } catch (ParserException e) {

   return null;

  

 

  以上就是Java爬虫常用工具使用(java爬虫工具类)的详细内容,想要了解更多 Java爬虫常用工具使用的内容,请持续关注盛行IT软件开发工作室。

郑重声明:本文由网友发布,不代表盛行IT的观点,版权归原作者所有,仅为传播更多信息之目的,如有侵权请联系,我们将第一时间修改或删除,多谢。

留言与评论(共有 条评论)
   
验证码: