Python crawler (Baidu Wenku), Python crawler (CSDN)


  This article walks through how to implement a vertical crawler system in Python. The example code is explained in detail and should serve as a useful reference for anyone interested; I hope it helps.
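
  For orientation, the crawler is split into several small modules, each shown below under its own heading. The .py file names are an assumption taken from the import line in spider_main; the article itself only gives the module names:

  spider_main.py      entry point: the crawl loop and the 10-page stop condition
  urls_manager.py     de-duplicated queue of URLs to visit
  html_downloader.py  fetches a page with urllib
  html_parser.py      extracts follow-up links and article fields with BeautifulSoup
  html_outputer.py    collects parsed records and writes them to output.html
  test_64.py          standalone BeautifulSoup warm-up demo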

  html_downloader

  from urllib import request

  def download(url):
      # fetch a page and return its raw bytes, or None on failure
      if url is None:
          return
      response = request.urlopen(url)
      if response.getcode() != 200:
          return
      return response.read()
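
  A quick sanity check of the downloader on its own, assuming the code above is saved as html_downloader.py and the target host is reachable:

  import html_downloader

  html = html_downloader.download('http://news.zzuli.edu.cn/')
  if html:
      print(len(html), 'bytes downloaded')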

  html_outputer

  data_list = []

  def collect_data(data):
      data_list.append(data)

  def output_html():
      # dump the collected records into a simple HTML table
      fout = open('output.html', 'w', encoding='utf-8')  # utf-8 so Chinese titles are written correctly
      fout.write('<html>')
      fout.write('<body>')
      fout.write('<table>')
      for dataitem in data_list:
          fout.write('<tr>')
          fout.write('<td>%s</td>' % dataitem['url'])
          fout.write('<td>%s</td>' % dataitem['title'])
          fout.write('<td>%s</td>' % dataitem['datetime'])
          fout.write('<td>%s</td>' % dataitem['visit_count'])
          fout.write('</tr>')
      fout.write('</table>')
      fout.write('</body>')
      fout.write('</html>')
      fout.close()
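
  A minimal check of the outputer by itself; the record below is made up, but its keys match the ones written out above:

  import html_outputer

  html_outputer.collect_data({
      'url': 'http://news.zzuli.edu.cn/2022/0101/c1a2/page.htm',  # hypothetical record
      'title': 'Example title',
      'datetime': '2022-01-01',
      'visit_count': '42',
  })
  html_outputer.output_html()  # writes output.html in the current directory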

  html_parser

  import re
  from bs4 import BeautifulSoup
  from urllib.parse import urljoin

  def get_new_urls(page_url, soup):
      # collect in-site links whose href matches the article URL pattern
      new_urls = set()
      links = soup.find_all('a', href=re.compile(r'/\d+/\d+/\w+/page\.htm'))
      for link in links:
          new_url = link['href']
          new_full_url = urljoin(page_url, new_url)
          new_urls.add(new_full_url)
      return new_urls

  def get_new_data(page_url, soup):
      # pull the title, publish time and visit count out of an article page
      res_data = {}
      title_node = soup.find('h1', class_='arti-title')
      if title_node is None:
          return res_data
      res_data['title'] = title_node.get_text()
      datetime_node = soup.find('span', class_='arti-update')
      res_data['datetime'] = datetime_node.get_text()
      visit_count_node = soup.find('span', class_='WP_VisitCount')
      res_data['visit_count'] = visit_count_node.get_text()
      res_data['url'] = page_url
      return res_data

  def parse(page_url, html_cont):
      if page_url is None or html_cont is None:
          return
      soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
      new_urls = get_new_urls(page_url, soup)
      new_data = get_new_data(page_url, soup)
      return new_urls, new_data
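
  A small offline check of the parser; the HTML snippet is invented, but it uses the same class names and link pattern the parser looks for (it is encoded to bytes because download() returns bytes):

  import html_parser

  sample = '''
  <html><body>
  <h1 class="arti-title">Example headline</h1>
  <span class="arti-update">2022-01-01</span>
  <span class="WP_VisitCount">42</span>
  <a href="/2022/0101/c1a2/page.htm">next article</a>
  </body></html>
  '''
  new_urls, new_data = html_parser.parse('http://news.zzuli.edu.cn/index.htm',
                                         sample.encode('utf-8'))
  print(new_urls)   # the absolute form of the /2022/0101/c1a2/page.htm link
  print(new_data)   # dict with title, datetime, visit_count, url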

  

  spider_main

  

  import urls_manager, html_downloader, \
      html_parser, html_outputer

  def craw(root_url):
      count = 1
      urls_manager.add_new_url(root_url)
      # start the crawl loop
      while urls_manager.has_new_url():
          new_url = urls_manager.get_new_url()
          print('craw %d : %s' % (count, new_url))
          html_cont = html_downloader.download(new_url)
          new_urls, new_data = html_parser.parse(new_url, html_cont)
          urls_manager.add_new_urls(new_urls)
          if new_data:
              html_outputer.collect_data(new_data)
          if count == 10:
              break
          count = count + 1
      html_outputer.output_html()

  if __name__ == '__main__':
      root_url = 'http://news.zzuli.edu.cn/'
      craw(root_url)


  

  test_64

  

from bs4 import BeautifulSoup

  import re

  html_doc = """

  <html><head><title>The Dormouse's story</title></head>

  <body>

  <p class="title"><b>The Dormouses story</b></p>

  <p class="story">Once upon a time there were three little sisters; and their names were

  <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

  and they lived at the bottom of a well.</p>

  <p class="story">...</p>

  """

  soup = BeautifulSoup(html_doc, 'html.parser')
  print('Get all links')
  links = soup.find_all('a')
  for link in links:
      print(link.name, link['href'], link.get_text())
  print('Get the lacie link')
  link_node = soup.find('a', href='http://example.com/lacie')
  print(link_node.name, link_node['href'], link_node.get_text())
  print('Regex match')
  link_node = soup.find('a', href=re.compile(r'ill'))
  print(link_node.name, link_node['href'], link_node.get_text())
  print('Get the text of the p paragraph')
  p_node = soup.find('p', class_='title')
  print(p_node.name, p_node.get_text())

  

  urls_manager

  

  new_urls = set()
  old_urls = set()

  def add_new_url(url):
      if url is None:
          return
      if url not in new_urls and url not in old_urls:
          new_urls.add(url)

  def add_new_urls(urls):
      if urls is None or len(urls) == 0:
          return
      for url in urls:
          add_new_url(url)

  def get_new_url():
      new_url = new_urls.pop()
      old_urls.add(new_url)
      return new_url

  def has_new_url():
      return len(new_urls) != 0
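
  A quick check of the de-duplication behaviour, assuming the module is saved as urls_manager.py:

  import urls_manager

  urls_manager.add_new_url('http://news.zzuli.edu.cn/')
  urls_manager.add_new_urls({'http://news.zzuli.edu.cn/',
                             'http://news.zzuli.edu.cn/index.htm'})
  print(urls_manager.has_new_url())   # True; the duplicate root URL was queued only once
  url = urls_manager.get_new_url()    # moves the URL into old_urls
  urls_manager.add_new_url(url)       # already visited, so it is not re-queued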

  

  

Summary

  That's all for this article. I hope it helps, and please keep following 盛行IT软件开发工作室 for more content!
