Python Crawler: Implementing a Vertical Crawler System
This article walks through how to implement a simple vertical crawler system in Python. The example code is explained in detail and should be a useful reference; I hope it helps you.
html_downloader
from urllib import request

def download(url):
    # fetch a page and return its raw bytes, or None on failure
    if url is None:
        return
    response = request.urlopen(url)
    if response.getcode() != 200:
        return
    return response.read()
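For a quick sanity check of the downloader on its own, the small sketch below fetches the root page that spider_main uses later and prints the size of the response. It is an illustration added here, not part of the original module, and it assumes the site is reachable from your machine.

if __name__ == '__main__':
    page = download('http://news.zzuli.edu.cn/')
    # download() returns raw bytes on success, None otherwise
    print(len(page) if page else 'download failed')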
html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    # dump every collected record into a simple HTML table
    fout = open('output.html', 'w')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visit_count'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
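A minimal way to exercise the outputer by itself is to feed it one hand-made record carrying the four keys the table expects; the values below are made up purely for illustration.

collect_data({
    'url': 'http://example.com/page.htm',
    'title': 'Sample title',
    'datetime': '2021-01-01',
    'visit_count': '100',
})
output_html()   # writes output.html containing a single table row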
html_parser
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    new_urls = set()
    # collect links whose href matches the article-page pattern
    links = soup.find_all('a', href=re.compile(r'/\d+/\d+/\w+/page\.htm'))
    for link in links:
        new_url = link['href']
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visit_count_node = soup.find('span', class_='WP_VisitCount')
    res_data['visit_count'] = visit_count_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    if page_url is None or html_cont is None:
        return
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
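To see what the parser returns for a single page, it can be wired directly to the downloader, as in the sketch below. This is an added illustration that assumes the two modules sit on the import path together and that the site is reachable.

import html_downloader, html_parser

page_url = 'http://news.zzuli.edu.cn/'
html_cont = html_downloader.download(page_url)
if html_cont:
    new_urls, new_data = html_parser.parse(page_url, html_cont)
    print(len(new_urls), 'article links found')   # links matching the page.htm pattern
    print(new_data)                               # {} on a listing page without an article body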
spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        if html_cont is None:
            # skip pages that failed to download
            continue
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
test_64
from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())
print('Get the lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())
print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())
print('Get the text of the p paragraph')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
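Since the sample document is fixed, the script's output is predictable; running it should print roughly the following.

Get all links
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
Get the lacie link
a http://example.com/lacie Lacie
Regex match
a http://example.com/tillie Tillie
Get the text of the p paragraph
p The Dormouse's story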
urls_manager
new_urls = set()
old_urls = set()

def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
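The URL manager needs no network access to try out. The short sketch below (the URLs are placeholders added for illustration) shows that duplicates are ignored and every stored URL comes back exactly once.

add_new_url('http://example.com/a/page.htm')
add_new_url('http://example.com/a/page.htm')    # duplicate, silently ignored
add_new_urls({'http://example.com/b/page.htm'})
while has_new_url():
    print(get_new_url())                        # each URL printed once, then moved to old_urls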
Summary
That's all for this article. I hope it helps you, and I hope you will keep following more content from 盛行IT软件开发工作室!