Python Crawler: Implementing a Vertical Crawler System
This article walks through how to implement a simple vertical crawler system in Python. The example code is explained in detail and should be a useful reference; I hope it helps you.
html_downloader
from urllib import request

def download(url):
    # fetch a page and return its raw bytes, or None on failure
    if url is None:
        return
    response = request.urlopen(url)
    if response.getcode() != 200:
        return
    return response.read()
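For a quick sanity check of the downloader on its own, the small sketch below fetches the root page that spider_main uses later and prints the size of the response. It is an illustration added here, not part of the original module, and it assumes the site is reachable from your machine.

if __name__ == '__main__':
    page = download('http://news.zzuli.edu.cn/')
    # download() returns raw bytes on success, None otherwise
    print(len(page) if page else 'download failed')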
html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    # dump every collected record into a simple HTML table
    fout = open('output.html', 'w')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visit_count'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
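A minimal way to exercise the outputer by itself is to feed it one hand-made record carrying the four keys the table expects; the values below are made up purely for illustration.

collect_data({
    'url': 'http://example.com/page.htm',
    'title': 'Sample title',
    'datetime': '2021-01-01',
    'visit_count': '100',
})
output_html()   # writes output.html containing a single table row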
html_parser
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    new_urls = set()
    # collect links whose href matches the article-page pattern
    links = soup.find_all('a', href=re.compile(r'/\d+/\d+/\w+/page\.htm'))
    for link in links:
        new_url = link['href']
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visit_count_node = soup.find('span', class_='WP_VisitCount')
    res_data['visit_count'] = visit_count_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    if page_url is None or html_cont is None:
        return
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
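To see what the parser returns for a single page, it can be wired directly to the downloader, as in the sketch below. This is an added illustration that assumes the two modules sit on the import path together and that the site is reachable.

import html_downloader, html_parser

page_url = 'http://news.zzuli.edu.cn/'
html_cont = html_downloader.download(page_url)
if html_cont:
    new_urls, new_data = html_parser.parse(page_url, html_cont)
    print(len(new_urls), 'article links found')   # links matching the page.htm pattern
    print(new_data)                               # {} on a listing page without an article body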
spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        if html_cont is None:
            # skip pages that failed to download
            continue
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
test_64
from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())
print('Get the lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())
print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())
print('Get the text of the p paragraph')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
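Since the sample document is fixed, the script's output is predictable; running it should print roughly the following.

Get all links
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
Get the lacie link
a http://example.com/lacie Lacie
Regex match
a http://example.com/tillie Tillie
Get the text of the p paragraph
p The Dormouse's story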
urls_manager
new_urls = set()
old_urls = set()

def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
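The URL manager needs no network access to try out. The short sketch below (the URLs are placeholders added for illustration) shows that duplicates are ignored and every stored URL comes back exactly once.

add_new_url('http://example.com/a/page.htm')
add_new_url('http://example.com/a/page.htm')    # duplicate, silently ignored
add_new_urls({'http://example.com/b/page.htm'})
while has_new_url():
    print(get_new_url())                        # each URL printed once, then moved to old_urls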
Summary
That's all for this article. I hope it helps you, and I hope you will keep following more content from 盛行IT软件开发工作室!