基于python的租房数据分析,python租房分析
这篇文章主要介绍了Python爬取城市租房信息实战分享:先写单线程爬虫,测试可以成功爬取之后再优化为多线程,最后存入数据库。需要的小伙伴可以参考一下。
目录
一、单线程爬虫 二、优化为多线程爬虫 三、使用asyncio进一步优化 四、存入Mysql数据库 (一)建表 (二)将数据存入数据库中 五、最终效果图(已打码) 思路:先单线程爬虫,测试可以成功爬取之后再优化为多线程,最后存入数据库。以爬取郑州市租房信息为例。
注意:本实战项目仅以学习为目的,为避免给网站造成太大压力,请将代码中的num修改成较小的数字,并将线程数改小。
一、单线程爬虫
# Use a requests session instead of bare requests calls
# Parsing library: bs4
# Concurrency library: concurrent
import requests
# from lxml import etree  # parse with xpath
from bs4 import BeautifulSoup
from urllib import parse
import re
import time

# Request headers sent with every call made through the shared session.
# NOTE(review): this listing was damaged during extraction; the cookie below
# is a best-effort reconstruction of a captured browser cookie and is not
# expected to be valid. Replace it with a fresh cookie from your own browser.
headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; city=zz; integratecover=1; __utma=147393320.42779562.161337106.1613337; __utmc=147393320; __utmz=14739395.1463951.1.utmcsr=zz.fang.comutmccn=(referral)utmcmd=referralutmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; ASP.NET_SessionId=aamzdnhzct4i5mx3ak4cyoyp; Rent_StatLog=23d82b94-13d6-4601-9019-ce0225c092f6; Captcha=61584F355169576F3355317957376E4F6F7552365351342B7574693561766E63785A70522F5657370586E33765855534651565256574F37694B7074576B34576; unique_cookie=U_ffzvt3kztwck05jm6twso2wjw18kl67hqft*6; __utmb=147393320.12.10.14839.16767676767',
}
# Form template for the agent-phone request; getPhone fills in `agentbid`.
data = {
    'agentbid': '',
}
# One session shared by every request in this script.
session = requests.session()
session.headers = headers
# Fetch a page
def getHtml(url):
    """Download *url* through the shared session and return its text.

    The response is decoded with the encoding requests detects from the
    body (``apparent_encoding``).  If anything goes wrong, the status code
    (when a response was received) or the exception is printed and ``None``
    is returned.
    """
    # Named `res`, not `re`, so the regex module is not shadowed; initialised
    # up front so the handler can tell "no response at all" from "bad response".
    res = None
    try:
        res = session.get(url)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as e:
        if res is not None:
            print(res.status_code)
        else:
            print(e)
        return None
# Get the total number of pages
def getNum(text):
    """Return the total page count read from a listing page.

    text: HTML of a listing page.

    The first element matching ``.fanye .txt`` holds the "共**页" label; the
    first run of digits in it is returned as an ``int`` so the caller can
    pass it straight to ``range()``.

    Raises IndexError if no element matches the selector and AttributeError
    if the label contains no digits.
    """
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # Pull the number out of the "共**页" label
    num = re.search(r'\d+', txt).group(0)
    return int(num)
# Collect the detail-page links
def getLink(tex):
    """Append the absolute URL of every listing found in *tex* to ``hrefs``.

    tex: HTML of a listing page, or ``None`` when the download failed.

    Each ``.title a`` anchor's ``href`` is resolved against the site root and
    appended to the module-level ``hrefs`` list.  Nothing is returned.
    """
    if not tex:
        # The download failed upstream; there is nothing to parse.
        return
    # Parse the argument itself rather than a same-named global `text`.
    soup = BeautifulSoup(tex, 'lxml')
    links = soup.select('.title a')
    for link in links:
        href = parse.urljoin('https://zz.zu.fang.com/', link['href'])
        hrefs.append(href)
# Parse a detail page
def parsePage(url):
    """Scrape one listing detail page and append its record to ``info``.

    url: absolute URL of the detail page.

    On HTTP 200 the title, price, block, building, address, detail text,
    agent name and agent phone are extracted, printed, and appended to the
    module-level ``info`` list as one tuple.  Any other status is printed
    together with the response body.  A page that cannot be parsed is
    reported and skipped so that one odd page does not stop the crawl.
    """
    res = session.get(url)
    if res.status_code != 200:
        # Print the response (`res`), not the regex module (`re`).
        print(res.status_code, res.text)
        return
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    try:
        title = soup.select('div .title')[0].text.strip().replace(' ', '')
        price = soup.select('div .trl-item')[0].text.strip()
        block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
        building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
        rconts = soup.select('.trl-item2 .rcont')
        try:
            address = rconts[2].text.strip()
        except IndexError:
            # Fall back to the second entry when there is no third one.
            address = rconts[1].text.strip()
        clearfix = soup.select('.clearfix')
        detail1 = clearfix[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail2 = clearfix[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail = detail1 + detail2
        name = soup.select('.zf_jjname')[0].text.strip()
        # The agent id is embedded in the page source as: buserid: '<digits>'
        buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
        phone = getPhone(buserid)
    except Exception as e:
        # Best effort: skip this page, but make the failure visible.
        print('parsePage failed for %s: %r' % (url, e))
        return
    print(title, price, block, building, address, detail, name, phone)
    house = (title, price, block, building, address, detail, name, phone)
    info.append(house)
# Fetch the agent's phone number
def getPhone(buserid):
    """Request the phone number of agent *buserid* from the site's Ajax endpoint.

    Returns the response text on HTTP 200; otherwise prints the status code
    and returns ``None``.
    """
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    # Build a per-call copy of the form instead of writing into the shared
    # module-level `data` dict.
    payload = dict(data, agentbid=buserid)
    res = session.post(url, data=payload)
    if res.status_code == 200:
        return res.text
    print(res.status_code)
    return None
if __name__ == '__main__':
    # Single-threaded crawl: collect every listing link, then parse each one.
    start_time = time.time()
    hrefs = []  # detail-page URLs, filled by getLink
    info = []   # scraped records, filled by parsePage
    init_url = 'https://zz.zu.fang.com/house/'
    # The page count is used as a range bound, so make sure it is an int.
    num = int(getNum(getHtml(init_url)))
    for i in range(0, num):
        url = f'https://zz.zu.fang.com/house/i3{i+1}/'
        text = getHtml(url)
        getLink(text)
    print(hrefs)
    for href in hrefs:
        parsePage(href)
    print("共获取%d条数据"%len(info))
    print("共耗时{}".format(time.time()-start_time))
    session.close()
二、优化为多线程爬虫
# Use a requests session instead of bare requests calls
# Parsing library: bs4
# Concurrency library: concurrent
import requests
# from lxml import etree  # parse with xpath
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time

# Request headers sent with every call made through the shared session.
# The cookie is a captured browser cookie; replace it with your own.
headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.comutmccn=(referral)utmcmd=referralutmcct=/; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5Elb_pc; Captcha=4937566532507336644D6557347143746B5A6A6B4A7A48445A422F2F6A51746C67516F31357446573052634562725162316152533247514250736F72775566574A2B33514357304B6976343D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4',
}
# Form template for the agent-phone request; getPhone fills in `agentbid`.
data = {
    'agentbid': '',
}
# One session shared by every request (and every worker thread) in this script.
session = requests.session()
session.headers = headers
# Fetch a page
def getHtml(url):
    """Download *url* through the shared session.

    Returns the body text, decoded with the encoding detected from the
    content, when the status is 200.  For any other status the code is
    printed and ``None`` is returned.
    """
    response = session.get(url)
    if response.status_code != 200:
        print(response.status_code)
        return None
    response.encoding = response.apparent_encoding
    return response.text
# Get the total number of pages
def getNum(text):
    """Return the total page count read from a listing page.

    text: HTML of a listing page.

    The first element matching ``.fanye .txt`` holds the "共**页" label; the
    first run of digits in it is returned as an ``int`` so the caller can
    pass it straight to ``range()``.

    Raises IndexError if no element matches the selector and AttributeError
    if the label contains no digits.
    """
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # Pull the number out of the "共**页" label
    num = re.search(r'\d+', txt).group(0)
    return int(num)
# Collect the detail-page links
def getLink(url):
    """Download listing page *url* and append its listing links to ``hrefs``.

    Each ``.title a`` anchor's ``href`` is resolved against the site root and
    appended to the module-level ``hrefs`` list.  Nothing is returned.
    """
    text = getHtml(url)
    if text is None:
        # getHtml returns None for a non-200 page; there is nothing to parse.
        return
    soup = BeautifulSoup(text, 'lxml')
    for link in soup.select('.title a'):
        hrefs.append(parse.urljoin('https://zz.zu.fang.com/', link['href']))
# Parse a detail page
def parsePage(url):
    """Scrape one listing detail page and append its record to ``info``.

    url: absolute URL of the detail page.

    On HTTP 200 the title, price, block, building, address, detail text,
    agent name and agent phone are extracted, printed, and appended to the
    module-level ``info`` list as one tuple.  Any other status is printed
    together with the response body.  A page that cannot be parsed is
    reported and skipped so that one odd page does not stop the crawl.
    """
    res = session.get(url)
    if res.status_code != 200:
        # Print the response (`res`), not the regex module (`re`).
        print(res.status_code, res.text)
        return
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    try:
        title = soup.select('div .title')[0].text.strip().replace(' ', '')
        price = soup.select('div .trl-item')[0].text.strip()
        block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
        building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
        rconts = soup.select('.trl-item2 .rcont')
        try:
            address = rconts[2].text.strip()
        except IndexError:
            # Fall back to the second entry when there is no third one.
            address = rconts[1].text.strip()
        clearfix = soup.select('.clearfix')
        detail1 = clearfix[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail2 = clearfix[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail = detail1 + detail2
        name = soup.select('.zf_jjname')[0].text.strip()
        # The agent id is embedded in the page source as: buserid: '<digits>'
        buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
        phone = getPhone(buserid)
    except Exception as e:
        # Best effort: skip this page, but make the failure visible.
        print('parsePage failed for %s: %r' % (url, e))
        return
    print(title, price, block, building, address, detail, name, phone)
    house = (title, price, block, building, address, detail, name, phone)
    info.append(house)
# Fetch the agent's phone number
def getPhone(buserid):
    """Request the phone number of agent *buserid* from the site's Ajax endpoint.

    Returns the response text on HTTP 200; otherwise prints the status code
    and returns ``None``.
    """
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    # Build a per-call copy of the form.  Writing the id into the shared
    # module-level `data` dict would let concurrent worker threads overwrite
    # each other's agent id between the assignment and the POST.
    payload = dict(data, agentbid=buserid)
    res = session.post(url, data=payload)
    if res.status_code == 200:
        return res.text
    print(res.status_code)
    return None
if __name__ == '__main__':
    # Two-stage threaded crawl: collect links with 5 workers, then parse the
    # detail pages with 30 workers.
    start_time = time.time()
    hrefs = []  # detail-page URLs, filled by getLink
    info = []   # scraped records, filled by parsePage
    init_url = 'https://zz.zu.fang.com/house/'
    # The page count is used as a range bound, so make sure it is an int.
    num = int(getNum(getHtml(init_url)))
    # Leaving the `with` block waits for all submitted tasks to finish.
    with ThreadPoolExecutor(max_workers=5) as t:
        for i in range(0, num):
            url = f'https://zz.zu.fang.com/house/i3{i+1}/'
            t.submit(getLink, url)
    print("共获取%d个链接"%len(hrefs))
    print(hrefs)
    with ThreadPoolExecutor(max_workers=30) as t:
        for href in hrefs:
            t.submit(parsePage, href)
    print("共获取%d条数据"%len(info))
    print("耗时{}".format(time.time()-start_time))
    session.close()
三、使用asyncio进一步优化
# Use a requests session instead of bare requests calls
# Parsing library: bs4
# Concurrency library: concurrent
import requests
# from lxml import etree  # parse with xpath
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
import re
import time
import asyncio

# Request headers sent with every call made through the shared session.
# The cookie is a captured browser cookie; replace it with your own.
headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e6%96%b0%e5%af%86%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014868%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%5d; __utma=147393320.427795962.1613371106.1613558547.1613575774.5; __utmc=147393320; __utmz=147393320.1613575774.5.4.utmcsr=zz.fang.comutmccn=(referral)utmcmd=referralutmcct=/; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; g_sourcepage=zf_fy%5Elb_pc; Captcha=4937566532507336644D6557347143746B5A6A6B4A7A48445A422F2F6A51746C67516F31357446573052634562725162316152533247514250736F72775566574A2B33514357304B6976343D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.9.10.1613575774; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*4',
}
# Form template for the agent-phone request; getPhone fills in `agentbid`.
data = {
    'agentbid': '',
}
# One session shared by every request (and every worker thread) in this script.
session = requests.session()
session.headers = headers
# Fetch a page
def getHtml(url):
    """Download *url* through the shared session.

    Returns the body text, decoded with the encoding detected from the
    content, when the status is 200.  For any other status the code is
    printed and ``None`` is returned.
    """
    response = session.get(url)
    if response.status_code != 200:
        print(response.status_code)
        return None
    response.encoding = response.apparent_encoding
    return response.text
# Get the total number of pages
def getNum(text):
    """Return the total page count read from a listing page.

    text: HTML of a listing page.

    The first element matching ``.fanye .txt`` holds the "共**页" label; the
    first run of digits in it is returned as an ``int`` so the caller can
    pass it straight to ``range()``.

    Raises IndexError if no element matches the selector and AttributeError
    if the label contains no digits.
    """
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # Pull the number out of the "共**页" label
    num = re.search(r'\d+', txt).group(0)
    return int(num)
# Collect the detail-page links
def getLink(url):
    """Download listing page *url* and append its listing links to ``hrefs``.

    Each ``.title a`` anchor's ``href`` is resolved against the site root and
    appended to the module-level ``hrefs`` list.  Nothing is returned.
    """
    text = getHtml(url)
    if text is None:
        # getHtml returns None for a non-200 page; there is nothing to parse.
        return
    soup = BeautifulSoup(text, 'lxml')
    for link in soup.select('.title a'):
        hrefs.append(parse.urljoin('https://zz.zu.fang.com/', link['href']))
# Parse a detail page
def parsePage(url):
    """Scrape one listing detail page and append its record to ``info``.

    url: absolute URL of the detail page.

    On HTTP 200 the title, price, block, building, address, detail text,
    agent name and agent phone are extracted, printed, and appended to the
    module-level ``info`` list as one tuple.  Any other status is printed
    together with the response body.  A page that cannot be parsed is
    reported and skipped so that one odd page does not stop the crawl.
    """
    res = session.get(url)
    if res.status_code != 200:
        # Print the response (`res`), not the regex module (`re`).
        print(res.status_code, res.text)
        return
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    try:
        title = soup.select('div .title')[0].text.strip().replace(' ', '')
        price = soup.select('div .trl-item')[0].text.strip()
        block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
        building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
        rconts = soup.select('.trl-item2 .rcont')
        try:
            address = rconts[2].text.strip()
        except IndexError:
            # Fall back to the second entry when there is no third one.
            address = rconts[1].text.strip()
        clearfix = soup.select('.clearfix')
        detail1 = clearfix[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail2 = clearfix[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail = detail1 + detail2
        name = soup.select('.zf_jjname')[0].text.strip()
        # The agent id is embedded in the page source as: buserid: '<digits>'
        buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
        phone = getPhone(buserid)
    except Exception as e:
        # Best effort: skip this page, but make the failure visible.
        print('parsePage failed for %s: %r' % (url, e))
        return
    print(title, price, block, building, address, detail, name, phone)
    house = (title, price, block, building, address, detail, name, phone)
    info.append(house)
# Fetch the agent's phone number
def getPhone(buserid):
    """Request the phone number of agent *buserid* from the site's Ajax endpoint.

    Returns the response text on HTTP 200; otherwise prints the status code
    and returns ``None``.
    """
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    # Build a per-call copy of the form.  Writing the id into the shared
    # module-level `data` dict would let concurrent worker threads overwrite
    # each other's agent id between the assignment and the POST.
    payload = dict(data, agentbid=buserid)
    res = session.post(url, data=payload)
    if res.status_code == 200:
        return res.text
    print(res.status_code)
    return None
# Thread pool that collects the detail-page links
async def Pool1(num):
    """Run ``getLink`` for listing pages 1..*num* on a 5-worker thread pool.

    num: number of listing pages (an int, or a string of digits).

    The executor futures are awaited, so the coroutine completes only when
    every page has been processed.  An exception raised by a worker is
    printed rather than silently discarded or allowed to abort the others.
    """
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=5) as t:
        task = [
            loop.run_in_executor(t, getLink, f'https://zz.zu.fang.com/house/i3{i+1}/')
            for i in range(0, int(num))
        ]
        # Await inside the `with` so the event loop, not the blocking
        # executor shutdown, is what waits for the workers.
        results = await asyncio.gather(*task, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print(result)
# Thread pool that parses the detail pages
async def Pool2(hrefs):
    """Run ``parsePage`` for every URL in *hrefs* on a 30-worker thread pool.

    hrefs: iterable of detail-page URLs.

    The executor futures are awaited, so the coroutine completes only when
    every page has been processed.  An exception raised by a worker is
    printed rather than silently discarded or allowed to abort the others.
    """
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=30) as t:
        task = [loop.run_in_executor(t, parsePage, href) for href in hrefs]
        # Await inside the `with` so the event loop, not the blocking
        # executor shutdown, is what waits for the workers.
        results = await asyncio.gather(*task, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print(result)
if __name__ == '__main__':
    # Two-stage crawl driven by an event loop: Pool1 collects the links,
    # Pool2 parses the detail pages.
    start_time = time.time()
    hrefs = []  # detail-page URLs, filled by getLink
    info = []   # scraped records, filled by parsePage
    init_url = 'https://zz.zu.fang.com/house/'
    # The page count is used as a range bound, so make sure it is an int.
    num = int(getNum(getHtml(init_url)))
    # Create the loop explicitly; calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python versions.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(Pool1(num))
        print("共获取%d个链接"%len(hrefs))
        print(hrefs)
        loop.run_until_complete(Pool2(hrefs))
    finally:
        loop.close()
    print("共获取%d条数据"%len(info))
    print("耗时{}".format(time.time()-start_time))
    session.close()
四、存入Mysql数据库
(一)建表
from sqlalchemy import create_engine
from sqlalchemy import String, Integer, Column, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session  # avoids thread-safety problems in the multi-threaded crawler
from sqlalchemy.ext.declarative import declarative_base

# Base class for the ORM models declared in this module.
BASE = declarative_base()
# Engine for the local MySQL database `pytest`.
# NOTE(review): credentials are hard-coded (root:root); adjust for your setup.
engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/pytest?charset=utf8",
    max_overflow=300,  # connections that may be opened beyond the pool size
    pool_size=100,  # connection pool size
    echo=False,  # do not print debug output
)
class House(BASE):
    """ORM model for one scraped rental listing, stored in table ``house``."""

    __tablename__ = 'house'
    # Surrogate primary key, auto-incremented.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # The remaining columns mirror the fields scraped from a detail page.
    title = Column(String(200))
    price = Column(String(200))
    block = Column(String(200))
    building = Column(String(200))
    address = Column(String(200))
    detail = Column(Text())
    name = Column(String(20))
    phone = Column(String(20))
# Create the tables declared on BASE in the target database.
BASE.metadata.create_all(engine)
# Session factory bound to the engine.
Session = sessionmaker(engine)
# Scoped session wrapping the factory; this is what the crawler imports.
sess = scoped_session(Session)
(二)将数据存入数据库中
# Use a requests session instead of bare requests calls
# Parsing library: bs4
# Concurrency library: concurrent
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib import parse
from mysqldb import sess, House
import re
import time
import asyncio

# Request headers sent with every call made through the shared session.
# The cookie is a captured browser cookie; replace it with your own.
headers = {
    'referer': 'https://zz.zu.fang.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'cookie': 'global_cookie=ffzvt3kztwck05jm6twso2wjw18kl67hqft; integratecover=1; city=zz; __utmc=147393320; ASP.NET_SessionId=vhrhxr1tdatcc1xyoxwybuwv; __utma=147393320.427795962.1613371106.1613575774.1613580597.6; __utmz=147393320.1613580597.6.5.utmcsr=zz.fang.comutmccn=(referral)utmcmd=referralutmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; Rent_StatLog=c158b2a7-4622-45a9-9e69-dcf6f42cf577; keyWord_recenthousezz=%5b%7b%22name%22%3a%22%e4%ba%8c%e4%b8%83%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014864%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e9%83%91%e4%b8%9c%e6%96%b0%e5%8c%ba%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a0842%2f%22%2c%22sort%22%3a1%7d%2c%7b%22name%22%3a%22%e7%bb%8f%e5%bc%80%22%2c%22detailName%22%3a%22%22%2c%22url%22%3a%22%2fhouse-a014871%2f%22%2c%22sort%22%3a1%7d%5d; g_sourcepage=zf_fy%5Elb_pc; Captcha=6B65716A41454739794D666864397178613772676C75447A4E746C657144775A347A6D42554F446532357649643062344F6976756E563450554E59594B7833712B413579506C4B684958343D; unique_cookie=U_0l0d1ilf1t0ci2rozai9qi24k1pkl9lcmrs*14; __utmb=147393320.21.10.1613580597',
}
# Form template for the agent-phone request; getPhone fills in `agentbid`.
data = {
    'agentbid': '',
}
# One session shared by every request (and every worker thread) in this script.
session = requests.session()
session.headers = headers
# Fetch a page
def getHtml(url):
    """Download *url* through the shared session.

    Returns the body text, decoded with the encoding detected from the
    content, when the status is 200.  For any other status the code is
    printed and ``None`` is returned.
    """
    response = session.get(url)
    if response.status_code != 200:
        print(response.status_code)
        return None
    response.encoding = response.apparent_encoding
    return response.text
# Get the total number of pages
def getNum(text):
    """Return the total page count read from a listing page.

    text: HTML of a listing page.

    The first element matching ``.fanye .txt`` holds the "共**页" label; the
    first run of digits in it is returned as an ``int`` so the caller can
    pass it straight to ``range()``.

    Raises IndexError if no element matches the selector and AttributeError
    if the label contains no digits.
    """
    soup = BeautifulSoup(text, 'lxml')
    txt = soup.select('.fanye .txt')[0].text
    # Pull the number out of the "共**页" label
    num = re.search(r'\d+', txt).group(0)
    return int(num)
# Collect the detail-page links
def getLink(url):
    """Download listing page *url* and append its listing links to ``hrefs``.

    Each ``.title a`` anchor's ``href`` is resolved against the site root and
    appended to the module-level ``hrefs`` list.  Nothing is returned.
    """
    text = getHtml(url)
    if text is None:
        # getHtml returns None for a non-200 page; there is nothing to parse.
        return
    soup = BeautifulSoup(text, 'lxml')
    for link in soup.select('.title a'):
        hrefs.append(parse.urljoin('https://zz.zu.fang.com/', link['href']))
# Parse a detail page
def parsePage(url):
    """Scrape one listing detail page, record it in ``info`` and in the database.

    url: absolute URL of the detail page.

    On HTTP 200 the title, price, block, building, address, detail text,
    agent name and agent phone are extracted, printed, appended to the
    module-level ``info`` list as one tuple, and inserted as a ``House`` row
    through ``sess``.  A failed insert is printed and rolled back.  Any
    other status is printed together with the response body.  A page that
    cannot be parsed is reported and skipped so that one odd page does not
    stop the crawl.
    """
    res = session.get(url)
    if res.status_code != 200:
        # Print the response (`res`), not the regex module (`re`).
        print(res.status_code, res.text)
        return
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    try:
        title = soup.select('div .title')[0].text.strip().replace(' ', '')
        price = soup.select('div .trl-item')[0].text.strip()
        block = soup.select('.rcont #agantzfxq_C02_08')[0].text.strip()
        building = soup.select('.rcont #agantzfxq_C02_07')[0].text.strip()
        rconts = soup.select('.trl-item2 .rcont')
        try:
            address = rconts[2].text.strip()
        except IndexError:
            # Fall back to the second entry when there is no third one.
            address = rconts[1].text.strip()
        clearfix = soup.select('.clearfix')
        detail1 = clearfix[4].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail2 = clearfix[5].text.strip().replace('\n\n\n', ',').replace('\n', '')
        detail = detail1 + detail2
        name = soup.select('.zf_jjname')[0].text.strip()
        # The agent id is embedded in the page source as: buserid: '<digits>'
        buserid = re.search(r"buserid: '(\d+)'", res.text).group(1)
        phone = getPhone(buserid)
    except Exception as e:
        # Best effort: skip this page, but make the failure visible.
        print('parsePage failed for %s: %r' % (url, e))
        return
    print(title, price, block, building, address, detail, name, phone)
    house = (title, price, block, building, address, detail, name, phone)
    info.append(house)
    try:
        house_data = House(
            title=title,
            price=price,
            block=block,
            building=building,
            address=address,
            detail=detail,
            name=name,
            phone=phone,
        )
        sess.add(house_data)
        sess.commit()
    except Exception as e:
        print(e)  # print the error
        sess.rollback()  # undo the failed insert
# Fetch the agent's phone number
def getPhone(buserid):
    """Request the phone number of agent *buserid* from the site's Ajax endpoint.

    Returns the response text on HTTP 200; otherwise prints the status code
    and returns ``None``.
    """
    url = 'https://zz.zu.fang.com/RentDetails/Ajax/GetAgentVirtualMobile.aspx'
    # Build a per-call copy of the form.  Writing the id into the shared
    # module-level `data` dict would let concurrent worker threads overwrite
    # each other's agent id between the assignment and the POST.
    payload = dict(data, agentbid=buserid)
    res = session.post(url, data=payload)
    if res.status_code == 200:
        return res.text
    print(res.status_code)
    return None
# Thread pool that collects the detail-page links
async def Pool1(num):
    """Run ``getLink`` for listing pages 1..*num* on a 5-worker thread pool.

    num: number of listing pages (an int, or a string of digits).

    The executor futures are awaited, so the coroutine completes only when
    every page has been processed.  An exception raised by a worker is
    printed rather than silently discarded or allowed to abort the others.
    """
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=5) as t:
        task = [
            loop.run_in_executor(t, getLink, f'https://zz.zu.fang.com/house/i3{i+1}/')
            for i in range(0, int(num))
        ]
        # Await inside the `with` so the event loop, not the blocking
        # executor shutdown, is what waits for the workers.
        results = await asyncio.gather(*task, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print(result)
# Thread pool that parses the detail pages
async def Pool2(hrefs):
    """Run ``parsePage`` for every URL in *hrefs* on a 30-worker thread pool.

    hrefs: iterable of detail-page URLs.

    The executor futures are awaited, so the coroutine completes only when
    every page has been processed.  An exception raised by a worker is
    printed rather than silently discarded or allowed to abort the others.
    """
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=30) as t:
        task = [loop.run_in_executor(t, parsePage, href) for href in hrefs]
        # Await inside the `with` so the event loop, not the blocking
        # executor shutdown, is what waits for the workers.
        results = await asyncio.gather(*task, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print(result)
if __name__ == '__main__':
    # Two-stage crawl driven by an event loop: Pool1 collects the links,
    # Pool2 parses the detail pages and stores them.
    start_time = time.time()
    hrefs = []  # detail-page URLs, filled by getLink
    info = []   # scraped records, filled by parsePage
    init_url = 'https://zz.zu.fang.com/house/'
    # The page count is used as a range bound, so make sure it is an int.
    num = int(getNum(getHtml(init_url)))
    # Create the loop explicitly; calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python versions.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(Pool1(num))
        print("共获取%d个链接"%len(hrefs))
        print(hrefs)
        loop.run_until_complete(Pool2(hrefs))
    finally:
        loop.close()
    print("共获取%d条数据"%len(info))
    print("耗时{}".format(time.time()-start_time))
    session.close()
五、最终效果图 (已打码)
到此这篇关于Python爬取城市租房信息实战分享的文章就介绍到这了,更多相关Python爬取租房信息内容请搜索盛行IT软件开发工作室以前的文章或继续浏览下面的相关文章,希望大家以后多多支持盛行IT软件开发工作室!
郑重声明:本文由网友发布,不代表盛行IT的观点,版权归原作者所有,仅为传播更多信息之目的,如有侵权请联系,我们将第一时间修改或删除,多谢。