# 爬的是杭州近期的新房价

# encoding: utf-8
"""
@author: Ruvik
@software: PyCharm
@file: Spider_House.py
@time: 2020/7/9 20:12
"""

# 爬一下链家网上杭州新房的价格
import requests # 进行网络请求
import xlwt # 与excel相关的操作
from lxml import etree # 引入xpath库,方便定位元素
import time # 进行访问频率控制
import random # 随机数生成
import re # 正则表达式

def main():
    """Crawl the Lianjia Hangzhou new-home listing pages and save them to Excel.

    Fetches and parses the listings through ``getdata`` and writes the
    resulting rows to an ``.xls`` workbook through ``savedata``.
    """
    # Listing URL without the page number; the page index is appended to it
    # when paging through the results.
    base_url = "https://hz.fang.lianjia.com/loupan/pg"
    # A plain relative filename lands in the current working directory on
    # every platform. A ".\\" prefix is only a path separator on Windows; on
    # other systems it would become part of the file name itself.
    save_path = "杭州新房价.xls"
    datalist = getdata(base_url)
    savedata(datalist, save_path)

def ask_url(url):
    """Download ``url`` and return the response body.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The raw response body (``bytes``) on success, or the empty string
        ``""`` when the request fails for any reason: connection error,
        timeout, or an HTTP error status.
    """
    html = ""
    # Send a desktop-browser User-Agent so the request looks like it comes
    # from a regular browser rather than a script.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Throttle: pause 3-6 seconds after every response before the caller
        # can issue the next request.
        time.sleep(random.randint(3, 6))
        # Treat 4xx/5xx answers as failures instead of handing an error page
        # to the parser; HTTPError is a RequestException and is caught below.
        response.raise_for_status()
        html = response.content
        print("请求访问成功")
    except requests.exceptions.RequestException as e:
        # RequestException covers timeouts, connection errors and HTTP
        # errors, so report a generic failure rather than "timeout".
        print("请求失败")
        print(e)
    return html

def _first_or_default(nodes, default=""):
    """Return the first XPath result in ``nodes``, or ``default`` if it is empty."""
    return nodes[0] if nodes else default


def _parse_item(item):
    """Extract the fields of one ``resblock-desc-wrapper`` element.

    Missing sub-elements yield empty values instead of raising, so a single
    malformed listing cannot abort the whole crawl.

    Returns:
        A 7-element list ``[link, name, location, rooms, size, tags, price]``
        in the column order that ``savedata`` writes.
    """
    # Detail-page link: the href is site-relative, so prefix the host.
    link = "https://hz.fang.lianjia.com" + _first_or_default(
        item.xpath('./div/a/@href')
    )

    name = _first_or_default(item.xpath("./div/a/text()"))

    # Location: the first two <span> texts plus the first <a> text, "|"-joined.
    location_spans = item.xpath('./div[@class="resblock-location"]/span/text()')
    location_links = item.xpath('./div[@class="resblock-location"]/a/text()')
    location = "|".join(location_spans[:2] + location_links[:1])

    # Room layouts, "|"-separated.
    rooms = "|".join(
        str(text) for text in item.xpath('./a[@class="resblock-room"]/span/text()')
    )

    # Area: a single space marks a listing without an area entry.
    size = _first_or_default(
        item.xpath('./div[@class="resblock-area"]/span/text()'), " "
    )

    # Tags, "|"-separated.
    tags = "|".join(
        str(text) for text in item.xpath('./div[@class="resblock-tag"]/span/text()')
    )

    # Price: concatenate all span texts and drop non-breaking spaces.
    price_parts = item.xpath('./div[@class="resblock-price"]/div/span/text()')
    price = "".join(price_parts).replace("\xa0", "")

    return [link, name, location, rooms, size, tags, price]


def getdata(baseurl, first_page=1, last_page=99):
    """Crawl the listing pages and return one row per listing.

    Args:
        baseurl: Listing URL up to, but not including, the page number.
        first_page: First page index to request (inclusive).
        last_page: Last page index to request (inclusive). The defaults
            cover pages 1 through 99.

    Returns:
        A list of 7-element lists
        ``[link, name, location, rooms, size, tags, price]``.
    """
    Datalist = []
    for page in range(first_page, last_page + 1):
        url = baseurl + str(page)  # append the page index to switch pages
        data = ask_url(url)
        # ask_url returns "" on failure; an empty body is equally unusable.
        if not data:
            continue
        html_data = etree.HTML(data)
        if html_data is None:
            # Nothing parseable came back for this page; move on.
            continue
        for item in html_data.xpath('//div[@class="resblock-desc-wrapper"]'):
            Datalist.append(_parse_item(item))

    print(len(Datalist))
    return Datalist

def savedata(Datalist, Savapath):
    """Write the scraped rows to an ``.xls`` workbook.

    Args:
        Datalist: Sequence of rows; each row is a sequence whose first seven
            entries are link, name, location, rooms, size, tags and price.
        Savapath: Destination path of the workbook file.
    """
    col = ("链接", "名字", "地址", "户型", "大小", "标签", "价格")  # header row
    house_list = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = house_list.add_sheet("HouseList", cell_overwrite_ok=True)

    for j, title in enumerate(col):
        worksheet.write(0, j, title)

    # Iterate over the rows that were actually collected instead of assuming
    # a fixed count: a short crawl must still be saved, and a long one must
    # not be truncated. Row 0 holds the header, so data starts at row 1.
    for i, item in enumerate(Datalist, start=1):
        print("正在写入第%d条数据" % i)
        # Only write as many cells as there are header columns.
        for j, value in enumerate(item[:len(col)]):
            worksheet.write(i, j, value)

    house_list.save(Savapath)

# Script entry point: run the crawl only when this file is executed directly,
# then report completion.
if __name__ == "__main__":
    main()

    print("爬取完毕!")