""" @author: Ruvik @software: PyCharm @file: Spider_House.py @time: 2020/7/9 20:12 """
import requests import xlwt from lxml import etree import time import random import re
def main():
    """Crawl the Lianjia new-home listing pages and store the rows in an .xls file.

    The listing URL prefix is handed to ``getdata`` (which appends the page
    number), and the rows it returns are written by ``savedata`` to a
    workbook in the current directory.
    """
    listing_url = "https://hz.fang.lianjia.com/loupan/pg"
    output_file = ".\\杭州新房价.xls"
    savedata(getdata(listing_url), output_file)
def ask_url(url):
    """Download *url* and return the raw response body.

    Sends a GET request with a desktop-browser User-Agent and a 10 second
    timeout.  After a successful request the function sleeps for a random
    3-6 seconds before returning, which spaces out consecutive page fetches.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``bytes`` on success, or the empty string ``""``
        when the request fails (connection error, timeout, or an HTTP
        4xx/5xx status).  Callers should therefore test the result for
        truthiness before parsing it.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/83.0.4103.116 Safari/537.36"
        ),
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this check an error page (4xx/5xx) would be reported as a
        # success and handed to the parser.  HTTPError is a subclass of
        # RequestException, so it is handled by the clause below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # NOTE: the message says "timeout" but this branch covers every
        # request failure; the exception printed next gives the real cause.
        print("超时")
        print(e)
        return ""

    # Pause between successful requests to avoid hammering the server.
    time.sleep(random.randint(3, 6))
    print("请求访问成功")
    return response.content
def _first_text(nodes, default=""):
    """Return the first XPath result in *nodes*, or *default* when it is empty."""
    return nodes[0] if nodes else default


def _parse_house(item):
    """Build one seven-field row from a single listing card element.

    Missing fields yield an empty string (a single space for the size, as
    before) instead of raising IndexError, so one incomplete card cannot
    abort the whole crawl.

    Returns:
        ``[link, name, location, rooms, size, tags, price]``.
    """
    link = "https://hz.fang.lianjia.com" + _first_text(item.xpath('./div/a/@href'))
    name = _first_text(item.xpath("./div/a/text()"))

    # Location is the first two <span> texts plus the first <a> text,
    # separated by "|"; slicing tolerates cards that lack some of them.
    loc_spans = item.xpath('./div[@class="resblock-location"]/span/text()')
    loc_link = item.xpath('./div[@class="resblock-location"]/a/text()')
    location = "|".join(loc_spans[:2] + loc_link[:1])

    rooms = "|".join(item.xpath('./a[@class="resblock-room"]/span/text()'))
    size = _first_text(item.xpath('./div[@class="resblock-area"]/span/text()'), " ")
    tags = "|".join(item.xpath('./div[@class="resblock-tag"]/span/text()'))

    # The price is split across several <span>s; glue them together and drop
    # the non-breaking spaces the page puts between them.
    price_parts = item.xpath('./div[@class="resblock-price"]/div/span/text()')
    price = "".join(price_parts).replace("\xa0", "")

    return [link, name, location, rooms, size, tags, price]


def getdata(baseurl, max_pages=99):
    """Fetch the listing pages and collect one row per listing card.

    Pages ``baseurl + "1"`` through ``baseurl + str(max_pages)`` are
    downloaded with ``ask_url``; pages that fail to download or parse are
    skipped.  The total number of collected rows is printed at the end.

    Args:
        baseurl: URL prefix to which the page number is appended.
        max_pages: Number of the last page to fetch (default 99, matching
            the previously hard-coded range).

    Returns:
        A list of rows, each ``[link, name, location, rooms, size, tags,
        price]``.
    """
    Datalist = []
    for page in range(1, max_pages + 1):
        data = ask_url(baseurl + str(page))
        # ask_url returns "" on failure; an empty bytes body has nothing to
        # parse either, so skip both.
        if not data:
            continue
        html_data = etree.HTML(data)
        # Defensive: skip the page if the parser produced no document tree.
        if html_data is None:
            continue
        for item in html_data.xpath('//div[@class="resblock-desc-wrapper"]'):
            Datalist.append(_parse_house(item))
    print(len(Datalist))
    return Datalist
def savedata(Datalist, Savapath):
    """Write the scraped rows to an Excel ``.xls`` workbook.

    Creates a sheet named "HouseList" with a header row followed by one row
    per entry of *Datalist*, printing a progress line for each row written.

    Args:
        Datalist: Sequence of rows; the first seven values of each row are
            written under the seven header columns.
        Savapath: Path of the workbook file to create.
    """
    col = ("链接", "名字", "地址", "户型", "大小", "标签", "价格")
    house_list = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = house_list.add_sheet("HouseList", cell_overwrite_ok=True)

    for j, title in enumerate(col):
        worksheet.write(0, j, title)

    # Iterate over the rows that actually exist rather than a fixed 990:
    # a fixed count raised IndexError whenever fewer rows were scraped and
    # silently dropped anything beyond it.
    for i, item in enumerate(Datalist, start=1):
        print("正在写入第%d条数据" % i)
        for j, value in enumerate(item[:len(col)]):
            worksheet.write(i, j, value)

    house_list.save(Savapath)
if __name__ == "__main__":
    # Run the crawl only when executed as a script.  The completion message
    # belongs to that run, so it lives under the guard as well and is not
    # printed when the module is merely imported.
    main()
    print("爬取完毕!")
|