Scraping Ziroom rental listings with Python (Part 2)

Saving to MongoDB

Make sure the MongoDB service is already running, otherwise every save is bound to fail.
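Before kicking off the crawl it can be worth checking that the connection actually works, so the script fails fast instead of erroring on every insert. Below is a minimal sketch; mongodb_is_up is a helper name I made up for this illustration and the 2-second timeout is an arbitrary choice:

import pymongo
from pymongo.errors import ServerSelectionTimeoutError

def mongodb_is_up(host="localhost", timeout_ms=2000):
    # A short server-selection timeout makes a missing service show up quickly
    client = pymongo.MongoClient(host=host, serverSelectionTimeoutMS=timeout_ms)
    try:
        client.admin.command("ping")  # trivial command that only succeeds against a live server
        return True
    except ServerSelectionTimeoutError:
        return False

Calling it at the top of main() and returning early when it reports False keeps the failure obvious.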
def save_to_mongodb(result):
    """Save a record to MongoDB"""
    # Create the client object, connecting to the local MongoDB instance
    client = pymongo.MongoClient(host="localhost")
    # Select the database (named iroomz here)
    db = client.iroomz
    # Select the collection, here roominfo
    db_table = db.roominfo
    try:
        # Write the record to the database
        if db_table.insert(result):
            print("---存储到数据库成功---", result)
    except Exception:
        print("---存储到数据库失败---", result)

Complete code

# -*- coding: utf-8 -*-

import requests
import time
import pymongo
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(page):
    '''Fetch the source of a single page'''
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(sourcehtml):
    '''Parse the source of a single page'''
    contentTree = etree.HTML(sourcehtml)  # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract the listing nodes with XPath
    for result in results[1:]:
        # drop the first five characters of the title text
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # use join to connect the list items with a single space
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()  # needs a check here, rewritten as the next line
        nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
        data = {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        save_to_mongodb(data)
        # yield {"pages": pages}


def get_pages():
    """Get the total number of pages"""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共页"))
    return pages


def save_to_mongodb(result):
    """Save a record to MongoDB"""
    # Create the client object, connecting to the local MongoDB instance
    client = pymongo.MongoClient(host="localhost")
    # Select the database (named iroomz here)
    db = client.iroomz
    # Select the collection, here roominfo
    db_table = db.roominfo
    try:
        # Write the record to the database
        if db_table.insert(result):
            print("---存储到数据库成功---", result)
    except Exception:
        print("---存储到数据库失败---", result)


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        parse_one_page(html)


if __name__ == '__main__':
    main()
    time.sleep(1)
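One compatibility note on the storage call: Collection.insert() has been deprecated since PyMongo 3.0 and was removed in PyMongo 4.0, so on a newer driver db_table.insert(result) will fail. If that happens, the body of save_to_mongodb can be switched to insert_one(), roughly as sketched below (the _v4 name is mine, not part of the original script):

import pymongo

def save_to_mongodb_v4(result):
    """Same behaviour as save_to_mongodb above, written against the PyMongo 3+/4+ API."""
    client = pymongo.MongoClient(host="localhost")
    db_table = client.iroomz.roominfo
    try:
        # insert_one() is the replacement for the insert() method removed in PyMongo 4.0
        inserted = db_table.insert_one(result)
        print("---存储到数据库成功---", inserted.inserted_id)
    except Exception:
        print("---存储到数据库失败---", result)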


Final result

[Figure: final result]
 
Summary

Notes on using XPath in step 3:
title = result.xpath("./div/h3/a/text()")此处的点'.'不能忘记, 它表示当前节点, 如果不加'.', '/'就表示从根节点开始选取在第四步获取多个页面时出现索引超出范围错误
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()IndexError: list index out of range造成这种错误原因有两种:
  1. [index] the index is beyond the range of the list
  2. [index] the list being indexed is empty
Since the index used for nearby is 0, the first case can be ruled out, which means the XPath query returned an empty list for that row; adding an if check solves it.
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()#改写以后:nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()"))>0 else ""以上主要是对爬虫过程学习的总结, 若有不对的地方, 还请指正, 谢谢!
The above is mainly a summary of what I learned while working through this crawler. If anything here is wrong, please point it out, thanks!


