源码用python一天爬取20万条企业信息( 二 ) _python

try:
# 对响应体进行解析
soup = BeautifulSoup(respo, "lxml")
# 找到页面子链接，进入子页面，对子页面进行抓取
# 用select函数抽取需要的内容，单击需要的内容》检查》copy select
lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
lianjie = lianjie[0].get('href')
wzs1.append(lianjie)
print(lianjie)
company = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong") # 标题
company = company[0].text
# 匹配电话号码
dianhua = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(3)") # 地址
dianhua = dianhua[0].text.split("：")[1]
# 匹配手机号码
phone = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(4)") # 日租价格
phone = phone[0].text.split("：")[1]
# 匹配传真
chuanzhen = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(5)") # 月租价格
chuanzhen = chuanzhen[0].text.split("：")[1]
# 经营模式
jingying = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(8)") # 面积大小
jingying = jingying[0].text.split("：")[1]
# 公司地址
address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)') # 抽取建造年份
address = address[0].text.split("：")[1]
# 公司简介
# introduction = soup.select("#main > div.main > div.intro > div.intros > div.text > p") # 楼层属性
# introduction = introduction[0].text.strip()
data = https://www.isolves.com/it/cxkf/yy/Python/2019-08-14/[company, address, dianhua, phone, chuanzhen, jingying]
print(data)
with open('首富网企业7.csv', 'a+', newline='', encoding='GB2312', errors='ignore') as csvfile:
w1 = csv.writer(csvfile)
w1.writerow(data, [1]) except:
print("出错！")
async def main(loop):
async with aiohttp.ClientSession() as sess:
tasks = []
for ii in wzs:
ur = ii
try:
tasks.append(loop.create_task(get_html(sess, ur)))
except:
print('error')
# 设置0.1的网络延迟增加爬取效率
await asyncio.sleep(0.1)
finished, unfinised = await asyncio.wait(tasks)
for i1 in finished:
await parser(i1.result())
if __name__ == '__main__':
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
print("花费时间", time.time() - t1)
print('详细页链接抓取完毕！')
"""
########################################################################################################################
# 使用并发法获取详细页的内容
########################################################################################################################
# 定义函数获取每个网页需要爬取的内容
def parser(url):
global data
try:
res = requests.get(url, headers=headers)
# 对响应体进行解析
soup = BeautifulSoup(res.text, 'lxml')
# 找到页面子链接，进入子页面，对子页面进行抓取
# 用select函数抽取需要的内容，单击需要的内容》检查》copy select
company = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong')
company = company[0].text
name = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(2)')
name = name[0].text
dianhua = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(3)')
dianhua = dianhua[0].text.split('：')[1]
shouji = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(4)')
shouji = shouji[0].text.split('：')[1]
chuanzhen = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(5)')
chuanzhen = chuanzhen[0].text.split('：')[1]
product = soup.select('tr:nth-child(1) > td:nth-child(2)')
product = product[0].text
company_type = soup.select('tr:nth-child(2) > td:nth-child(2) > span')
company_type = company_type[0].text.strip()
legal_person = soup.select('tr:nth-child(3) > td:nth-child(2)')
legal_person = legal_person[0].text
main_address = soup.select('tr:nth-child(5) > td:nth-child(2) > span')
main_address = main_address[0].text
brand = soup.select('tr:nth-child(6) > td:nth-child(2) > span')

源码用python一天爬取20万条企业信息( 二 )

推荐阅读

豪门明星|男网红晒账单炫富！买110万铂金包订300万家具，打扮女性化惹争议

国际茶价走高印度茶出口产量均呈负增长

蓝科技网|世界第一个AI图像传感器又是索尼制造

煤气炉什么牌子好(什么牌子的煤气灶好用又省气)

用文艺照进现实《故事里的中国》第二季致敬新时代最闪亮的星

军武阅读：能造几个航母？，如果我国每人出100块

小熊议情感|“逆商，让你的人生更高级”，原创

上岸人的独白|清华重返榜首，哈工大第六，复旦跌至第十，双一流大学评选收官

烹饪|洋葱焖猪排，快手焗鱼，涝汁凉拌金针菇，干煸菜花的做法

洋葱|全美洋葱中毒事态进一步扩大

win10系统如何共享打印机共享

BAT|没错，互联网进入到了下半场

中国历史上第一位女诗人，以女子身拯救母国于存亡之际

健康王医师|排毒养颜，延缓衰老，脸部皮肤白皙光滑，每天晚上吃3种食物

马来西亚|马来西亚出现超强变异毒株：传染性是原来10倍，现有疫苗或对其无效

陈凯歌|原创《演员2》郭敬明又任性？跟李成儒又舌战，陈凯歌飙英文表达不满

“欢乐春节·中国印象”演出在希腊雅典大学举办

公交车|这一脚，太暖心……

|接连“盯上”十几家企业，卖冰箱也不放过？，美国又开始“作妖”

历史劲爆点：缘分与桃花情意绵绵，主动去挽回幸福爱情的4大生肖！，两周之内

源码 用python一天爬取20万条企业信息( 二 )

推荐阅读

源码用python一天爬取20万条企业信息( 二 )