Source code: scraping 200,000 company records in one day with Python (Part 3)


        brand = brand[0].text
        area = soup.select('tr:nth-child(9) > td:nth-child(2) > span')
        area = area[0].text
        industry = soup.select('tr:nth-child(1) > td:nth-child(4)')
        industry = industry[0].text.strip()
        address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)')
        address = address[0].text.split('：')[1]
        jingying = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(8)')
        jingying = jingying[0].text.split('：')[1]
        date = soup.select('tr:nth-child(5) > td:nth-child(4) > span')
        date = date[0].text
        wangzhi = soup.select('tr:nth-child(12) > td:nth-child(4) > p > span > a')
        wangzhi = wangzhi[0].text
        # Put all the extracted fields into one list and print it to the console
        data = [company, date, name, legal_person, shouji, dianhua, chuanzhen, company_type, jingying,
                industry, product, wangzhi, area, brand, address, main_address]
        print(data)
        with open('服装1.csv', 'a', newline='', encoding='GB2312') as csvfile:
            w1 = csv.writer(csvfile)
            w1.writerow(data)
    except:
        # GB2312 cannot encode some characters, so fall back to a UTF-8 file
        with open('服装2.csv', 'a', newline='', encoding='utf-8-sig') as csvfile:
            w1 = csv.writer(csvfile)
            w1.writerow(data)
            print('utf-8 fallback succeeded')
# Use concurrency to speed up the crawl. This article covers 50 sites, so up to 50 threads could be used,
# but the thread count should be tuned per site: too many threads can overwhelm the target and cause data loss.
executor = ThreadPoolExecutor(max_workers=10)
# submit() arguments: the first is the function to run, followed by that function's arguments (multiple allowed)
future_tasks = [executor.submit(parser, url) for url in wzs1]
# Wait for all threads to complete before continuing
wait(future_tasks, return_when=ALL_COMPLETED)
print('All information has been scraped')
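
Because this is the third installment, the snippet above leans on scaffolding from Parts 1 and 2: the imports, the parser function and its try block, the soup object, and the wzs1 URL list. The sketch below is only a hypothetical reconstruction of that context (the requests fetch, the timeout value, and the empty wzs1 placeholder are assumptions, not code from this series):

import csv
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

import requests
from bs4 import BeautifulSoup

wzs1 = []  # assumption: the list of company detail-page URLs collected in Parts 1-2

def parser(url):
    try:
        # assumed fetch step that builds the soup object queried by the selectors above
        response = requests.get(url, timeout=10)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')
        # ... extraction of company, name, legal_person, shouji, dianhua, chuanzhen,
        # company_type, product, main_address and the fields shown above goes here,
        # ending with the CSV write ...
    except:
        pass  # in the real code this branch is the utf-8-sig fallback shown above

With that context in place, the executor block at the end of the snippet dispatches parser over every URL in wzs1 and blocks until all the futures finish.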


