第七步:修改 Settings.py 文件 。
1、修改默认的请求信息(手机端浏览器) 。
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"User-Agent" : "DYZB/1 CFNetwork/808.2.16 Darwin/16.3.0",
}
文章插图
2.设置管道文件(管道文件类型:ImagesPipeline)
ImagesPipeline 是专门处理图片的管道文件 。
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'douyu.pipelines.ImagesPipeline': 300,
}
3.增加一个存储下载图片的本地路径 。
IMAGES_STORE = "D:/scrapy_project/douyu/Images"
文章插图
后续可以在 pipelines 管道文件程序里直接使用自己定义的图片存储路径变量 。
第七步:编写 pipelines 管道文件 。
pipelines 管道文件有专门处理 files(文件)、images(图片)、media(视频)3个库 。
C:Python34Libsite-packagesscrapypipelines
文章插图
查看 images.py 源码 。
"""
Images Pipeline
See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
import six
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
from PIL import Image
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.exceptions import DropItem
#TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.pipelines.files import FileException, FilesPipeline
class NoimagesDrop(DropItem):
"""Product with no images exception"""
class ImageException(FileException):
"""General image error exception"""
class ImagesPipeline(FilesPipeline):
"""Abstract pipeline that implement the image thumbnail generation logic
"""
MEDIA_NAME = 'image'
# Uppercase attributes kept for backward compatibility with code that subclasses
# ImagesPipeline. They may be overridden by settings.
MIN_WIDTH = 0
MIN_HEIGHT = 0
EXPIRES = 90
THUMBS = {}
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
DEFAULT_IMAGES_RESULT_FIELD = 'images'
def __init__(self, store_uri, download_func=None, settings=None):
super(ImagesPipeline, self).__init__(store_uri, settings=settings,
download_func=download_func)
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
resolve = functools.partial(self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=settings)
self.expires = settings.getint(
resolve("IMAGES_EXPIRES"), self.EXPIRES
)
if not hasattr(self, "IMAGES_RESULT_FIELD"):
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
if not hasattr(self, "IMAGES_URLS_FIELD"):
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
self.images_urls_field = settings.get(
resolve('IMAGES_URLS_FIELD'),
self.IMAGES_URLS_FIELD
)
self.images_result_field = settings.get(
resolve('IMAGES_RESULT_FIELD'),
self.IMAGES_RESULT_FIELD
)
self.min_width = settings.getint(
resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
)
self.min_height = settings.getint(
resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
)
self.thumbs = settings.get(
resolve('IMAGES_THUMBS'), self.THUMBS
)
@classmethod
def from_settings(cls, settings):
s3store = cls.STORE_SCHEMES['s3']
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
store_uri = settings['IMAGES_STORE']
return cls(store_uri, settings=settings)
def file_downloaded(self, response, request, info):
return self.image_downloaded(response, request, info)
def image_downloaded(self, response, request, info):
checksum = None
for path, image, buf in self.get_images(response, request, info):
if checksum is None:
buf.seek(0)
推荐阅读
- 七种颜色搭配方案,养生茶营销策划方案完整版
- Python爬虫练习:爬取800多所大学学校排名、星级等
- 麻将多少张?
- 大蛇2电影国语完整版免费观看 大蛇2免费完整版在线西瓜
- 7500航班真实情况 7500鬼航班完整版
- Python爬虫案例,腾讯动漫爬虫,步骤超详细解释,源码分析
- 书中自有黄金屋完整诗句?书中自有黄金屋的下一句是什么_1
- 四种Python爬虫常用的定位元素方法对比,你偏爱哪一款?
- Pyppeteer爬虫神器详解
- 破了这几种爬虫加密算法后,我的路更近了「JS逆向3」