1. 编辑Items.py
1 2 3 4
# Item representing one Baidu image: its source URL and the file name to save it under.
class baiduImage(scrapy.Item):
    imageName = scrapy.Field()  # sequential name assigned by the spider
    imageUrl = scrapy.Field()   # original image URL ("objURL") scraped from the page
|
2. 编写爬虫
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
class BaiduimageSpider(scrapy.Spider):
    """Spider that scrapes image URLs from Baidu image search result pages."""

    name = 'baiduImage'
    allowed_domains = ['image.baidu.com']
    keyword = '狗狗'
    base_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    start_urls = [base_url + keyword]
    i = 0  # running counter used to generate sequential image names

    def parse(self, response):
        """Yield one baiduImage item per image URL found, then follow pagination.

        The image URLs are embedded in inline JSON rather than <img> tags,
        so they are extracted with a regex on the raw page text.
        """
        # print(response.body)
        pic_urls = re.findall(r'"objURL":"(.*?)"', response.text)
        for pic_url in pic_urls:
            self.i += 1
            bi = baiduImage()
            bi['imageUrl'] = pic_url
            bi['imageName'] = str(self.i)
            yield bi  # handed to the item pipeline (pipelines.py) for download

        # Follow the "next page" link. On the last page extract_first()
        # returns None, and re.findall(pattern, None) raises TypeError,
        # so guard before matching. Also guard against the 'pn=' offset
        # missing from the URL before indexing the match list.
        next_page = response.css('a.n::attr(href)').extract_first()
        if next_page is not None:
            page = re.findall(r'pn=(.*)&gsm', next_page)
            # Only crawl the first few pages (offset <= 40).
            if page and int(page[0]) <= 40:
                yield response.follow(next_page, callback=self.parse)
|
3. 编写pipelines.py文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
class baiduImgPipeline(ImagesPipeline):
    """Image pipeline that downloads each item's image and renames the file."""

    # Image storage root as configured in settings.py (IMAGES_STORE).
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Schedule a download request for the item's single image URL."""
        yield scrapy.Request(item['imageUrl'])

    def item_completed(self, results, item, info):
        """Rename the downloaded image to the item's sequential name.

        `results` is a list of (success, file_info_or_error) tuples.
        Items with no successfully downloaded image are dropped.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # os.path.join is safer than string concatenation: it produces a
        # correct path whether or not IMAGES_STORE ends with a separator.
        src = os.path.join(self.IMAGES_STORE, image_paths[0])
        dst = os.path.join(self.IMAGES_STORE, 'full', item['imageName'] + '.jpg')
        os.rename(src, dst)
        return item
|
4. 配置settings.py文件
1 2 3 4
# Register the custom pipeline (extends Scrapy's built-in image download pipeline).
ITEM_PIPELINES = {
    'tutorial.pipelines.baiduImgPipeline': 1,
}

# Root directory where downloaded images are stored.
IMAGES_STORE = 'data/image/'
|
知识点
get_media_requests(item, info)
在工作流程中可以看到,管道会得到文件的URL并从项目中下载。为了这么做,你需要重写 get_media_requests() 方法,并对各个图片URL返回一个Request:
1 2 3
def get_media_requests(self, item, info):
    """Issue one download Request for every URL listed in the item's file_urls."""
    for url in item['file_urls']:
        yield scrapy.Request(url)
|
这些请求将被管道处理,当它们完成下载后,结果将以2-元素的元组列表形式传送到 item_completed() 方法: 每个元组包含 (success, file_info_or_error):