1. 编辑 items.py

1
2
3
4
class baiduImage(scrapy.Item):
    """Item describing one picture scraped from Baidu image search."""

    # Name (without extension) the downloaded picture is renamed to.
    imageName = scrapy.Field()
    # Source URL the picture is downloaded from.
    imageUrl = scrapy.Field()

2. 编写爬虫

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class BaiduimageSpider(scrapy.Spider):
    """Crawl Baidu image search results for ``keyword`` and emit image items.

    Every picture URL found on a result page becomes one ``baiduImage``
    item; the spider then follows the "next page" link while the ``pn``
    value in that link is at most 40.
    """

    name = 'baiduImage'
    allowed_domains = ['image.baidu.com']
    keyword = '狗狗'
    base_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    start_urls = [base_url + keyword]
    # Running count of pictures seen so far; used as the image name.
    i = 0

    def parse(self, response):
        """Yield one item per picture on the page, then follow the next page.

        Args:
            response: The downloaded search result page.

        Yields:
            ``baiduImage`` items, followed by at most one request for the
            next result page.
        """
        pic_urls = re.findall('"objURL":"(.*?)"', response.text)
        for pic_url in pic_urls:
            self.i += 1
            bi = baiduImage()
            bi['imageUrl'] = pic_url
            bi['imageName'] = str(self.i)
            # Hand the item over for use by pipelines.py.
            yield bi

        next_page = response.css('a.n::attr(href)').extract_first()
        if next_page is None:
            # No next-page link was found, so there is nothing to follow.
            return

        # Match digits only so an unexpected query layout cannot break int().
        offset = re.search(r'[?&]pn=(\d+)', next_page)
        if offset is not None and int(offset.group(1)) <= 40:
            yield response.follow(next_page, callback=self.parse)

3. 编写pipelines.py文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class baiduImgPipeline(ImagesPipeline):
    """Download each item's picture and rename it after ``imageName``."""

    # Image storage path read from the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL."""
        print(item, '////////////////////////////////////////////////////')
        yield scrapy.Request(item['imageUrl'])

    def item_completed(self, results, item, info):
        """Rename the downloaded picture to ``full/<imageName>.jpg``.

        Args:
            results: List of ``(ok, info)`` tuples, one per download request.
            item: The item whose picture was requested.
            info: Pipeline bookkeeping object (unused here).

        Returns:
            The unchanged item.

        Raises:
            DropItem: If none of the downloads succeeded.
        """
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem("Item contains no images")

        # Rename the picture. os.path.join keeps the paths valid whether or
        # not IMAGES_STORE ends with a path separator.
        os.rename(
            os.path.join(self.IMAGES_STORE, image_path[0]),
            os.path.join(self.IMAGES_STORE, 'full', item['imageName'] + '.jpg'),
        )
        return item

4. 配置settings.py文件

1
2
3
4
# Path under which downloaded images are stored.
IMAGES_STORE = 'data/image/'

# Enable the project's extension of the built-in image download pipeline.
ITEM_PIPELINES = {'tutorial.pipelines.baiduImgPipeline': 1}

知识点

get_media_requests(item, info)
在工作流程中可以看到,管道会从 item 中获取待下载文件的 URL。为此,你需要重写 get_media_requests() 方法,并为每个文件 URL 返回一个 Request:

1
2
3
def get_media_requests(self, item, info):
    """Yield one download request for each URL in ``item['file_urls']``."""
    for file_url in item['file_urls']:
        yield scrapy.Request(file_url)

这些请求将被管道处理,当它们完成下载后,结果将以2-元素的元组列表形式传送到 item_completed() 方法: 每个元组包含 (success, file_info_or_error):