Crawling all images under a Zhihu question with Python


Use Python to crawl every image under a given Zhihu question. A photo-bait thread, say ???

Take this question as an example.

1. Approach

Open the question in Chrome and filter the requests in the DevTools console by the keyword `answer`. For one of these requests, the API server returns JSON. In that response, the `data` field contains three answers to the question, and each answer's `content` field holds the URLs of all images in that answer. Meanwhile, the `is_end` and `is_start` fields under `paging` tell us whether this is the last (or first) page, and the `paging` field also carries the URL of the next page. The problem is now simple: request the first page, pull the image links out of the answers with a regular expression and save them, then keep following the `next` URL from each response's `paging` field.
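A minimal sketch of that pagination loop, using the plain requests library instead of the grequests used in the full code below. The `include` parameter is stripped down here as an assumption (main.py requests many more fields), and `iter_answer_html` is just an illustrative name:

import requests

# Simplified answers endpoint; main.py below uses the full include parameter.
API = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.content&limit={}"

def iter_answer_html(question_id, limit=3):
    """Yield the HTML content of every answer, page by page."""
    url = API.format(question_id, limit)
    while url is not None:
        page = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).json()
        for answer in page["data"]:
            yield answer["content"]  # answer HTML containing the <img> tags
        paging = page["paging"]
        # paging.is_end marks the last page; paging.next is the next page's URL
        url = None if paging["is_end"] else paging["next"]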

2. The code

main.py

#  -*- coding: utf-8 -*-
import json
import os
import re
import threading  # only needed for the commented-out threaded saving below

import click
import grequests

BASE_URL = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&limit={}"
IMG_BASE_URL = "https://pic3.zhimg.com{}"


class ZhSpider(object):

    def __init__(self, question_id, min_voted_num):
        """
        :param question_id: id of the question to crawl
        :param min_voted_num: answers with fewer upvotes than this are skipped
        """
        self.min_voted_num = min_voted_num
        self.question_url = BASE_URL.format(question_id, 3)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        "Accept-Language": "en-us",
                        "Connection": "keep-alive",
                        "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7"}

    def start(self):
        """
        Start the spider.
        """
        # Create the images directory if it does not exist yet
        if not os.path.exists('images'):
            os.mkdir('images')
            print("Created directory 'images'; all images will be saved there")

        print('start')
        response = self.get(self.question_url)
        next_page_answer_url = self.parse(response)
        while next_page_answer_url is not None:
            response = self.get(next_page_answer_url)
            next_page_answer_url = self.parse(response)
        print("Done!")

    def get(self, url):
        """
        Send a GET request and return the response.
        """
        rs = [grequests.get(url, headers=self.headers)]
        response = grequests.map(rs)[0]
        return response

    def parse(self, response):
        print("Starting a new page")
        print("Processing the response...")
        dic = json.loads(response.text)
        # Answers in this response, skipping those with too few upvotes
        answers_list = [
            answer for answer in dic.get('data')
            if answer.get('voteup_count') > self.min_voted_num
        ]
        if len(answers_list) != 0:
            # Zhihu sorts answers by upvote count, highest first, so we can stop
            # at the first page on which no answer clears the threshold.
            for answer in answers_list:
                answer_html_content = answer.get('content')
                # Pick the image URLs out of the answer HTML with a regex
                incomplete_urls = re.findall(
                    r'https:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?',
                    answer_html_content)
                # Use a set to drop duplicate images
                tmp_img_urls = set(later_url for _, later_url in incomplete_urls)
                # Keep only the full-resolution images (their URLs end in '_r.<ext>')
                img_urls = []
                for img_url in tmp_img_urls:
                    if img_url[-5] == 'r':
                        img_urls.append(IMG_BASE_URL.format(img_url))
                print("Downloading {} images.".format(len(img_urls)))
                rs = (grequests.get(u, headers=self.headers, timeout=5) for u in img_urls)
                res = grequests.map(rs)
                # Drop failed requests and non-200 responses (e.g. 404s)
                res = [i for i in res if i is not None and i.status_code == 200]
                print("Saving {} images.".format(len(res)))
                self.save_imgs(res)
                # Alternatively, save the images on a separate thread:
                # save_img_t = threading.Thread(target=self.save_imgs, args=(res,))
                # print("Saving {} images.".format(len(res)))
                # save_img_t.start()
        else:
            # No answer on this page met the threshold: stop the spider
            return None

        if not dic.get('paging').get('is_end'):
            next_url = dic.get('paging').get('next')
            return next_url
        else:
            return None

    def save_imgs(self, data):
        """
        Write each downloaded image into the images directory, using the
        tail of its URL as the file name.
        """
        for i in data:
            with open('images/' + i.request.url[-18:], 'wb') as f:
                f.write(i.content)
        print("Saved {} images.".format(len(data)))


@click.command()
@click.option('--question', default="296631231", help="question id", type=str)
@click.option('--votenum', default=800, help="minimum upvote count; answers with fewer upvotes are skipped", type=int)
def start(question, votenum):
    zh_spider = ZhSpider(question, votenum)
    zh_spider.start()


if __name__ == "__main__":
    start()
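Since the options are handled by click, running the spider from a shell looks like this (both values shown are just the defaults from the code):

python main.py --question 296631231 --votenum 800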


3. Results

4. Finally

The code has been uploaded to GitHub.

Original post: https://www.huihuidehui.com/posts/4fcb5fd7.html

Author: 辉辉的辉
Published: 2019-10-01 · Updated: 2020-03-16
