From bcac5d04cd5cda0ce6542716ed80abcc9e258978 Mon Sep 17 00:00:00 2001 From: huangxiaofei Date: Sat, 9 Jul 2016 23:53:09 +0800 Subject: [PATCH 1/2] modified: auth.py add headers to requets.get --- auth.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/auth.py b/auth.py index ed51290..f6f6763 100644 --- a/auth.py +++ b/auth.py @@ -68,7 +68,15 @@ def __init__(self, message): def download_captcha(): url = "https://www.zhihu.com/captcha.gif" - r = requests.get(url, params={"r": random.random(), "type": "login"}, verify=False) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/", + 'X-Requested-With': "XMLHttpRequest" + } + r = requests.get(url, params={"r": random.random(), "type": "login"}, verify=False, headers=headers) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") image_name = u"verify." 
+ r.headers['content-type'].split("/")[1] @@ -96,7 +104,15 @@ def download_captcha(): def search_xsrf(): url = "http://www.zhihu.com/" - r = requests.get(url, verify=False) + headers = { + 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36", + 'Host': "www.zhihu.com", + 'Origin': "http://www.zhihu.com", + 'Pragma': "no-cache", + 'Referer': "http://www.zhihu.com/", + 'X-Requested-With': "XMLHttpRequest" + } + r = requests.get(url, verify=False, headers=headers) if int(r.status_code) != 200: raise NetworkError(u"验证码请求失败") results = re.compile(r"\ Date: Sun, 10 Jul 2016 12:45:24 +0800 Subject: [PATCH 2/2] modified: zhihu.py modify some encode problem and beautifulsoup find html match --- zhihu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zhihu.py b/zhihu.py index e00e939..8234cf9 100755 --- a/zhihu.py +++ b/zhihu.py @@ -314,7 +314,7 @@ def get_title(self): if self.soup == None: self.parser() soup = self.soup - title = soup.find("h2", class_="zm-item-title").string.encode("utf-8").replace("\n", "") + title = soup.find("span", class_="zm-editable-content").string.encode("utf-8").replace("\n", "") self.title = title if platform.system() == 'Windows': title = title.decode('utf-8').encode('gbk') @@ -662,7 +662,7 @@ def get_topics_num(self): if self.soup == None: self.parser() soup = self.soup - topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[1].strong.string.encode("utf-8") + topics_num = soup.find_all("div", class_="zm-profile-side-section-title")[2].strong.string.encode("utf-8") I='' for i in topics_num: if i.isdigit(): @@ -1067,7 +1067,7 @@ def get_question(self): if self.soup == None: self.parser() soup = self.soup - question_link = soup.find("h2", class_="zm-item-title zm-editable-content").a + question_link = soup.find("h2", class_="zm-item-title").a url = "http://www.zhihu.com" + question_link["href"] title = 
question_link.string.encode("utf-8") question = Question(url, title) @@ -1144,7 +1144,8 @@ def to_txt(self): if platform.system() == 'Windows': anon_user_id = "匿名用户".decode('utf-8').encode('gbk') else: - anon_user_id = "匿名用户" + anon_user_id = "匿名用户" + anon_user_id = "匿名用户".decode('utf-8').encode('gbk') if self.get_author().get_user_id() == anon_user_id: if not os.path.isdir(os.path.join(os.path.join(os.getcwd(), "text"))): os.makedirs(os.path.join(os.path.join(os.getcwd(), "text")))