From 00fbddef41fc05cb5fe74a2969b287b429989549 Mon Sep 17 00:00:00 2001 From: Thinkerinhell Date: Fri, 6 May 2016 17:41:48 +1000 Subject: [PATCH 1/3] add comments object and test case to get comment --- test.py | 10 +++++++ zhihu.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/test.py b/test.py index c7a9ceb..c291f20 100755 --- a/test.py +++ b/test.py @@ -87,6 +87,9 @@ def answer_test(answer_url): answer.to_txt() # 把答案输出为markdown文件 answer.to_md() + #该回答下的所有评论 + all_comments = answer.get_comments() + print question # @@ -101,6 +104,11 @@ def answer_test(answer_url): print visit_times # 输出: 改答案所属问题被浏览次数 + # 输出: 所有答主在该问题下的评论 + for c in all_comments : + if c.get_answer_author_flag(): + print c.get_content() + def user_test(user_url): user = User(user_url) # 获取用户ID @@ -282,6 +290,7 @@ def main(): question_test(url) answer_url = "http://www.zhihu.com/question/24269892/answer/29960616" answer_test(answer_url) + user_url = "http://www.zhihu.com/people/jixin" user_test(user_url) collection_url = "http://www.zhihu.com/collection/36750683" @@ -293,6 +302,7 @@ def main(): test() + if __name__ == '__main__': main() diff --git a/zhihu.py b/zhihu.py index 222eaea..95bd5b7 100755 --- a/zhihu.py +++ b/zhihu.py @@ -490,6 +490,7 @@ def get_visit_times(self): return int(soup.find("meta", itemprop="visitsCount")["content"]) + class User: user_url = None # session = None @@ -1173,6 +1174,32 @@ def get_voters(self): voter_id = voter_info.a["title"].encode("utf-8") yield User(voter_url, voter_id) + def get_comments(self): + if self.soup == None: + self.parser() + soup = self.soup + + try: + #print soup.find("div", {"class":lambda x : x and "zm-item-answer" in x.split()})["data-aid"] + data_aid = soup.find("div", {"class":lambda x : x and "zm-item-answer" in x.split()})["data-aid"] + request_url = 'http://www.zhihu.com/node/AnswerCommentListV2' + # if session == None: + # create_session() + # s = session + # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) + r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) + soup = BeautifulSoup(r.content, "lxml") + comments = soup.findAll("div",{"class":"zm-item-comment"}) + + #print comments + if len(comments) == 0: + return + yield + else: + for comment in comments: + yield Comment(comment["data-id"],comment) + except TypeError as err: + print 'type error in get comments' class Collection: url = None @@ -1293,3 +1320,65 @@ def get_top_i_answers(self, n): if j > n: break yield answer + + +class Comment: + comment_id = None + soup = None + + def setFlag(self, input): + if (u"提问者" in input): + self.question_author_flag = True + if (u"作者" in input): + self.answer_author_flag = True + + def parser(self): + soup = self.soup + commenthddiv = soup.find("div",{"class":"zm-comment-hd"}) + + if (commenthddiv.contents[0].strip() == u"匿名用户"): + #print(u"user link is {0}, user id is {1}".format(None,u"匿名用户")) + self.author = User(None, u"匿名用户") + self.setFlag(commenthddiv.contents[1].string) + else: + apart = commenthddiv.find("a", {"class":"zg-link"}) + if (apart is not None): + #print(u"user link is {0}, user id is {1}".format(apart['href'],apart.string)) + self.author = User(apart['href'], apart.string) + self.setFlag(apart.next_sibling.string) + + self.content = (" ".join(soup.find("div",{"class":"zm-comment-content"}).stripped_strings)) + +# def __init__(self, comment_id, soup, author=None, question_author_flag=None, answer_author_flag=None, content=None): + def __init__(self, comment_id, soup): + self.comment_id = comment_id + self.soup = soup + # print 'collection url',url + #if author != None: + # self.author = author + #if question_author_flag != None: + # self.question_author_flag = question_author_flag + #if answer_author_flag != None: + # self.creator = answer_author_flag + #if content != None: + # self.content = content + self.question_author_flag = False + self.answer_author_flag = False + self.parser() + + def get_author(self): + return self.author + + def get_content(self): + content = self.content + if platform.system() == 'Windows': + content = content.decode('utf-8').encode('gbk') + return content + else: + return content + + def get_question_author_flag(self): + return self.question_author_flag + + def get_answer_author_flag(self): + return self.answer_author_flag \ No newline at end of file From f6b0e26b7afe1037fa6786958baeeb79f1a47f25 Mon Sep 17 00:00:00 2001 From: Thinkerinhell Date: Fri, 6 May 2016 17:51:15 +1000 Subject: [PATCH 2/3] removed some print debug comments --- .idea/vcs.xml | 6 ++++++ zhihu.py | 24 ++++-------------------- 2 files changed, 10 insertions(+), 20 deletions(-) create mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/zhihu.py b/zhihu.py index 95bd5b7..6312a19 100755 --- a/zhihu.py +++ b/zhihu.py @@ -1180,18 +1180,13 @@ def get_comments(self): soup = self.soup try: - #print soup.find("div", {"class":lambda x : x and "zm-item-answer" in x.split()})["data-aid"] data_aid = soup.find("div", {"class":lambda x : x and "zm-item-answer" in x.split()})["data-aid"] request_url = 'http://www.zhihu.com/node/AnswerCommentListV2' - # if session == None: - # create_session() - # s = session - # r = s.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) + r = requests.get(request_url, params={"params": "{\"answer_id\":\"%d\"}" % int(data_aid)}) soup = BeautifulSoup(r.content, "lxml") comments = soup.findAll("div",{"class":"zm-item-comment"}) - #print comments if len(comments) == 0: return yield @@ -1337,31 +1332,19 @@ def parser(self): commenthddiv = soup.find("div",{"class":"zm-comment-hd"}) if (commenthddiv.contents[0].strip() == u"匿名用户"): - #print(u"user link is {0}, user id is {1}".format(None,u"匿名用户")) self.author = User(None, u"匿名用户") self.setFlag(commenthddiv.contents[1].string) else: apart = commenthddiv.find("a", {"class":"zg-link"}) if (apart is not None): - #print(u"user link is {0}, user id is {1}".format(apart['href'],apart.string)) self.author = User(apart['href'], apart.string) self.setFlag(apart.next_sibling.string) self.content = (" ".join(soup.find("div",{"class":"zm-comment-content"}).stripped_strings)) -# def __init__(self, comment_id, soup, author=None, question_author_flag=None, answer_author_flag=None, content=None): def __init__(self, comment_id, soup): self.comment_id = comment_id self.soup = soup - # print 'collection url',url - #if author != None: - # self.author = author - #if question_author_flag != None: - # self.question_author_flag = question_author_flag - #if answer_author_flag != None: - # self.creator = answer_author_flag - #if content != None: - # self.content = content self.question_author_flag = False self.answer_author_flag = False self.parser() @@ -1376,9 +1359,10 @@ def get_content(self): return content else: return content - + #是否提问者 def get_question_author_flag(self): return self.question_author_flag - + + #是否答案作者 def get_answer_author_flag(self): return self.answer_author_flag \ No newline at end of file From 4bcf04753983780cb1ef5305728c02d50b85684f Mon Sep 17 00:00:00 2001 From: Thinkerinhell Date: Fri, 6 May 2016 17:53:57 +1000 Subject: [PATCH 3/3] removed vcs.xml --- .idea/vcs.xml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file