183 changes: 58 additions & 125 deletions stackoverflow_users_taginfo.py
@@ -1,8 +1,11 @@
from bs4 import BeautifulSoup
import numpy as np
import requests
from datetime import datetime
import itertools
import re
import requests
import sys
import time
from bs4 import BeautifulSoup
import numpy as np
from wordcloud import WordCloud


@@ -33,80 +36,7 @@ def toint(a):
return int(a)


def info_mainpage(url):
""" Given the main tag page, this function gets basic information about tag
pages and tag names and their scores as well.
On the basic info, there are three numbers scraped :
1. Number of tag pages
2. Total number of tags
3. Number of tags per page

Parameters
----------
url : string
URL link to user's main tag page.

Output
------
pginfo : dict
Dictionary that holds the three page info as listed earlier.

name : list of strings
Holds the tag names

count : list of ints
Holds the tag scores
"""

soup = BeautifulSoup(requests.get(url).text, "lxml")

pg_blk = soup.find_all("div", class_="pager fr")
tag_blk = soup.find_all("div", class_="answer-votes")
tag_blk_str = [str(i) for i in tag_blk]
str0 = find_between(str(soup.find_all("span", class_="count")))
lim_ntags = int(str0.replace(',', ''))

max_tags = None
num_pages = 1
if len(pg_blk) != 0:
max_tags = len(tag_blk)
last_page_blk = pg_blk[0].find_all("span", class_="page-numbers")[-2]
num_pages = int(find_between(str(last_page_blk)))
pginfo = {'pages': num_pages, 'tags': lim_ntags, 'tags_perpage': max_tags}

name = [unquote_str(find_between(i, '[', ']')) for i in tag_blk_str]
count = [toint(find_between(i)) for i in tag_blk_str]
return pginfo, name, count


def stackoverflow_taginfo(url):
""" Get information about an user's tags from their Stack Overflow
tag pages fed as the input URL. Mainly two pieces of information are
scraped : tag names and their respective counts/scores.

Parameters
----------
url : string
URL link to user's main tag page.

Output
------
name : list of strings
Holds the tag names

count : list of ints
Holds the tag scores
"""

soup = BeautifulSoup(requests.get(url).text, "lxml")
tag_blk = soup.find_all("div", class_="answer-votes")
tag_blk_str = [str(i) for i in tag_blk]
name = [unquote_str(find_between(i, '[', ']')) for i in tag_blk_str]
count = [toint(find_between(i)) for i in tag_blk_str]
return name, count


def taginfo(link, lim_num_tags=None, return_sort=True, print_page_count=False):
def taginfo(link, lim_num_tags=None, print_page_count=False, request_frequency=1.5):
""" Get information about Stack Overflow and all Stack Exchange sites users'
tags (tags and corresponding tag points scored).
This could be directly used with wordcloud module for generating tag cloud.
@@ -135,17 +65,17 @@ def taginfo(link, lim_num_tags=None, return_sort=True, print_page_count=False):
Number of tags to be tracked. Default is None, which tracks all tags
possible.

return_sort : bool (default=True)
This boolean flag decides whether the output list has the tags
sorted by their counts. Since WordCloud module internally sorts
them anyway, so for performance one can turn it off.

print_page_count : bool (default=False)
Print per page progress on processing data.

request_frequency : float (default=1.5)
Number of seconds to wait before making another request. Making
requests too often will lead to throttling and eventually timeout
errors.

Output
------
Output is a dictionary with items for tag names and keys for tag count.
Output is a dictionary with tag names for keys and tag count for values.
"""

# Get start link (profile page's tag link)
@@ -155,50 +85,55 @@ def taginfo(link, lim_num_tags=None, return_sort=True, print_page_count=False):
start_link = "http://stackoverflow.com/users/" + str(link) + \
"?tab=tags&sort=votes&page="

tag_name = []
tag_count = []

if print_page_count:
print("Processing page : 1/NA")

info1 = info_mainpage(start_link + '1')
num_tags = info1[0]['tags']
tag_name.append(info1[1])
tag_count.append(info1[2])
tags_per_page = len(info1[1])

if lim_num_tags is None:
num_tags = info1[0]['tags']
else:
num_tags = min(lim_num_tags, info1[0]['tags'])
num_pages = int(np.ceil(num_tags/tags_per_page))

print('tags_per_page : '+str(tags_per_page))
print('num_tags : '+str(num_tags))
print('num_pages : '+str(num_pages))

if num_pages > 1:
num_pages = int(np.ceil(lim_num_tags/float(tags_per_page)))
for page_id in range(2, num_pages+1):
if print_page_count:
print("Processing page : " + str(page_id) + "/" + str(num_pages))

url = start_link + str(page_id)
page_tag_name, page_tag_count = stackoverflow_taginfo(url)
tag_name.append(page_tag_name)
tag_count.append(page_tag_count)

info0 = list(zip(itertools.chain(*tag_name), itertools.chain(*tag_count)))
sorted_indx = np.argsort([item[1] for item in info0])[::-1]
info = [info0[idx] for idx in sorted_indx][:lim_num_tags]
# regex pattern to match title of tag divs
tag_patt = re.compile(r'\d+ non-wiki questions \(\d+ score\)\. \d+ non-wiki answers \(\d+ score\)\.')
# pattern to match score inside the title
score_patt = re.compile(r'non-wiki answers \((\d+) score\)')

info = {}
last_get = None
for page in itertools.count(1):
if print_page_count:
print("Processing page: ", page)

if last_get is None:
last_get = datetime.now()
else:
delta = datetime.now() - last_get
wait_needed = request_frequency - delta.total_seconds()
if wait_needed > 0:
time.sleep(wait_needed)
last_get = datetime.now()

resp = requests.get(start_link + str(page))
soup = BeautifulSoup(resp.text, 'lxml')
tag_divs = soup.find_all('div', title=tag_patt)
if not tag_divs:
# we're out of tags
break

if print_page_count:
print(len(tag_divs), 'tags found on page')

for tag_div in tag_divs:
tag_score = int(score_patt.search(tag_div.get('title')).group(1))
tag_name = tag_div.find('a', class_='post-tag').text
info[tag_name] = tag_score

if len(info) == lim_num_tags:
# we're done
break
if len(info) == lim_num_tags:
# break out of pagination loop
break

# For a case when all tag counts are zero, it would throw an error.
# So, for such a case, escape it by setting all counts to "1".
dict_info = dict(info)
if info[0][1] == 0:
dict_info = dict.fromkeys(dict_info, 1)
if info and max(info.values()) == 0:
info = dict.fromkeys(info, 1)

return info

return dict_info
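For reference, here is a minimal sketch of the two techniques the rewritten taginfo() relies on: throttling requests based on the elapsed time since the last GET, and pulling each tag's score out of the div title with a regex like score_patt. The helper name fetch_page_scores and its defaults are illustrative only, not part of this PR, and it assumes the "non-wiki answers (N score)" title format targeted above stays stable.

import re
import time

import requests
from bs4 import BeautifulSoup

SCORE_PATT = re.compile(r'non-wiki answers \((\d+) score\)')


def fetch_page_scores(user_id, page, min_interval=1.5):
    """Fetch one of a user's tag pages and return {tag_name: score}."""
    # Simple throttle: sleep a fixed interval before every request. The PR is
    # slightly smarter and only sleeps off whatever time is still owed since
    # the previous request.
    time.sleep(min_interval)
    url = ("http://stackoverflow.com/users/{}?tab=tags&sort=votes&page={}"
           .format(user_id, page))
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    scores = {}
    # Matching on the title attribute mirrors tag_patt/score_patt above.
    for div in soup.find_all("div", title=SCORE_PATT):
        name = div.find("a", class_="post-tag").text
        scores[name] = int(SCORE_PATT.search(div["title"]).group(1))
    return scores  # an empty dict means the page number ran past the last page

Iterating page 1, 2, 3, ... and stopping on an empty result is the same pagination strategy as the new loop above, so the number of pages never has to be computed up front.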

def draw_taginfo(info,
image_dims,
@@ -222,7 +157,6 @@ def draw_taginfo(info,
WC.to_image().save(out_filepath)
print("Tag Cloud Saved as " + out_filepath)

return
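Most of draw_taginfo's body is collapsed in this diff, so here is a minimal sketch of how a {tag: score} dict becomes an image with the wordcloud package. save_tag_cloud, its defaults, and the kwargs passed to WordCloud are illustrative, not the PR's actual signature.

from wordcloud import WordCloud


def save_tag_cloud(info, image_dims=(800, 400), out_filepath="tag_cloud.png"):
    """Render a {tag: score} dict to an image file."""
    # WordCloud scales and sorts the frequencies internally, which is why the
    # old return_sort flag on taginfo() could be dropped in this PR.
    wc = WordCloud(width=image_dims[0], height=image_dims[1])
    # Recent wordcloud releases accept a plain dict of frequencies here.
    wc.generate_from_frequencies(info)
    wc.to_image().save(out_filepath)

The all-zeros guard at the end of taginfo() exists for the same reason given in the diff's own comment: wordcloud errors out on a frequency table whose values are all zero.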

def tag_cloud(link=22656,
lim_num_tags=200,
@@ -251,4 +185,3 @@ def tag_cloud(link=22656,

info = taginfo(link=link, lim_num_tags=lim_num_tags)
draw_taginfo(info, image_dims=image_dims, out_filepath=out_filepath, skip_tags = skip_tags)
return
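Finally, a small usage sketch of the updated API, based only on the signatures visible in this diff. The module name in the import and the example scores are illustrative, and image_dims, out_filepath and skip_tags are left at whatever defaults the collapsed portion of tag_cloud() defines.

# Hypothetical driver script, assuming the file is importable as
# stackoverflow_users_taginfo.
import stackoverflow_users_taginfo as taglib

# taginfo() now returns a plain {tag_name: score} dict,
# e.g. {'c#': 3400, 'java': 1200, ...} (values illustrative).
info = taglib.taginfo(link=22656, lim_num_tags=50,
                      print_page_count=True, request_frequency=1.5)

# Or do the scrape and the rendering in one call, using the defaults
# visible at the top of tag_cloud() above.
taglib.tag_cloud(link=22656, lim_num_tags=200)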