diff --git a/.gitignore b/.gitignore index ded6067..9d4be04 100644 --- a/.gitignore +++ b/.gitignore @@ -1,36 +1,3 @@ *.py[cod] - -# C extensions -*.so - -# Packages -*.egg -*.egg-info -dist -build -eggs -parts -bin -var -sdist -develop-eggs -.installed.cfg -lib -lib64 __pycache__ - -# Installer logs -pip-log.txt - -# Unit test / coverage reports -.coverage -.tox -nosetests.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject +*.html diff --git a/examples/paycheckProcess.py b/examples/paycheckProcess.py index e8e1aa2..d83e7ea 100755 --- a/examples/paycheckProcess.py +++ b/examples/paycheckProcess.py @@ -1,118 +1,225 @@ +#!/usr/bin/env python2 from datetime import date, timedelta from dateutil.relativedelta import relativedelta from bs4 import BeautifulSoup import re from getpass import getpass - +import os import sys sys.path.append("../") from paycheckrecords import * def checkRowForAll(row): - for col in row.findAll('td'): - if "Federal Income Tax" in str(col): - return True - if "Social Security" in str(col): - return True - if "Medicare" in str(col): - return True - if "NY Income Tax" in str(col): - return True - if "Cell Phone" in str(col): - return True - if "Deductions" in str(col): - return True - if "Taxes" in str(col): - return True - - return False - + for col in row.findAll('td'): + if "Federal Income Tax" in str(col): + return True + if "Social Security" in str(col): + return True + if "Medicare" in str(col): + return True + if "NY Income Tax" in str(col): + return True + if "Cell Phone" in str(col): + return True + if "Deductions" in str(col): + return True + if "Taxes" in str(col): + return True + + return False + def blackOut(html): - soup = BeautifulSoup(html) - - #blackout net pay - tmp = soup.findAll('u') - for tag in tmp: - if "Net Pay" in str(tag.parent): - tag["style"] = "background-color:black; -webkit-print-color-adjust: exact;" - tableList = ["paystub_pay_tbl", "paystub_ee_taxes_tbl", "paystub_summary_tbl"] - - #black out all - for curTable in tableList: - tmpTable = soup.find("table", {"id": curTable}) - allrows = tmpTable.findAll('tr') - for row in allrows: - if checkRowForAll(row): - for col in row.findAll('td'): - if '.' in str(col): - col["style"] = "background-color:black; -webkit-print-color-adjust: exact;" - - - - #black out netthispay - elem = soup.find(text=re.compile('.*Net This Check:.*')) - elem = elem.findNext('td') - elem["style"] = "background-color:black; -webkit-print-color-adjust: exact;" - - #black out account - elem = soup.find(text=re.compile('.*Acct#.*')) - - nelem = elem.findNext('td') - nelem["style"] = "background-color:black; -webkit-print-color-adjust: exact;" - - contents = elem.string - contentsList = contents.split("#") - newcontent = contentsList[0] + "#" - contentsList = contentsList[1].split(":") - newcontent = newcontent + contentsList[0] + ":" + contentsList[1] - elem.replaceWith(newcontent) - - return str(soup.prettify(formatter=None)) + soup = BeautifulSoup(html, "lxml") + + #blackout net pay + tmp = soup.findAll('u') + for tag in tmp: + if "Net Pay" in str(tag.parent): + tag["style"] = "background-color:black; -webkit-print-color-adjust: exact;" + tableList = ["paystub_pay_tbl", "paystub_ee_taxes_tbl", "paystub_summary_tbl"] + + #black out all + for curTable in tableList: + tmpTable = soup.find("table", {"id": curTable}) + allrows = tmpTable.findAll('tr') + for row in allrows: + if checkRowForAll(row): + for col in row.findAll('td'): + if '.' in str(col): + col["style"] = "background-color:black; -webkit-print-color-adjust: exact;" + + + + #black out netthispay + elem = soup.find(text=re.compile('.*Net This Check:.*')) + elem = elem.findNext('td') + elem["style"] = "background-color:black; -webkit-print-color-adjust: exact;" + + #black out account + elem = soup.find(text=re.compile('.*Acct#.*')) + + nelem = elem.findNext('td') + nelem["style"] = "background-color:black; -webkit-print-color-adjust: exact;" + + contents = elem.string + contentsList = contents.split("#") + newcontent = contentsList[0] + "#" + contentsList = contentsList[1].split(":") + newcontent = newcontent + contentsList[0] + ":" + contentsList[1] + elem.replaceWith(newcontent) + + return str(soup.prettify(formatter=None)) + +def printSimpleSummary( stubs ): + gross = 0.0 + totalnet = 0.0 + + print("") + print("QUICK SUMMARY:") + print("") + + print("----------------------------------------------") + print(('{: <20} {: >12} {: >12}'.format( "Date", + "Total Pay", + "Net Pay" ))) + print("----------------------------------------------") + for stub in stubs: + print(('{: <20} {: >12} {: >12}'.format( stub.PayDate.strftime("%Y-%m-%d"), + stub.TotalPay, + stub.NetPay ))) + gross = gross + stub.TotalPay + totalnet = totalnet + stub.NetPay + + print("----------------------------------------------") + print(('{: <20} {: >12} {: >12}'.format( "", + str(gross), + str(totalnet) ))) + print("") + +def printDetailedSummary( stubs ): + summary = {} + for stub in stubs: + for f in stub.StubDetails: + if f['name'] in summary: + summary[f['name']]['hours'] += f['hours'] + summary[f['name']]['rate'] += f['rate'] + summary[f['name']]['current'] += f['current'] + else: + summary[f['name']] = { 'hours' : f['hours'], + 'rate' : f['rate'], + 'current' : f['current'] } + + print("") + print("DETAILED TOTALS:") + print("") + + print("-----------------------------------------------------------") + print(('{: <20} {: >12} {: >12} {: >12}'.format( "Field", + "Total Hours", + "Total Rate", + "Total" ))) + print("-----------------------------------------------------------") + for s in summary: + print(('{: <20} {: >12.2f} {: >12.2f} {: >12.2f}'.format( s, + summary[s]['hours'], + summary[s]['rate'], + summary[s]['current'] ))) + print("") + + +def savePayStubs( stubs, redact=False ): + for stub in stubs: + filename = "paystub-" + stub.PayDate.strftime("%Y-%m-%d") + + if os.path.isfile(filename + ".html"): + i = 1 + while os.path.isfile(filename + "_" + str(i) + ".html"): + i += 1 + if i == 100: + print("There seem to be a lot of duplicate files? Aborting.") + return -1 + filename += '_' + str(i) + + out = open(filename + ".html", "w") + out.write(stub.HTML) + out.close() + + if redact: + out = open(filename + "_redacted.html", "w") + out.write(blackOut(stub.HTML)) + out.close() + +def yesno( x ): + while True: + resp = input(x) + if( resp.lower() == 'y' ): + return True + elif( resp.lower() == 'n' ): + return False + else: + print(" Invalid response.") + +def get_date( x, fmt='%m/%d/%Y' ): + while True: + try: + #resp = eval(input(x)) or datetime.today().strftime(fmt) + resp = input(x) or datetime.today().strftime(fmt) + return datetime.strptime(resp, fmt) + except ValueError: + print(" Invalid date or date format provided.") def main(): - - _day = int(input("Day:")) - username = raw_input("Username:") - password = getpass("Password:") - - paycheckinst = paycheckrecords(username, password) - try: - - now = date.today() - - if now.day > _day: - startdate = now.replace(day=_day+1) - enddate = startdate + timedelta(days=32) - enddate = enddate.replace(day = _day) - - else: - - - enddate = now.replace(day=_day) - tmpdate = now.replace(day=1) - timedelta(days=1) - startdate = tmpdate.replace(day=_day+1) - - - - ret = paycheckinst.getPayStubsInRange(startdate, enddate) - gross = 0.0 - for stub in ret: - print "Date: ", stub.PayDate - print "Total Pay: ", stub.TotalPay - print "Net Pay: ", stub.NetPay - print "" - gross = gross + stub.TotalPay - filename = "paystub " + stub.PayDate.strftime("%m-%d-%Y") - out = open(filename + ".html", "w") - out.write(stub.HTML) - out.close() - - out = open(filename + "(blacked out).html", "w") - out.write(blackOut(stub.HTML)) - out.close() - print "Gross: " + str(gross) - finally: - paycheckinst.close() - + + print("") + print("Print a summary of all pay stubs between the given dates.") + print("Optionally save off the pay stubs and redacted pay stubs.") + print("") + + while True: + startdate = get_date("Start date (MM/DD/YYYY): ", '%m/%d/%Y') + enddate = get_date("End date (MM/DD/YYYY): ", '%m/%d/%Y') + if( startdate <= enddate ): + break + else: + print(" Invalid date range. Start date must be before or equal to end date.") + + savestubs = yesno("Save pay stubs? [Y/n] ") + if( savestubs ): + saveredacted = yesno("Save redacted pay stubs? [Y/n] ") + if( saveredacted ): + # Deleting the sensitive information is an exercise for the reader ... + print(" WARNING: redacted pay stubs are intended to be printed. Although") + print(" it is blacked out, the sensitive information is still") + print(" present in the document.") + saveredacted = yesno(" Do you acknowledge and accept the above warning? [Y/n] ") + + print("PaycheckRecords.com Credentials:") + + while True: + username = input(" Username: ") + if( username != "" ): + break + + while True: + password = getpass(" Password: ") + if( password != "" ): + break + + print("") + + paycheckinst = paycheckrecords(username, password) + + try: + stubs = paycheckinst.getPayStubsInRange(startdate, enddate) + + printSimpleSummary( stubs ) + printDetailedSummary( stubs ) + + if savestubs: + savePayStubs( stubs, saveredacted ) + + finally: + paycheckinst.close() + main() diff --git a/paycheckrecords/__init__.py b/paycheckrecords/__init__.py index 7884ee8..b847350 100644 --- a/paycheckrecords/__init__.py +++ b/paycheckrecords/__init__.py @@ -1,3 +1,2 @@ -import paystub -from paycheckrecords import * - +from . import paystub +from .paycheckrecords import * diff --git a/paycheckrecords/paycheckrecords.py b/paycheckrecords/paycheckrecords.py index 4722ec6..2aabc3b 100755 --- a/paycheckrecords/paycheckrecords.py +++ b/paycheckrecords/paycheckrecords.py @@ -1,123 +1,153 @@ from getpass import getpass import threading -import mechanize +import mechanicalsoup from bs4 import BeautifulSoup -from paystub import paystub +from .paystub import paystub from datetime import datetime from datetime import timedelta class paycheckrecords: - _br = mechanize.Browser() - _browserSem = threading.Semaphore() - _thread = None - _stop = False - _timer = None - _threadSleep = threading.Event() - - def __init__(self, username, password): - self._br.set_handle_robots(False) - self._br.open("https://www.paycheckrecords.com") - self._br.select_form(name="Login_Form") - - self._br.form["userStrId"] = username - self._br.form["password"] = password - - self._br.submit() - - self._thread = threading.Thread(target=self.preventTimeOut) - self._thread.start() - - def preventTimeOut(self): - while not self._stop: - self._browserSem.acquire() -# print "aquired lock" - url = self._br.geturl() - #print "url = ", url - self._br.open(url) -# print "refreshed" - self._browserSem.release() -# print "reload page from thread" - self._threadSleep.wait(30) -# print "awake" - self._threadSleep.clear() - - - - def getLatestPayStub(self): - self._browserSem.acquire() - originalurl = self._br.geturl() - paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp") - - ret = self._getPaystubsFromTable(paystubResponse.read(), range(1, 2)) - - self._br.open(originalurl) - self._browserSem.release() - return ret[0] - - def getPayStubsInRange(self, startDate, endDate, sequence = 0): - self._browserSem.acquire() - originalurl = self._br.geturl() - paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp") - self._br.select_form(name="dateSelect") - self._br.form["startDate"] = startDate.strftime("%m/%d/%Y") - self._br.form["endDate"] = endDate.strftime("%m/%d/%Y") - paystubResponse = self._br.submit() - ret = self._getPaystubsFromTable(paystubResponse.read(),sequence) - - self._br.open(originalurl) - self._browserSem.release() - return ret - - - - def _getPaystubsFromTable(self, html, sequence, GetHtml = True): - soup = BeautifulSoup(html) - PayStubTable = soup.find("table", { "class" : "report" }) - payrows = PayStubTable.findAll('tr') - headerCols = payrows[0].findAll('td') - ret = [] - i = 0 - DateIndex = -1 - NetIndex = -1 - TotalIndex = -1 - - for col in headerCols: - colName = col.string - if colName == u'Pay Date' and DateIndex == -1: - DateIndex = i - elif colName == u'Total Pay' and TotalIndex == -1: - TotalIndex = i - elif colName == u'Net Pay' and NetIndex == -1: - NetIndex = i - i = i + 1 - if sequence == 0: - sequence = range(1, len(payrows)) - for index in sequence: - paystubHtml = None - rowCols = payrows[index].findAll('td') - rowDate = rowCols[DateIndex].a.string.strip() - rowTotalPay = float(rowCols[TotalIndex].string.strip().strip("$")) - rowNetPay = float(rowCols[NetIndex].string.strip().strip("$")) - tmpDateTime = datetime.strptime(rowDate, '%m/%d/%Y') - if GetHtml: - paystubResponse = self._br.open(rowCols[DateIndex].a['href']) - paystubHtml = paystubResponse.read() - self._br.back() - tmpPayStub = paystub(tmpDateTime, rowTotalPay, rowNetPay, paystubHtml) - ret.append(tmpPayStub) - - return ret - - - - def close(self): - #print "Closing Instance" - self._stop = True - #print "_stop set" - self._threadSleep.set() - #print "_threadSleep set" - self._thread.join() - #print "thread joined" - self._br.close() - #print "Closing Done" \ No newline at end of file + _br = mechanicalsoup.StatefulBrowser() + _browserSem = threading.Semaphore() + _thread = None + _stop = False + _timer = None + _threadSleep = threading.Event() + + def __init__(self, username, password): + #self._br.set_handle_robots(False) + self._br.open("https://www.paycheckrecords.com") + self._br.select_form() + + self._br["userStrId"] = username + self._br["password"] = password + + self._br.submit_selected() + + self._thread = threading.Thread(target=self.preventTimeOut) + self._thread.start() + + def preventTimeOut(self): + while not self._stop: + self._browserSem.acquire() +# print "aquired lock" + url = self._br.get_url() + #print "url = ", url + self._br.open(url) +# print "refreshed" + self._browserSem.release() +# print "reload page from thread" + self._threadSleep.wait(30) +# print "awake" + self._threadSleep.clear() + + + + def getLatestPayStub(self): + self._browserSem.acquire() + originalurl = self._br.get_url() + paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp") + + ret = self._getPaystubsFromTable(paystubResponse.read(), list(range(1, 2))) + + self._br.open(originalurl) + self._browserSem.release() + return ret[0] + + def getPayStubsInRange(self, startDate, endDate, sequence = 0): + self._browserSem.acquire() + originalurl = self._br.get_url() + paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp") + self._br.select_form("#dateSelect") + self._br["startDate"] = startDate.strftime("%m/%d/%Y") + self._br["endDate"] = endDate.strftime("%m/%d/%Y") + paystubResponse = self._br.submit_selected() + ret = self._getPaystubsFromTable(paystubResponse.text,sequence) + + self._br.open(originalurl) + self._browserSem.release() + return ret + + def _getPayStubDetails(self, html): + soup = BeautifulSoup(html, "lxml") + details = soup.find_all("table", { "class" : [ "detailsWages", "detailsPart" ] }) + rv = [] + + # Paystub details seem to contain 4 elements, each consisting of one or more rows: + # [0] Pay (e.g. salary, bonus, ... ) + # [1] Deductions (e.g. 401k, healthcare, ... ) + # [2] Taxes (e.g. federal, state, SS, medicare, ... ) + # [3] Summary + for d in range( 0, len(details) ): + for r in details[d].find_all('tr')[1:]: + tds = r.find_all('td') + if( d == 0 ): # Pay field has extra elements: hours and rate + rv.append( { 'name' : tds[0].text.strip(), + 'hours' : float(tds[1].text.strip() or 0.0), + 'rate' : float(tds[2].text.strip() or 0.0), + 'current' : float(tds[3].text.strip()), + 'ytd' : float(tds[4].text.strip()) } ) + else: + rv.append( { 'name' : tds[0].text.strip(), + 'current' : float(tds[1].text.strip()), + 'ytd' : float(tds[2].text.strip()), + # Make post-processing easier + 'hours' : float(0.0), + 'rate' : float(0.0) } ) + + # List of dictionaries containing name/hours/rate/current/ytd + # information for each line-item of a paystub + return rv + + def _getPaystubsFromTable(self, html, sequence, GetHtml = True): + soup = BeautifulSoup(html, "lxml") + PayStubTable = soup.find("table", { "class" : "report" }) + payrows = PayStubTable.findAll('tr') + headerCols = payrows[0].findAll('td') + ret = [] + i = 0 + DateIndex = -1 + NetIndex = -1 + TotalIndex = -1 + + for col in headerCols: + colName = col.string + if colName == 'Pay Date' and DateIndex == -1: + DateIndex = i + elif colName == 'Total Pay' and TotalIndex == -1: + TotalIndex = i + elif colName == 'Net Pay' and NetIndex == -1: + NetIndex = i + i = i + 1 + if sequence == 0: + sequence = list(range(1, len(payrows))) + for index in sequence: + paystubHtml = None + rowCols = payrows[index].findAll('td') + rowDate = rowCols[DateIndex].a.string.strip() + rowTotalPay = float(rowCols[TotalIndex].string.strip().strip("$").translate(dict.fromkeys(list(map(ord,',')),None))) + rowNetPay = float(rowCols[NetIndex].string.strip().strip("$").translate(dict.fromkeys(list(map(ord,',')),None))) + tmpDateTime = datetime.strptime(rowDate, '%m/%d/%Y') + if GetHtml: + paystubResponse = self._br.open_relative(rowCols[DateIndex].a['href']) + paystubHtml = paystubResponse.text + stubDetails = self._getPayStubDetails(paystubHtml) + #self._br.back() + tmpPayStub = paystub(tmpDateTime, rowTotalPay, rowNetPay, stubDetails, paystubHtml) + ret.append(tmpPayStub) + + return ret + + + + def close(self): + #print "Closing Instance" + self._stop = True + #print "_stop set" + self._threadSleep.set() + #print "_threadSleep set" + self._thread.join() + #print "thread joined" + self._br.close() + #print "Closing Done" diff --git a/paycheckrecords/paystub.py b/paycheckrecords/paystub.py index de5ef7f..e1586b3 100755 --- a/paycheckrecords/paystub.py +++ b/paycheckrecords/paystub.py @@ -1,15 +1,16 @@ import datetime class paystub: - def __init__(self, payDate, TotalPay, NetPay, html = None): - if type(payDate) is not datetime and type(payDate) is not datetime.datetime: - raise ValueError("payDate is not a datetime object") - - if type(TotalPay) is not float: - raise ValueError("TotalPay needs to be a float") - if type(NetPay) is not float: - raise ValueError("NetPay needs to be a float") - - self.PayDate = payDate - self.TotalPay = TotalPay - self.NetPay = NetPay - self.HTML = html + def __init__(self, payDate, TotalPay, NetPay, stubDetails = None, html = None): + if type(payDate) is not datetime and type(payDate) is not datetime.datetime: + raise ValueError("payDate is not a datetime object") + + if type(TotalPay) is not float: + raise ValueError("TotalPay needs to be a float") + if type(NetPay) is not float: + raise ValueError("NetPay needs to be a float") + + self.PayDate = payDate + self.TotalPay = TotalPay + self.NetPay = NetPay + self.StubDetails = stubDetails + self.HTML = html