diff --git a/.gitignore b/.gitignore
index ded6067..9d4be04 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,36 +1,3 @@
*.py[cod]
-
-# C extensions
-*.so
-
-# Packages
-*.egg
-*.egg-info
-dist
-build
-eggs
-parts
-bin
-var
-sdist
-develop-eggs
-.installed.cfg
-lib
-lib64
__pycache__
-
-# Installer logs
-pip-log.txt
-
-# Unit test / coverage reports
-.coverage
-.tox
-nosetests.xml
-
-# Translations
-*.mo
-
-# Mr Developer
-.mr.developer.cfg
-.project
-.pydevproject
+*.html
diff --git a/examples/paycheckProcess.py b/examples/paycheckProcess.py
index e8e1aa2..d83e7ea 100755
--- a/examples/paycheckProcess.py
+++ b/examples/paycheckProcess.py
@@ -1,118 +1,225 @@
+#!/usr/bin/env python3
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
import re
from getpass import getpass
-
+import os
import sys
sys.path.append("../")
from paycheckrecords import *
def checkRowForAll(row):
- for col in row.findAll('td'):
- if "Federal Income Tax" in str(col):
- return True
- if "Social Security" in str(col):
- return True
- if "Medicare" in str(col):
- return True
- if "NY Income Tax" in str(col):
- return True
- if "Cell Phone" in str(col):
- return True
- if "Deductions" in str(col):
- return True
- if "Taxes" in str(col):
- return True
-
- return False
-
+    """Return True if any <td> in *row* contains one of the redaction labels."""
+    labels = ("Federal Income Tax", "Social Security", "Medicare",
+              "NY Income Tax", "Cell Phone", "Deductions", "Taxes")
+    for cell in row.findAll('td'):
+        # Match against the cell's full markup, exactly as str(col) did.
+        markup = str(cell)
+        if any(label in markup for label in labels):
+            return True
+    return False
+
def blackOut(html):
- soup = BeautifulSoup(html)
-
- #blackout net pay
- tmp = soup.findAll('u')
- for tag in tmp:
- if "Net Pay" in str(tag.parent):
- tag["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
- tableList = ["paystub_pay_tbl", "paystub_ee_taxes_tbl", "paystub_summary_tbl"]
-
- #black out all
- for curTable in tableList:
- tmpTable = soup.find("table", {"id": curTable})
- allrows = tmpTable.findAll('tr')
- for row in allrows:
- if checkRowForAll(row):
- for col in row.findAll('td'):
- if '.' in str(col):
- col["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
-
-
-
- #black out netthispay
- elem = soup.find(text=re.compile('.*Net This Check:.*'))
- elem = elem.findNext('td')
- elem["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
-
- #black out account
- elem = soup.find(text=re.compile('.*Acct#.*'))
-
- nelem = elem.findNext('td')
- nelem["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
-
- contents = elem.string
- contentsList = contents.split("#")
- newcontent = contentsList[0] + "#"
- contentsList = contentsList[1].split(":")
- newcontent = newcontent + contentsList[0] + ":" + contentsList[1]
- elem.replaceWith(newcontent)
-
- return str(soup.prettify(formatter=None))
+ """Return prettified pay-stub HTML with selected cells given a black background.
+
+ Only inline ``style`` attributes are added; the cell text itself stays in
+ the returned markup.
+ """
+ soup = BeautifulSoup(html, "lxml")
+
+ # Black out every <u> element whose parent's markup mentions "Net Pay".
+ tmp = soup.findAll('u')
+ for tag in tmp:
+ if "Net Pay" in str(tag.parent):
+ tag["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
+ # Ids of the tables whose rows are scanned below.
+ tableList = ["paystub_pay_tbl", "paystub_ee_taxes_tbl", "paystub_summary_tbl"]
+
+ # In rows flagged by checkRowForAll(), black out each cell whose markup
+ # contains a '.' (presumably the decimal point of an amount - TODO confirm).
+ # NOTE(review): soup.find() returns None when a table id is absent, which
+ # would make the findAll() call below raise AttributeError.
+ for curTable in tableList:
+ tmpTable = soup.find("table", {"id": curTable})
+ allrows = tmpTable.findAll('tr')
+ for row in allrows:
+ if checkRowForAll(row):
+ for col in row.findAll('td'):
+ if '.' in str(col):
+ col["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
+
+
+
+ # Black out the first <td> that follows the "Net This Check:" text node.
+ elem = soup.find(text=re.compile('.*Net This Check:.*'))
+ elem = elem.findNext('td')
+ elem["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
+
+ # Black out the first <td> that follows the "Acct#" text node.
+ elem = soup.find(text=re.compile('.*Acct#.*'))
+
+ nelem = elem.findNext('td')
+ nelem["style"] = "background-color:black; -webkit-print-color-adjust: exact;"
+
+ # Rebuild the "Acct#" text node from its pieces: the text before the first
+ # '#', then the first two ':'-separated fields after it. Anything beyond
+ # that is dropped. Raises IndexError if the text has no ':' after the '#'.
+ contents = elem.string
+ contentsList = contents.split("#")
+ newcontent = contentsList[0] + "#"
+ contentsList = contentsList[1].split(":")
+ newcontent = newcontent + contentsList[0] + ":" + contentsList[1]
+ elem.replaceWith(newcontent)
+
+ # formatter=None: presumably disables bs4's entity substitution on output
+ # - confirm against the Beautiful Soup documentation.
+ return str(soup.prettify(formatter=None))
+
+def printSimpleSummary( stubs ):
+    """Print one line per pay stub followed by a totals line.
+
+    Each stub must expose ``PayDate`` (with ``strftime``), ``TotalPay`` and
+    ``NetPay``. Amounts are printed with two decimals so that float
+    accumulation noise (e.g. ``1234.5600000000002``) never reaches the output.
+    """
+    rule = "----------------------------------------------"
+    header_row = '{: <20} {: >12} {: >12}'
+    amount_row = '{: <20} {: >12.2f} {: >12.2f}'
+
+    gross = 0.0
+    totalnet = 0.0
+
+    print("")
+    print("QUICK SUMMARY:")
+    print("")
+
+    print(rule)
+    print(header_row.format("Date", "Total Pay", "Net Pay"))
+    print(rule)
+    for stub in stubs:
+        print(amount_row.format(stub.PayDate.strftime("%Y-%m-%d"),
+                                stub.TotalPay,
+                                stub.NetPay))
+        gross += stub.TotalPay
+        totalnet += stub.NetPay
+
+    print(rule)
+    print(amount_row.format("", gross, totalnet))
+    print("")
+
+def printDetailedSummary( stubs ):
+    """Sum hours, rate and current amount per line-item name and print the totals.
+
+    Every stub's ``StubDetails`` is a list of dicts with at least the keys
+    ``name``, ``hours``, ``rate`` and ``current``.
+    """
+    fields = ('hours', 'rate', 'current')
+    totals = {}
+    for stub in stubs:
+        for item in stub.StubDetails:
+            name = item['name']
+            if name not in totals:
+                # First sighting: seed the bucket with this item's values.
+                totals[name] = {key: item[key] for key in fields}
+                continue
+            bucket = totals[name]
+            for key in fields:
+                bucket[key] += item[key]
+
+    rule = "-----------------------------------------------------------"
+    print("")
+    print("DETAILED TOTALS:")
+    print("")
+
+    print(rule)
+    print('{: <20} {: >12} {: >12} {: >12}'.format("Field",
+                                                   "Total Hours",
+                                                   "Total Rate",
+                                                   "Total"))
+    print(rule)
+    for name, bucket in totals.items():
+        print('{: <20} {: >12.2f} {: >12.2f} {: >12.2f}'.format(name,
+                                                                bucket['hours'],
+                                                                bucket['rate'],
+                                                                bucket['current']))
+    print("")
+
+
+def savePayStubs( stubs, redact=False ):
+    """Write each stub's HTML to ``paystub-YYYY-MM-DD.html`` in the current directory.
+
+    If that name is taken, ``_1``, ``_2``, ... is appended until a free name
+    is found. With ``redact`` true, a ``<name>_redacted.html`` produced by
+    blackOut() is written as well.
+
+    Returns -1 (after printing a message) when the numbered search reaches
+    100 without finding a free name; otherwise returns None.
+    """
+    for stub in stubs:
+        filename = "paystub-" + stub.PayDate.strftime("%Y-%m-%d")
+
+        if os.path.isfile(filename + ".html"):
+            i = 1
+            while os.path.isfile(filename + "_" + str(i) + ".html"):
+                i += 1
+                if i == 100:
+                    print("There seem to be a lot of duplicate files? Aborting.")
+                    return -1
+            filename += '_' + str(i)
+
+        # `with` closes the file even if the write raises.
+        with open(filename + ".html", "w") as out:
+            out.write(stub.HTML)
+
+        if redact:
+            # Redact before opening, so a failure in blackOut() does not
+            # leave an empty "_redacted" file behind.
+            redacted = blackOut(stub.HTML)
+            with open(filename + "_redacted.html", "w") as out:
+                out.write(redacted)
+
+def yesno( x ):
+    """Prompt with *x* until the reply is 'y' or 'n' (any case); return True for 'y'."""
+    answers = {'y': True, 'n': False}
+    while True:
+        resp = input(x).lower()
+        if resp in answers:
+            return answers[resp]
+        print(" Invalid response.")
+
+def get_date( x, fmt='%m/%d/%Y' ):
+    """Prompt with *x* until the reply parses under *fmt*; return it as a datetime.
+
+    An empty reply stands for today's date.
+    """
+    # Imported locally: the top of this script imports only `date` and
+    # `timedelta` from the datetime module.
+    from datetime import datetime
+
+    while True:
+        try:
+            resp = input(x) or datetime.today().strftime(fmt)
+            return datetime.strptime(resp, fmt)
+        except ValueError:
+            print(" Invalid date or date format provided.")
def main():
-
- _day = int(input("Day:"))
- username = raw_input("Username:")
- password = getpass("Password:")
-
- paycheckinst = paycheckrecords(username, password)
- try:
-
- now = date.today()
-
- if now.day > _day:
- startdate = now.replace(day=_day+1)
- enddate = startdate + timedelta(days=32)
- enddate = enddate.replace(day = _day)
-
- else:
-
-
- enddate = now.replace(day=_day)
- tmpdate = now.replace(day=1) - timedelta(days=1)
- startdate = tmpdate.replace(day=_day+1)
-
-
-
- ret = paycheckinst.getPayStubsInRange(startdate, enddate)
- gross = 0.0
- for stub in ret:
- print "Date: ", stub.PayDate
- print "Total Pay: ", stub.TotalPay
- print "Net Pay: ", stub.NetPay
- print ""
- gross = gross + stub.TotalPay
- filename = "paystub " + stub.PayDate.strftime("%m-%d-%Y")
- out = open(filename + ".html", "w")
- out.write(stub.HTML)
- out.close()
-
- out = open(filename + "(blacked out).html", "w")
- out.write(blackOut(stub.HTML))
- out.close()
- print "Gross: " + str(gross)
- finally:
- paycheckinst.close()
-
+
+    """Ask for a date range, save options and credentials, then print and optionally save the stubs."""
+    print("")
+    print("Print a summary of all pay stubs between the given dates.")
+    print("Optionally save off the pay stubs and redacted pay stubs.")
+    print("")
+
+    while True:
+        startdate = get_date("Start date (MM/DD/YYYY): ", '%m/%d/%Y')
+        enddate = get_date("End date (MM/DD/YYYY): ", '%m/%d/%Y')
+        if startdate <= enddate:
+            break
+        print(" Invalid date range. Start date must be before or equal to end date.")
+
+    # Defined up front so it exists even when the user declines to save stubs.
+    saveredacted = False
+
+    # The prompts say "[y/n]" rather than "[Y/n]": yesno() has no default
+    # answer and rejects an empty reply.
+    savestubs = yesno("Save pay stubs? [y/n] ")
+    if savestubs:
+        saveredacted = yesno("Save redacted pay stubs? [y/n] ")
+        if saveredacted:
+            # Deleting the sensitive information is an exercise for the reader ...
+            print(" WARNING: redacted pay stubs are intended to be printed. Although")
+            print(" it is blacked out, the sensitive information is still")
+            print(" present in the document.")
+            saveredacted = yesno(" Do you acknowledge and accept the above warning? [y/n] ")
+
+    print("PaycheckRecords.com Credentials:")
+
+    username = ""
+    while username == "":
+        username = input(" Username: ")
+
+    password = ""
+    while password == "":
+        password = getpass(" Password: ")
+
+    print("")
+
+    # Constructed outside the try: close() is only meaningful once the
+    # constructor has finished.
+    paycheckinst = paycheckrecords(username, password)
+
+    try:
+        stubs = paycheckinst.getPayStubsInRange(startdate, enddate)
+
+        printSimpleSummary( stubs )
+        printDetailedSummary( stubs )
+
+        if savestubs:
+            savePayStubs( stubs, saveredacted )
+
+    finally:
+        paycheckinst.close()
+
main()
diff --git a/paycheckrecords/__init__.py b/paycheckrecords/__init__.py
index 7884ee8..b847350 100644
--- a/paycheckrecords/__init__.py
+++ b/paycheckrecords/__init__.py
@@ -1,3 +1,2 @@
-import paystub
-from paycheckrecords import *
-
+from . import paystub
+from .paycheckrecords import *
diff --git a/paycheckrecords/paycheckrecords.py b/paycheckrecords/paycheckrecords.py
index 4722ec6..2aabc3b 100755
--- a/paycheckrecords/paycheckrecords.py
+++ b/paycheckrecords/paycheckrecords.py
@@ -1,123 +1,153 @@
from getpass import getpass
import threading
-import mechanize
+import mechanicalsoup
from bs4 import BeautifulSoup
-from paystub import paystub
+from .paystub import paystub
from datetime import datetime
from datetime import timedelta
class paycheckrecords:
- _br = mechanize.Browser()
- _browserSem = threading.Semaphore()
- _thread = None
- _stop = False
- _timer = None
- _threadSleep = threading.Event()
-
- def __init__(self, username, password):
- self._br.set_handle_robots(False)
- self._br.open("https://www.paycheckrecords.com")
- self._br.select_form(name="Login_Form")
-
- self._br.form["userStrId"] = username
- self._br.form["password"] = password
-
- self._br.submit()
-
- self._thread = threading.Thread(target=self.preventTimeOut)
- self._thread.start()
-
- def preventTimeOut(self):
- while not self._stop:
- self._browserSem.acquire()
-# print "aquired lock"
- url = self._br.geturl()
- #print "url = ", url
- self._br.open(url)
-# print "refreshed"
- self._browserSem.release()
-# print "reload page from thread"
- self._threadSleep.wait(30)
-# print "awake"
- self._threadSleep.clear()
-
-
-
- def getLatestPayStub(self):
- self._browserSem.acquire()
- originalurl = self._br.geturl()
- paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp")
-
- ret = self._getPaystubsFromTable(paystubResponse.read(), range(1, 2))
-
- self._br.open(originalurl)
- self._browserSem.release()
- return ret[0]
-
- def getPayStubsInRange(self, startDate, endDate, sequence = 0):
- self._browserSem.acquire()
- originalurl = self._br.geturl()
- paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp")
- self._br.select_form(name="dateSelect")
- self._br.form["startDate"] = startDate.strftime("%m/%d/%Y")
- self._br.form["endDate"] = endDate.strftime("%m/%d/%Y")
- paystubResponse = self._br.submit()
- ret = self._getPaystubsFromTable(paystubResponse.read(),sequence)
-
- self._br.open(originalurl)
- self._browserSem.release()
- return ret
-
-
-
- def _getPaystubsFromTable(self, html, sequence, GetHtml = True):
- soup = BeautifulSoup(html)
- PayStubTable = soup.find("table", { "class" : "report" })
- payrows = PayStubTable.findAll('tr')
- headerCols = payrows[0].findAll('td')
- ret = []
- i = 0
- DateIndex = -1
- NetIndex = -1
- TotalIndex = -1
-
- for col in headerCols:
- colName = col.string
- if colName == u'Pay Date' and DateIndex == -1:
- DateIndex = i
- elif colName == u'Total Pay' and TotalIndex == -1:
- TotalIndex = i
- elif colName == u'Net Pay' and NetIndex == -1:
- NetIndex = i
- i = i + 1
- if sequence == 0:
- sequence = range(1, len(payrows))
- for index in sequence:
- paystubHtml = None
- rowCols = payrows[index].findAll('td')
- rowDate = rowCols[DateIndex].a.string.strip()
- rowTotalPay = float(rowCols[TotalIndex].string.strip().strip("$"))
- rowNetPay = float(rowCols[NetIndex].string.strip().strip("$"))
- tmpDateTime = datetime.strptime(rowDate, '%m/%d/%Y')
- if GetHtml:
- paystubResponse = self._br.open(rowCols[DateIndex].a['href'])
- paystubHtml = paystubResponse.read()
- self._br.back()
- tmpPayStub = paystub(tmpDateTime, rowTotalPay, rowNetPay, paystubHtml)
- ret.append(tmpPayStub)
-
- return ret
-
-
-
- def close(self):
- #print "Closing Instance"
- self._stop = True
- #print "_stop set"
- self._threadSleep.set()
- #print "_threadSleep set"
- self._thread.join()
- #print "thread joined"
- self._br.close()
- #print "Closing Done"
\ No newline at end of file
+ # Class-level defaults. These objects are created once, when the class body
+ # runs, and are shared by every instance that does not rebind them.
+ # Browser used for all requests.
+ _br = mechanicalsoup.StatefulBrowser()
+ # Serialises use of the browser between callers and the keep-alive thread.
+ _browserSem = threading.Semaphore()
+ # Keep-alive thread; set in __init__.
+ _thread = None
+ # Set to True by close() to end the keep-alive loop.
+ _stop = False
+ # Not referenced anywhere else in the visible code.
+ _timer = None
+ # Lets close() wake the keep-alive thread out of its 30-second wait.
+ _threadSleep = threading.Event()
+
+    def __init__(self, username, password):
+        """Log in to paycheckrecords.com and start the keep-alive thread.
+
+        username, password: credentials entered into the ``userStrId`` and
+        ``password`` fields of the first form on the landing page.
+
+        NOTE(review): the response to the login submit is not inspected, so a
+        rejected login is not reported here.
+        """
+        # Give every instance its own browser and synchronisation objects.
+        # The class-level defaults are shared, so an instance created after
+        # another one's close() would otherwise reuse the browser that
+        # close() has already closed.
+        self._br = mechanicalsoup.StatefulBrowser()
+        self._browserSem = threading.Semaphore()
+        self._threadSleep = threading.Event()
+        self._stop = False
+
+        self._br.open("https://www.paycheckrecords.com")
+        self._br.select_form()
+
+        self._br["userStrId"] = username
+        self._br["password"] = password
+
+        self._br.submit_selected()
+
+        self._thread = threading.Thread(target=self.preventTimeOut)
+        self._thread.start()
+
+    def preventTimeOut(self):
+        """Keep-alive loop for the background thread.
+
+        Re-opens the browser's current URL, then waits up to 30 seconds (or
+        until ``_threadSleep`` is set), repeating until ``_stop`` is true.
+        """
+        while not self._stop:
+            # `with` releases the semaphore even if the reload raises;
+            # otherwise every later acquire() would block forever.
+            with self._browserSem:
+                self._br.open(self._br.get_url())
+            self._threadSleep.wait(30)
+            self._threadSleep.clear()
+
+
+
+    def getLatestPayStub(self):
+        """Return the stub parsed from the first data row of the paychecks page.
+
+        Raises IndexError if that page yields no stub.
+        """
+        # `with` releases the semaphore even if fetching or parsing raises, so
+        # the keep-alive thread (and therefore close()) cannot be left blocked.
+        with self._browserSem:
+            originalurl = self._br.get_url()
+            paystubResponse = self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp")
+
+            # Use `.text`, as getPayStubsInRange() and _getPaystubsFromTable()
+            # do with their responses; `.read()` is not used anywhere else.
+            ret = self._getPaystubsFromTable(paystubResponse.text, [1])
+
+            self._br.open(originalurl)
+        return ret[0]
+
+    def getPayStubsInRange(self, startDate, endDate, sequence = 0):
+        """Return the pay stubs listed for the given date range.
+
+        startDate, endDate: objects with ``strftime``; sent as MM/DD/YYYY in
+        the ``#dateSelect`` form.
+        sequence: row indexes of the result table to parse; 0 (the default)
+        means every data row.
+        """
+        # `with` releases the semaphore even if the request or the parsing
+        # raises; otherwise the keep-alive thread would block on acquire()
+        # and close() would hang in join().
+        with self._browserSem:
+            originalurl = self._br.get_url()
+            self._br.open("https://www.paycheckrecords.com/in/paychecks.jsp")
+            self._br.select_form("#dateSelect")
+            self._br["startDate"] = startDate.strftime("%m/%d/%Y")
+            self._br["endDate"] = endDate.strftime("%m/%d/%Y")
+            paystubResponse = self._br.submit_selected()
+            ret = self._getPaystubsFromTable(paystubResponse.text, sequence)
+
+            self._br.open(originalurl)
+        return ret
+
+    def _getPayStubDetails(self, html):
+        """Parse the line items of one pay-stub page.
+
+        Returns a list of dicts with the keys ``name``, ``hours``, ``rate``,
+        ``current`` and ``ytd``, one per table row after each table's first
+        row. ``hours`` and ``rate`` are 0.0 for rows outside the first table.
+        """
+        def amount(cell):
+            # Same clean-up _getPaystubsFromTable() applies to its amounts:
+            # drop "$" and thousands separators. A blank cell counts as 0.0.
+            text = cell.text.strip().strip("$").replace(",", "")
+            return float(text) if text else 0.0
+
+        soup = BeautifulSoup(html, "lxml")
+        details = soup.find_all("table", { "class" : [ "detailsWages", "detailsPart" ] })
+        rv = []
+
+        # Paystub details seem to contain 4 elements, each consisting of one or more rows:
+        #   [0] Pay (e.g. salary, bonus, ... )
+        #   [1] Deductions (e.g. 401k, healthcare, ... )
+        #   [2] Taxes (e.g. federal, state, SS, medicare, ... )
+        #   [3] Summary
+        for position, table in enumerate(details):
+            for row in table.find_all('tr')[1:]:
+                tds = row.find_all('td')
+                if position == 0:  # Pay field has extra elements: hours and rate
+                    rv.append( { 'name' : tds[0].text.strip(),
+                                 'hours' : amount(tds[1]),
+                                 'rate' : amount(tds[2]),
+                                 'current' : amount(tds[3]),
+                                 'ytd' : amount(tds[4]) } )
+                else:
+                    rv.append( { 'name' : tds[0].text.strip(),
+                                 'current' : amount(tds[1]),
+                                 'ytd' : amount(tds[2]),
+                                 # Make post-processing easier
+                                 'hours' : 0.0,
+                                 'rate' : 0.0 } )
+
+        # List of dictionaries containing name/hours/rate/current/ytd
+        # information for each line-item of a paystub
+        return rv
+
+    def _getPaystubsFromTable(self, html, sequence, GetHtml = True):
+        """Build paystub objects from the ``report`` table in *html*.
+
+        sequence: iterable of row indexes to parse (row 0 is the header);
+        0 means every row after the header.
+        GetHtml: when true, each stub's own page is fetched and its line
+        items parsed; when false the stub gets ``None`` for both.
+
+        NOTE(review): soup.find() returns None when no ``report`` table is
+        present, in which case the findAll() call raises AttributeError.
+        """
+        soup = BeautifulSoup(html, "lxml")
+        PayStubTable = soup.find("table", { "class" : "report" })
+        payrows = PayStubTable.findAll('tr')
+        headerCols = payrows[0].findAll('td')
+
+        # Locate the columns by header text; the first match of each wins.
+        DateIndex = -1
+        NetIndex = -1
+        TotalIndex = -1
+        for i, col in enumerate(headerCols):
+            colName = col.string
+            if colName == 'Pay Date' and DateIndex == -1:
+                DateIndex = i
+            elif colName == 'Total Pay' and TotalIndex == -1:
+                TotalIndex = i
+            elif colName == 'Net Pay' and NetIndex == -1:
+                NetIndex = i
+
+        if sequence == 0:
+            sequence = range(1, len(payrows))
+
+        ret = []
+        for index in sequence:
+            rowCols = payrows[index].findAll('td')
+            rowDate = rowCols[DateIndex].a.string.strip()
+            # Amounts look like "$1,234.56": drop the "$" and the commas.
+            rowTotalPay = float(rowCols[TotalIndex].string.strip().strip("$").replace(",", ""))
+            rowNetPay = float(rowCols[NetIndex].string.strip().strip("$").replace(",", ""))
+            tmpDateTime = datetime.strptime(rowDate, '%m/%d/%Y')
+
+            # Both default to None so the constructor call below also works
+            # when GetHtml is false (stubDetails used to be unbound then).
+            paystubHtml = None
+            stubDetails = None
+            if GetHtml:
+                paystubResponse = self._br.open_relative(rowCols[DateIndex].a['href'])
+                paystubHtml = paystubResponse.text
+                stubDetails = self._getPayStubDetails(paystubHtml)
+
+            ret.append(paystub(tmpDateTime, rowTotalPay, rowNetPay, stubDetails, paystubHtml))
+
+        return ret
+
+
+
+ def close(self):
+ """Stop the keep-alive thread, wait for it to finish, then close the browser."""
+ # Tell the keep-alive loop to exit at its next check.
+ self._stop = True
+ # Wake the thread if it is in its 30-second wait.
+ self._threadSleep.set()
+ # Block until the keep-alive thread has ended.
+ self._thread.join()
+ self._br.close()
diff --git a/paycheckrecords/paystub.py b/paycheckrecords/paystub.py
index de5ef7f..e1586b3 100755
--- a/paycheckrecords/paystub.py
+++ b/paycheckrecords/paystub.py
@@ -1,15 +1,16 @@
import datetime
class paystub:
- def __init__(self, payDate, TotalPay, NetPay, html = None):
- if type(payDate) is not datetime and type(payDate) is not datetime.datetime:
- raise ValueError("payDate is not a datetime object")
-
- if type(TotalPay) is not float:
- raise ValueError("TotalPay needs to be a float")
- if type(NetPay) is not float:
- raise ValueError("NetPay needs to be a float")
-
- self.PayDate = payDate
- self.TotalPay = TotalPay
- self.NetPay = NetPay
- self.HTML = html
+    def __init__(self, payDate, TotalPay, NetPay, stubDetails = None, html = None):
+        """Store the data of one pay stub.
+
+        payDate: a ``datetime.datetime`` (subclasses accepted).
+        TotalPay, NetPay: floats.
+        stubDetails: optional parsed line items; stored as given.
+        html: optional raw HTML of the stub page; stored as given.
+
+        Raises ValueError if payDate, TotalPay or NetPay has the wrong type.
+        """
+        # This module does `import datetime`, so the class is
+        # `datetime.datetime`. The old extra comparison against the module
+        # itself could never match and has been dropped.
+        if not isinstance(payDate, datetime.datetime):
+            raise ValueError("payDate is not a datetime object")
+
+        if not isinstance(TotalPay, float):
+            raise ValueError("TotalPay needs to be a float")
+        if not isinstance(NetPay, float):
+            raise ValueError("NetPay needs to be a float")
+
+        self.PayDate = payDate
+        self.TotalPay = TotalPay
+        self.NetPay = NetPay
+        self.StubDetails = stubDetails
+        self.HTML = html