From 58c638f7af3e4aedb2bda998e7817d922d1b1286 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 16:46:03 -0400 Subject: [PATCH 01/15] move package to folder --- Email.py => emailanalysis/Email.py | 0 SenderMetadata.py => emailanalysis/SenderMetadata.py | 0 Utils.py => emailanalysis/Utils.py | 0 __init__.py => emailanalysis/__init__.py | 0 analyzer.py => emailanalysis/analyzer.py | 0 authenticator.py => emailanalysis/authenticator.py | 0 click_info.py => emailanalysis/click_info.py | 0 downloader.py => emailanalysis/downloader.py | 0 examples.py => emailanalysis/examples.py | 0 setup.py | 2 +- 10 files changed, 1 insertion(+), 1 deletion(-) rename Email.py => emailanalysis/Email.py (100%) rename SenderMetadata.py => emailanalysis/SenderMetadata.py (100%) rename Utils.py => emailanalysis/Utils.py (100%) rename __init__.py => emailanalysis/__init__.py (100%) rename analyzer.py => emailanalysis/analyzer.py (100%) rename authenticator.py => emailanalysis/authenticator.py (100%) rename click_info.py => emailanalysis/click_info.py (100%) rename downloader.py => emailanalysis/downloader.py (100%) rename examples.py => emailanalysis/examples.py (100%) diff --git a/Email.py b/emailanalysis/Email.py similarity index 100% rename from Email.py rename to emailanalysis/Email.py diff --git a/SenderMetadata.py b/emailanalysis/SenderMetadata.py similarity index 100% rename from SenderMetadata.py rename to emailanalysis/SenderMetadata.py diff --git a/Utils.py b/emailanalysis/Utils.py similarity index 100% rename from Utils.py rename to emailanalysis/Utils.py diff --git a/__init__.py b/emailanalysis/__init__.py similarity index 100% rename from __init__.py rename to emailanalysis/__init__.py diff --git a/analyzer.py b/emailanalysis/analyzer.py similarity index 100% rename from analyzer.py rename to emailanalysis/analyzer.py diff --git a/authenticator.py b/emailanalysis/authenticator.py similarity index 100% rename from authenticator.py rename to emailanalysis/authenticator.py diff --git a/click_info.py b/emailanalysis/click_info.py similarity index 100% rename from click_info.py rename to emailanalysis/click_info.py diff --git a/downloader.py b/emailanalysis/downloader.py similarity index 100% rename from downloader.py rename to emailanalysis/downloader.py diff --git a/examples.py b/emailanalysis/examples.py similarity index 100% rename from examples.py rename to emailanalysis/examples.py diff --git a/setup.py b/setup.py index 516f35a..41aecac 100644 --- a/setup.py +++ b/setup.py @@ -6,5 +6,5 @@ url='https://github.com/dmil/EmailAnalysis', author='Dhrumil Mehta', author_email='dhrumil.mehta@gmail.com', - packages=['.'] + packages=['emailanalysis'] ) \ No newline at end of file From a4170f0ba029f95383616b9073b8dcb930b20e0c Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 16:48:36 -0400 Subject: [PATCH 02/15] update __init__.py --- emailanalysis/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/emailanalysis/__init__.py b/emailanalysis/__init__.py index 34ac835..9519fcb 100644 --- a/emailanalysis/__init__.py +++ b/emailanalysis/__init__.py @@ -1,3 +1,4 @@ +import logging logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.DEBUG) From 9e9de5f6cc606c3a2161b21c2347c4f9a0697a85 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 16:58:29 -0400 Subject: [PATCH 03/15] move some requirements into setup.py --- Pipfile | 9 +-------- Pipfile.lock | 20 ++++++++------------ setup.py | 6 ++++-- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/Pipfile b/Pipfile index b1721fa..903a10e 100644 --- a/Pipfile +++ b/Pipfile @@ -8,19 +8,12 @@ pylint = "*" autopep8 = "*" [packages] -httplib2 = "*" -dateutils = "*" -blessings = "*" -html2text = "*" -peewee = "*" -authenticator = "*" -oauth2client = "*" jupyter = "*" notebook = "*" matplotlib = "*" scikit-learn = "*" mpld3 = "*" -google-api-python-client = "*" +emailanalysis = {editable = true,path = "."} [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 4bcb4da..c38fbfe 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "09eb02067814ce634746073b43d6a8389ee7a323d94713a5daafc4748aac1143" + "sha256": "4fbb7cb056811a24fb65f9c1791e8915bb7121caafc31d573ff59a4f26b030d1" }, "pipfile-spec": 6, "requires": { @@ -50,7 +50,6 @@ "sha256:30b7a84a6983fd9f4b7f91df835ae853e901d301a33a38958f69d9da3c0eba33", "sha256:9157175b6e104ee861df9c577d4f00953c37ead3f5ae145746ce4e470100aa3f" ], - "index": "pypi", "version": "==1.1.3" }, "backcall": { @@ -73,7 +72,6 @@ "sha256:b1fdd7e7a675295630f9ae71527a8ebc10bfefa236b3d6aa4932ee4462c17ba3", "sha256:caad5211e7ba5afe04367cdd4cfc68fa886e2e08f6f35e76b7387d2109ccea6e" ], - "index": "pypi", "version": "==1.7" }, "cachetools": { @@ -151,7 +149,6 @@ "hashes": [ "sha256:c94a8e77d743abac79ed91f99f5ef594a972a527e05145cbb7aba59beced8a71" ], - "index": "pypi", "version": "==0.6.6" }, "decorator": { @@ -168,6 +165,10 @@ ], "version": "==0.6.0" }, + "emailanalysis": { + "editable": true, + "path": "." + }, "entrypoints": { "hashes": [ "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", @@ -180,7 +181,6 @@ "sha256:06907006ed5ce831018f03af3852d739c0b2489cdacfda6971bcc2075c762858", "sha256:937eabdc3940977f712fa648a096a5142766b6d0a0f58bc603e2ac0687397ef0" ], - "index": "pypi", "version": "==1.7.8" }, "google-auth": { @@ -202,7 +202,6 @@ "sha256:490db40fe5b2cd79c461cf56be4d39eb8ca68191ae41ba3ba79f6cb05b7dd662", "sha256:627514fb30e7566b37be6900df26c2c78a030cc9e6211bda604d8181233bcdd4" ], - "index": "pypi", "version": "==2018.1.9" }, "httplib2": { @@ -210,7 +209,6 @@ "sha256:23914b5487dfe8ef09db6656d6d63afb0cf3054ad9ebc50868ddc8e166b5f8e8", "sha256:a18121c7c72a56689efbf1aef990139ad940fee1e64c6f2458831736cd593600" ], - "index": "pypi", "version": "==0.12.3" }, "ipykernel": { @@ -455,7 +453,6 @@ "sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac", "sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6" ], - "index": "pypi", "version": "==4.1.3" }, "pandocfilters": { @@ -475,7 +472,6 @@ "hashes": [ "sha256:f0249be468e3b119a8ad83f686e7fe161303197e0534e3cdff8fa5a5417c01a5" ], - "index": "pypi", "version": "==3.9.5" }, "pexpect": { @@ -537,10 +533,10 @@ }, "pygments": { "hashes": [ - "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", - "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" + "sha256:31cba6ffb739f099a85e243eff8cb717089fdd3c7300767d9fc34cb8e1b065f5", + "sha256:5ad302949b3c98dd73f8d9fcdc7e9cb592f120e32a18e23efd7f3dc51194472b" ], - "version": "==2.3.1" + "version": "==2.4.0" }, "pyparsing": { "hashes": [ diff --git a/setup.py b/setup.py index 41aecac..204b958 100644 --- a/setup.py +++ b/setup.py @@ -6,5 +6,7 @@ url='https://github.com/dmil/EmailAnalysis', author='Dhrumil Mehta', author_email='dhrumil.mehta@gmail.com', - packages=['emailanalysis'] - ) \ No newline at end of file + packages=['emailanalysis'], + install_requires=['httplib2','dateutils','blessings','html2text', + 'peewee','authenticator','oauth2client', 'google-api-python-client'] + ) From b46c25dc5f6dd6221ce720fcc5d307962da8084c Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 17:07:45 -0400 Subject: [PATCH 04/15] fix imports --- emailanalysis/Email.py | 4 ++-- emailanalysis/SenderMetadata.py | 3 +-- emailanalysis/analyzer.py | 4 ++-- emailanalysis/downloader.py | 7 +++---- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/emailanalysis/Email.py b/emailanalysis/Email.py index 4d7f6c9..1722879 100644 --- a/emailanalysis/Email.py +++ b/emailanalysis/Email.py @@ -1,8 +1,8 @@ import re from peewee import * -from Utils import logger -from SenderMetadata import SenderMetadata +from emailanalysis.utils import logger +from emailanalysis.SenderMetadata import SenderMetadata db = SqliteDatabase('emails.db') diff --git a/emailanalysis/SenderMetadata.py b/emailanalysis/SenderMetadata.py index 6c48259..1e0269e 100644 --- a/emailanalysis/SenderMetadata.py +++ b/emailanalysis/SenderMetadata.py @@ -2,8 +2,7 @@ from blessings import Terminal from peewee import * -from Utils import logger -from Utils import get_answer +from emailanalysis.utils import logger, get_answer t = Terminal() db = SqliteDatabase('emails.db') diff --git a/emailanalysis/analyzer.py b/emailanalysis/analyzer.py index ae94ebf..0aed84e 100644 --- a/emailanalysis/analyzer.py +++ b/emailanalysis/analyzer.py @@ -24,8 +24,8 @@ from sklearn.cross_validation import cross_val_score from peewee import * -from Email import Email -from SenderMetadata import SenderMetadata +from emailanalysis.Email import Email +from emailanalysis.SenderMetadata import SenderMetadata t = Terminal() rootdir = os.path.realpath(os.path.dirname(__file__)) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index ca9ce40..2510a6a 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -17,10 +17,9 @@ from dateutil.parser import parse from blessings import Terminal -from Utils import logger -from Utils import html_to_text -from Email import Email -from SenderMetadata import SenderMetadata +from emailanalysis.Utils import logger, html_to_text +from emailanalysis.Email import Email +from emailanalysis.SenderMetadata import SenderMetadata from peewee import * From 2bc6450ff587152d126ab3a5469a37e6ee228d25 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 17:09:40 -0400 Subject: [PATCH 05/15] fix import --- emailanalysis/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index 2510a6a..e723167 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -17,7 +17,7 @@ from dateutil.parser import parse from blessings import Terminal -from emailanalysis.Utils import logger, html_to_text +from emailanalysis.utils import logger, html_to_text from emailanalysis.Email import Email from emailanalysis.SenderMetadata import SenderMetadata From a0594c6da4959185f589b9be6a9b047d4cb99201 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 17:10:59 -0400 Subject: [PATCH 06/15] add testfile --- test.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..d26c801 --- /dev/null +++ b/test.py @@ -0,0 +1 @@ +from emailanalysis import Email \ No newline at end of file From 22bb6d3e8887e15e0c5861322035bde6eb77ad37 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 17:23:10 -0400 Subject: [PATCH 07/15] new setup.py --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 204b958..61e8b7c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from setuptools import setup setup(name='emailanalysis', - version='0.1', description='Analyze email d', url='https://github.com/dmil/EmailAnalysis', author='Dhrumil Mehta', From 4d180b3c8394960b928ca82c1b805000dbff382c Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 8 May 2019 17:30:56 -0400 Subject: [PATCH 08/15] Rename Utils.py to utils.py --- emailanalysis/{Utils.py => utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename emailanalysis/{Utils.py => utils.py} (100%) diff --git a/emailanalysis/Utils.py b/emailanalysis/utils.py similarity index 100% rename from emailanalysis/Utils.py rename to emailanalysis/utils.py From ac9ad9a4d0e76ef0cf80494e65e4b92fa60b2748 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Thu, 9 May 2019 11:49:09 -0400 Subject: [PATCH 09/15] remove unnecessary package --- emailanalysis/downloader.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index e723167..1459275 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -23,7 +23,7 @@ from peewee import * -import authenticator +import emailanalysis.authenticator gmail_service = authenticator.authenticate_gmail_service() t = Terminal() diff --git a/setup.py b/setup.py index 61e8b7c..a12d7bc 100644 --- a/setup.py +++ b/setup.py @@ -7,5 +7,5 @@ author_email='dhrumil.mehta@gmail.com', packages=['emailanalysis'], install_requires=['httplib2','dateutils','blessings','html2text', - 'peewee','authenticator','oauth2client', 'google-api-python-client'] + 'peewee','oauth2client', 'google-api-python-client'] ) From 66fe02da8266a78ed99a3ce980a1655aa50571b5 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Thu, 9 May 2019 11:53:05 -0400 Subject: [PATCH 10/15] update --- emailanalysis/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index 1459275..ff07c90 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -23,7 +23,7 @@ from peewee import * -import emailanalysis.authenticator +import emailanalysis.authenticator as authenticator gmail_service = authenticator.authenticate_gmail_service() t = Terminal() From df883b7a3db3b626ee7af6d048ec801e990652a1 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Thu, 9 May 2019 11:54:03 -0400 Subject: [PATCH 11/15] fix import --- emailanalysis/downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index ff07c90..83eab34 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -23,9 +23,9 @@ from peewee import * -import emailanalysis.authenticator as authenticator +from emailanalysis.authenticator import authenticate_gmail_service -gmail_service = authenticator.authenticate_gmail_service() +gmail_service = authenticate_gmail_service() t = Terminal() From f7da96dccc408eaedce6105731b1774f44652e3e Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 19 Jun 2019 17:26:58 -0400 Subject: [PATCH 12/15] remove un-necessary logger --- emailanalysis/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/emailanalysis/utils.py b/emailanalysis/utils.py index 5113b23..80d5598 100644 --- a/emailanalysis/utils.py +++ b/emailanalysis/utils.py @@ -3,7 +3,6 @@ import logging logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.DEBUG) From 3cb59d96389f3671a9d036502f2a4e6d213f11bd Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 19 Jun 2019 17:39:16 -0400 Subject: [PATCH 13/15] improve logging --- emailanalysis/downloader.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index 83eab34..de59295 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -11,10 +11,12 @@ import re import email import sys +import logging from pprint import pprint from collections import Counter from dateutil.parser import parse +from datetime import datetime from blessings import Terminal from emailanalysis.utils import logger, html_to_text @@ -28,6 +30,17 @@ gmail_service = authenticate_gmail_service() t = Terminal() +logfile_path = 'download.log' + +# Delete old logfile +if os.path.exists(logfile_path): + os.remove(logfile_path) + print(f"Deleted logfile '{logfile_path}'") + +# Setup logger to new file +file_handler = logging.FileHandler(logfile_path) +file_handler.setLevel(logging.DEBUG) +logger.addHandler(file_handler) def list_message_ids(): """ @@ -92,13 +105,14 @@ def get_text(email_object): # print "blah" print(content_type) - if msg.is_multipart() and content_type == 'multipart/mixed' or content_type == 'multipart/related': + if msg.is_multipart() and (content_type == 'multipart/mixed' or content_type == 'multipart/related'): text = "" for part in payload: - text += get_text(part) + text += get_text(part) + '\n' # Combine the text of each part separated by '\n' return text elif msg.is_multipart() and content_type == 'multipart/alternative': content_types = [x.get_content_type() for x in payload] + logger.debug(f"Detected {content_type} containing: {sorted(content_types)}") if sorted(content_types) == ['text/html']: html = payload[0] return parse_singlepart_text_message(html) @@ -188,6 +202,8 @@ def download_email(message_id): def download_all_to_database(): + logger.info(f"Starting download at {datetime.now()}") + # Delete 'emails.db' sqlite database if os.path.exists('emails.db'): os.remove('emails.db') @@ -208,7 +224,7 @@ def download_all_to_database(): print(t.red("Error downloading message: %s" % message_id)) print(t.red(str(e))) raise - print("") + logger.info("") if __name__ == '__main__': From 8848b82549e0a6f55de3ff085faa4dac65a1b81b Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 19 Jun 2019 17:45:02 -0400 Subject: [PATCH 14/15] improve error logging --- emailanalysis/downloader.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index de59295..9879588 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -102,9 +102,6 @@ def get_text(email_object): content_type = msg.get_content_type() payload = msg.get_payload() - # print "blah" - print(content_type) - if msg.is_multipart() and (content_type == 'multipart/mixed' or content_type == 'multipart/related'): text = "" for part in payload: @@ -150,26 +147,26 @@ def parse_message(gmail_message): message_id = gmail_message.get('id') if not message_id: - print(t.red("No message_id")) + logger.error(t.red("No message_id")) message_labels = gmail_message.get('labelIds') if not message_labels: - print(t.red("No message_labels")) + logger.error(t.red("No message_labels")) message_to = email_object['To'] if not message_to: - print(t.red("No message_to")) + logger.error(t.red("No message_to")) message_from = email_object['From'] if not message_from: - print(t.red("No message_from")) + logger.error(t.red("No message_from")) message_subject = email_object['Subject'] if not message_subject: - print(t.red("No message_subject")) + logger.error(t.red("No message_subject")) message_date = parse(email_object['date']) if not message_date: - print(t.red("No message_date")) + logger.error(t.red("No message_date")) text = get_text(email_object) if not text: - print(t.red("No text")) + logger.error(t.red("No text")) return { 'message_id': message_id, @@ -221,8 +218,8 @@ def download_all_to_database(): try: download_email(message_id) except Exception as e: - print(t.red("Error downloading message: %s" % message_id)) - print(t.red(str(e))) + logger.error(t.red("Error downloading message: %s" % message_id)) + logger.error(t.red(str(e))) raise logger.info("") @@ -234,6 +231,6 @@ def download_all_to_database(): # try: # download_email(message_id) # except Exception, e: - # print(t.red("FOUND ERROR ! %s" % message_id)) - # print(t.red( "Unexpected error: %s" % e )) + # logger.error(t.red("FOUND ERROR ! %s" % message_id)) + # logger.error(t.red( "Unexpected error: %s" % e )) # raise From f6131904b550feeea4ef22739e944cd50319bfe3 Mon Sep 17 00:00:00 2001 From: Dhrumil Mehta Date: Wed, 19 Jun 2019 17:46:08 -0400 Subject: [PATCH 15/15] change some logs to logger.warn --- emailanalysis/downloader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/emailanalysis/downloader.py b/emailanalysis/downloader.py index 9879588..77855ee 100644 --- a/emailanalysis/downloader.py +++ b/emailanalysis/downloader.py @@ -147,26 +147,26 @@ def parse_message(gmail_message): message_id = gmail_message.get('id') if not message_id: - logger.error(t.red("No message_id")) + logger.warn(t.red("No message_id")) message_labels = gmail_message.get('labelIds') if not message_labels: - logger.error(t.red("No message_labels")) + logger.warn(t.red("No message_labels")) message_to = email_object['To'] if not message_to: - logger.error(t.red("No message_to")) + logger.warn(t.red("No message_to")) message_from = email_object['From'] if not message_from: - logger.error(t.red("No message_from")) + logger.warn(t.red("No message_from")) message_subject = email_object['Subject'] if not message_subject: - logger.error(t.red("No message_subject")) + logger.warn(t.red("No message_subject")) message_date = parse(email_object['date']) if not message_date: - logger.error(t.red("No message_date")) + logger.warn(t.red("No message_date")) text = get_text(email_object) if not text: - logger.error(t.red("No text")) + logger.warn(t.red("No text")) return { 'message_id': message_id,