From b91527def98256c1a13c6a197cc0950484d8be66 Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 14:31:03 -0700 Subject: [PATCH 1/6] Removed trailing whitespace. --- iocp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/iocp.py b/iocp.py index 9968cc2..54d3317 100755 --- a/iocp.py +++ b/iocp.py @@ -190,7 +190,7 @@ def parse_pdf_pypdf2(self, f, fpath): def parse_pdf_pdfminer(self, f, fpath): try: laparams = LAParams() - laparams.all_texts = True + laparams.all_texts = True rsrcmgr = PDFResourceManager() pagenos = set() @@ -223,7 +223,7 @@ def parse_pdf(self, f, fpath): except AttributeError: e = 'Selected PDF parser library is not supported: %s' % (self.library) raise NotImplementedError(e) - + self.parser_func(f, fpath) def parse_txt(self, f, fpath): @@ -244,7 +244,7 @@ def parse_html(self, f, fpath): try: if self.dedup: self.dedup_store = set() - + data = f.read() soup = BeautifulSoup(data) html = soup.findAll(text=True) From 3f2304e47205922cb88c5dea14364e6ee6dc6ada Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 14:31:47 -0700 Subject: [PATCH 2/6] Changed tab into space. Fixed TabError. 
--- whitelist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whitelist.py b/whitelist.py index 5f12d31..bf30aec 100644 --- a/whitelist.py +++ b/whitelist.py @@ -4,9 +4,9 @@ class WhiteList(dict): def __init__(self, basedir): - searchdir = os.path.join(basedir, "whitelists/whitelist_*.ini") + searchdir = os.path.join(basedir, "whitelists/whitelist_*.ini") fpaths = glob.glob(searchdir) for fpath in fpaths: t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1] patterns = [line.strip() for line in open(fpath)] - self[t] = [re.compile(p) for p in patterns] \ No newline at end of file + self[t] = [re.compile(p) for p in patterns] From 8665424c9a5573533b0f15204695178283f8901a Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 14:33:18 -0700 Subject: [PATCH 3/6] Removed trailing whitespace. --- output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/output.py b/output.py index d71c92c..288198b 100644 --- a/output.py +++ b/output.py @@ -70,7 +70,7 @@ def print_match(self, fpath, page, name, match): self.cnt[name] += 1 else: self.cnt[name] = 1 - + string_id = "$%s%d" % (name, self.cnt[name]) self.sids.append(string_id) string_value = match.replace('\\', '\\\\') @@ -92,7 +92,7 @@ def print_footer(self, fpath): print("\tcondition:") print("\t\t" + cond) print("}") - + class OutputHandler_netflow(OutputHandler): def __init__(self): print "host 255.255.255.255" From 8bb89826213cac8503a8f6d1c329d3eeac45ca02 Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 15:02:41 -0700 Subject: [PATCH 4/6] Modified StringIO import statement to support Python3 as the default. Python2 syntax is now the fallback. 
--- iocp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/iocp.py b/iocp.py index 54d3317..aa92ae6 100755 --- a/iocp.py +++ b/iocp.py @@ -40,7 +40,10 @@ import fnmatch import argparse import re -from StringIO import StringIO +try: + from io import StringIO +except: + from StringIO import StringIO try: import configparser as ConfigParser except ImportError: @@ -304,7 +307,7 @@ def parse(self, path): argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)') argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') - argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') + argparser.add_argument('-l', dest='LIB', default='pypdf2', help='PDF parsing library (pypdf2/pdfminer)') args = argparser.parse_args() parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT) From aece74a7e427719622a22e48e1f6038f36ecf312 Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 15:04:33 -0700 Subject: [PATCH 5/6] Added missing parentheses on print statements. 
--- output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/output.py b/output.py index 288198b..af485d6 100644 --- a/output.py +++ b/output.py @@ -95,7 +95,7 @@ def print_footer(self, fpath): class OutputHandler_netflow(OutputHandler): def __init__(self): - print "host 255.255.255.255" + print("host 255.255.255.255") def print_match(self, fpath, page, name, match): data = { @@ -103,4 +103,4 @@ def print_match(self, fpath, page, name, match): 'match': match } if data["type"] == "IP": - print " or host %s " % data["match"] + print(" or host %s " % data["match"]) From 8dc8e7b801d8066b1db19d3548688990447f6ac0 Mon Sep 17 00:00:00 2001 From: Scot Matson Date: Wed, 16 Mar 2016 16:24:21 -0700 Subject: [PATCH 6/6] Removed unicode encoding from the parse_html function. HTML files still parse properly from what I can tell, and ioc_parser now fully supports Python2 and Python3 - at least for HTML. --- iocp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/iocp.py b/iocp.py index aa92ae6..c2db9fe 100755 --- a/iocp.py +++ b/iocp.py @@ -35,6 +35,7 @@ # ################################################################################################### +#from __future__ import unicode_literals import os import sys import fnmatch @@ -252,14 +253,14 @@ def parse_html(self, f, fpath): soup = BeautifulSoup(data) html = soup.findAll(text=True) - text = u'' + text = '' for elem in html: if elem.parent.name in ['style', 'script', '[document]', 'head', 'title']: continue - elif re.match('', unicode(elem)): + elif re.match('', elem): continue else: - text += unicode(elem) + text += elem self.handler.print_header(fpath) self.parse_page(fpath, text, 1)