diff --git a/docxpy/__init__.py b/docxpy/__init__.py index f3cfa0d..130068a 100644 --- a/docxpy/__init__.py +++ b/docxpy/__init__.py @@ -2,4 +2,4 @@ from .docxreader import process_args from .docxreader import DOCReader -VERSION = '0.8.5' +VERSION = "0.8.5" diff --git a/docxpy/docxreader.py b/docxpy/docxreader.py index 3eac2e0..97b5a80 100755 --- a/docxpy/docxreader.py +++ b/docxpy/docxreader.py @@ -9,17 +9,20 @@ def process_args(): - parser = argparse.ArgumentParser(description='A pure python-based utility ' - 'to extract text and images ' - 'from docx files.') + parser = argparse.ArgumentParser( + description="A pure python-based utility " + "to extract text and images " + "from docx files." + ) parser.add_argument("docx", help="path of the docx file") - parser.add_argument('-i', '--img_dir', help='path of directory ' - 'to extract images') + parser.add_argument( + "-i", "--img_dir", help="path of directory " "to extract images" + ) args = parser.parse_args() if not os.path.exists(args.docx): - print('File {} does not exist.'.format(args.docx)) + print("File {} does not exist.".format(args.docx)) sys.exit(1) if args.img_dir is not None: @@ -39,19 +42,22 @@ def qn(tag): example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. Source: https://github.com/python-openxml/python-docx/ """ - nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} - prefix, tagroot = tag.split(':') + nsmap = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} + prefix, tagroot = tag.split(":") uri = nsmap[prefix] - return '{{{}}}{}'.format(uri, tagroot) + return "{{{}}}{}".format(uri, tagroot) class DOCReader(object): def __init__(self, docx, img_dir=None): - if not os.path.exists(docx): - raise Exception('Can not file document: %s' % docx) + if isinstance(docx, str): + if not os.path.exists(docx): + raise Exception("Can not file document: %s" % docx) + elif not hasattr(docx, "read"): + raise Exception(" %s is not a valid path or file object" % docx) self.file = docx self.img_dir = img_dir - self.data = {'links': []} # save header, footer, document, links + self.data = {"links": []} # save header, footer, document, links self.links = {} # read file @@ -59,7 +65,7 @@ def __init__(self, docx, img_dir=None): self.filelist = self.zipf.namelist() # parse hyperlinks - hyperlink_document = 'word/_rels/document.xml.rels' + hyperlink_document = "word/_rels/document.xml.rels" if hyperlink_document in self.filelist: self.process_hyperlink(self.zipf.read(hyperlink_document)) @@ -69,8 +75,8 @@ def process_hyperlink(self, doc): """ root = ET.fromstring(doc) nodes = [node.attrib for node in root] - nodes = filter(lambda x: x.get('TargetMode', '') == 'External', nodes) - self.links = {node['Id']: node['Target'] for node in nodes} + nodes = filter(lambda x: x.get("TargetMode", "") == "External", nodes) + self.links = {node["Id"]: node["Target"] for node in nodes} def xml2text(self, xml): """ @@ -79,42 +85,55 @@ def xml2text(self, xml): equivalent. Adapted from: https://github.com/python-openxml/python-docx/ """ - text = u'' + text = "" root = ET.fromstring(xml) for child in root.iter(): attr = child.attrib for k, v in attr.items(): - if k.endswith('id') and v in self.links: - self.data['links'].append((ET.tostring(child, encoding='utf-8', method='text'), self.links[v])) - if child.tag == qn('w:t'): + if k.endswith("id") and v in self.links: + self.data["links"].append( + ( + ET.tostring(child, encoding="utf-8", method="text"), + self.links[v], + ) + ) + if child.tag == qn("w:t"): t_text = child.text - text += t_text if t_text is not None else '' - elif child.tag == qn('w:tab'): - text += '\t' - elif child.tag in (qn('w:br'), qn('w:cr')): - text += '\n' + text += t_text if t_text is not None else "" + elif child.tag == qn("w:tab"): + text += "\t" + elif child.tag in (qn("w:br"), qn("w:cr")): + text += "\n" elif child.tag == qn("w:p"): - text += '\n\n' + text += "\n\n" return text def process(self): - text = u'' + text = "" # get header text # there can be 3 header files in the zip - header_xmls = re.compile('word/header[0-9]*.xml') - self.data['header'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if header_xmls.match(fname)] - text += '\n'.join(self.data['header']) + header_xmls = re.compile("word/header[0-9]*.xml") + self.data["header"] = [ + self.xml2text(self.zipf.read(fname)) + for fname in self.filelist + if header_xmls.match(fname) + ] + text += "\n".join(self.data["header"]) # get main text - doc_xml = 'word/document.xml' - self.data['document'] = self.xml2text(self.zipf.read(doc_xml)) - text += self.data['document'] + doc_xml = "word/document.xml" + self.data["document"] = self.xml2text(self.zipf.read(doc_xml)) + text += self.data["document"] # get footer text # there can be 3 footer files in the zip - footer_xmls = re.compile('word/footer[0-9]*.xml') - self.data['footer'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if footer_xmls.match(fname)] - text += '\n'.join(self.data['footer']) + footer_xmls = re.compile("word/footer[0-9]*.xml") + self.data["footer"] = [ + self.xml2text(self.zipf.read(fname)) + for fname in self.filelist + if footer_xmls.match(fname) + ] + text += "\n".join(self.data["footer"]) if self.img_dir is not None: # extract images @@ -134,7 +153,7 @@ def process(docx, img_dir=None): return res -if __name__ == '__main__': +if __name__ == "__main__": args = process_args() text = process(args.docx, args.img_dir) - print(text.encode('utf-8')) + print(text.encode("utf-8")) diff --git a/setup.py b/setup.py index 2cb5490..9540931 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from docxpy import VERSION # get all of the scripts -scripts = glob.glob('bin/*') +scripts = glob.glob("bin/*") def read(fname): @@ -12,23 +12,23 @@ def read(fname): setup( - name='docxpy', - packages=['docxpy'], + name="docxpy", + packages=["docxpy"], version=VERSION, - description='A pure python-based utility to extract text, hyperlinks and images' - 'from docx files.', + description="A pure python-based utility to extract text, hyperlinks and images" + "from docx files.", long_description=open("README.rst").read(), - author='Ankush Shah, Yalei Du', - author_email='yaleidu@163.com', - url='https://github.com/badbye/docxpy', - keywords=['python', 'docx', 'text', 'links', 'images', 'extract'], + author="Ankush Shah, Yalei Du", + author_email="yaleidu@163.com", + url="https://github.com/badbye/docxpy", + keywords=["python", "docx", "text", "links", "images", "extract"], scripts=scripts, - test_suite='nose.collector', - tests_require=['nose'], + test_suite="nose.collector", + tests_require=["nose"], classifiers=[ "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5" - ] + "Programming Language :: Python :: 3.5", + ], ) diff --git a/tests/test-hello.py b/tests/test-hello.py index 80c81e0..fe5480c 100644 --- a/tests/test-hello.py +++ b/tests/test-hello.py @@ -1,26 +1,37 @@ +import os.path as pth import unittest from docxpy import DOCReader +test_file = pth.join(pth.split(__file__)[0], "Hello.docx") + class Test(unittest.TestCase): def setUp(self): - self.file = DOCReader('Hello.docx') + self.file = DOCReader(test_file) self.file.process() def test_file_data(self): self.assertIsInstance(self.file.data, dict) - self.assertTrue('header' in self.file.data) - self.assertTrue('footer' in self.file.data) - self.assertTrue('document' in self.file.data) + self.assertTrue("header" in self.file.data) + self.assertTrue("footer" in self.file.data) + self.assertTrue("document" in self.file.data) def test_hyperlinks(self): - links = self.file.data['links'] - self.assertEqual(links, [('This is a hyperlink.'.encode('utf-8'), 'https://www.google.com/')]) + links = self.file.data["links"] + self.assertEqual( + links, [("This is a hyperlink.".encode("utf-8"), "https://www.google.com/")] + ) + + def test_text_fobject(self): + file = DOCReader(open(test_file, "rb")) + file.process() + text = file.data["document"].replace("\n", "") + self.assertEqual(text, "TitleThis is a hyperlink.") def test_text(self): - text = self.file.data['document'].replace('\n', '') - self.assertEqual(text, 'TitleThis is a hyperlink.') + text = self.file.data["document"].replace("\n", "") + self.assertEqual(text, "TitleThis is a hyperlink.") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()