badbye · Tinny-Robot · Aug 26, 2022
diff --git a/docxpy/__init__.py b/docxpy/__init__.py
@@ -2,4 +2,4 @@
 from .docxreader import process_args
 from .docxreader import DOCReader
 
-VERSION = '0.8.5'
+VERSION = "0.8.5"
diff --git a/docxpy/docxreader.py b/docxpy/docxreader.py
@@ -9,17 +9,20 @@
 
 
 def process_args():
-    parser = argparse.ArgumentParser(description='A pure python-based utility '
-                                                 'to extract text and images '
-                                                 'from docx files.')
+    parser = argparse.ArgumentParser(
+        description="A pure python-based utility "
+        "to extract text and images "
+        "from docx files."
+    )
     parser.add_argument("docx", help="path of the docx file")
-    parser.add_argument('-i', '--img_dir', help='path of directory '
-                                                'to extract images')
+    parser.add_argument(
+        "-i", "--img_dir", help="path of directory " "to extract images"
+    )
 
     args = parser.parse_args()
 
     if not os.path.exists(args.docx):
-        print('File {} does not exist.'.format(args.docx))
+        print("File {} does not exist.".format(args.docx))
         sys.exit(1)
 
     if args.img_dir is not None:
@@ -39,27 +42,30 @@ def qn(tag):
     example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``.
     Source: https://github.com/python-openxml/python-docx/
     """
-    nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
-    prefix, tagroot = tag.split(':')
+    nsmap = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
+    prefix, tagroot = tag.split(":")
     uri = nsmap[prefix]
-    return '{{{}}}{}'.format(uri, tagroot)
+    return "{{{}}}{}".format(uri, tagroot)
 
 
 class DOCReader(object):
     def __init__(self, docx, img_dir=None):
-        if not os.path.exists(docx):
-            raise Exception('Can not file document: %s' % docx)
+        is_path = isinstance(docx, (str, type(u""), getattr(os, "PathLike", str)))
+        if is_path and not os.path.exists(docx):  # fail early on a missing path
+            raise Exception("Can not find document: %s" % docx)
+        if not is_path and not hasattr(docx, "read"):  # neither path nor file object
+            raise Exception(" %s is not a valid path or file object" % docx)
         self.file = docx
         self.img_dir = img_dir
-        self.data = {'links': []}  # save header, footer, document, links
+        self.data = {"links": []}  # save header, footer, document, links
         self.links = {}
 
         # read file
         self.zipf = zipfile.ZipFile(self.file)
         self.filelist = self.zipf.namelist()
 
         # parse hyperlinks
-        hyperlink_document = 'word/_rels/document.xml.rels'
+        hyperlink_document = "word/_rels/document.xml.rels"
         if hyperlink_document in self.filelist:
             self.process_hyperlink(self.zipf.read(hyperlink_document))
 
@@ -69,8 +75,8 @@ def process_hyperlink(self, doc):
         """
         root = ET.fromstring(doc)
         nodes = [node.attrib for node in root]
-        nodes = filter(lambda x: x.get('TargetMode', '') == 'External', nodes)
-        self.links = {node['Id']: node['Target'] for node in nodes}
+        nodes = filter(lambda x: x.get("TargetMode", "") == "External", nodes)
+        self.links = {node["Id"]: node["Target"] for node in nodes}
 
     def xml2text(self, xml):
         """
@@ -79,42 +85,55 @@ def xml2text(self, xml):
         equivalent.
         Adapted from: https://github.com/python-openxml/python-docx/
         """
-        text = u''
+        text = ""
         root = ET.fromstring(xml)
         for child in root.iter():
             attr = child.attrib
             for k, v in attr.items():
-                if k.endswith('id') and v in self.links:
-                    self.data['links'].append((ET.tostring(child, encoding='utf-8', method='text'), self.links[v]))
-            if child.tag == qn('w:t'):
+                if k.endswith("id") and v in self.links:
+                    self.data["links"].append(
+                        (
+                            ET.tostring(child, encoding="utf-8", method="text"),
+                            self.links[v],
+                        )
+                    )
+            if child.tag == qn("w:t"):
                 t_text = child.text
-                text += t_text if t_text is not None else ''
-            elif child.tag == qn('w:tab'):
-                text += '\t'
-            elif child.tag in (qn('w:br'), qn('w:cr')):
-                text += '\n'
+                text += t_text if t_text is not None else ""
+            elif child.tag == qn("w:tab"):
+                text += "\t"
+            elif child.tag in (qn("w:br"), qn("w:cr")):
+                text += "\n"
             elif child.tag == qn("w:p"):
-                text += '\n\n'
+                text += "\n\n"
         return text
 
     def process(self):
-        text = u''
+        text = ""
         # get header text
         # there can be 3 header files in the zip
-        header_xmls = re.compile('word/header[0-9]*.xml')
-        self.data['header'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if header_xmls.match(fname)]
-        text += '\n'.join(self.data['header'])
+        header_xmls = re.compile("word/header[0-9]*.xml")
+        self.data["header"] = [
+            self.xml2text(self.zipf.read(fname))
+            for fname in self.filelist
+            if header_xmls.match(fname)
+        ]
+        text += "\n".join(self.data["header"])
 
         # get main text
-        doc_xml = 'word/document.xml'
-        self.data['document'] = self.xml2text(self.zipf.read(doc_xml))
-        text += self.data['document']
+        doc_xml = "word/document.xml"
+        self.data["document"] = self.xml2text(self.zipf.read(doc_xml))
+        text += self.data["document"]
 
         # get footer text
         # there can be 3 footer files in the zip
-        footer_xmls = re.compile('word/footer[0-9]*.xml')
-        self.data['footer'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if footer_xmls.match(fname)]
-        text += '\n'.join(self.data['footer'])
+        footer_xmls = re.compile("word/footer[0-9]*.xml")
+        self.data["footer"] = [
+            self.xml2text(self.zipf.read(fname))
+            for fname in self.filelist
+            if footer_xmls.match(fname)
+        ]
+        text += "\n".join(self.data["footer"])
 
         if self.img_dir is not None:
             # extract images
@@ -134,7 +153,8 @@ def process(docx, img_dir=None):
     return res
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = process_args()
     text = process(args.docx, args.img_dir)
-    print(text.encode('utf-8'))
+    # Write the UTF-8 bytes directly: print() would show the b'...' repr on Python 3.
+    getattr(sys.stdout, "buffer", sys.stdout).write(text.encode("utf-8") + b"\n")
diff --git a/setup.py b/setup.py
@@ -4,31 +4,31 @@
 from docxpy import VERSION
 
 # get all of the scripts
-scripts = glob.glob('bin/*')
+scripts = glob.glob("bin/*")
 
 
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
 
 setup(
-    name='docxpy',
-    packages=['docxpy'],
+    name="docxpy",
+    packages=["docxpy"],
     version=VERSION,
-    description='A pure python-based utility to extract text, hyperlinks and images'
-              'from docx files.',
+    description="A pure python-based utility to extract text, hyperlinks and images"
+    " from docx files.",
     long_description=open("README.rst").read(),
-    author='Ankush Shah, Yalei Du',
-    author_email='yaleidu@163.com',
-    url='https://github.com/badbye/docxpy',
-    keywords=['python', 'docx', 'text', 'links', 'images', 'extract'],
+    author="Ankush Shah, Yalei Du",
+    author_email="yaleidu@163.com",
+    url="https://github.com/badbye/docxpy",
+    keywords=["python", "docx", "text", "links", "images", "extract"],
     scripts=scripts,
-    test_suite='nose.collector',
-    tests_require=['nose'],
+    test_suite="nose.collector",
+    tests_require=["nose"],
     classifiers=[
         "Programming Language :: Python :: 2.7",
         "Programming Language :: Python :: 3.3",
         "Programming Language :: Python :: 3.4",
-        "Programming Language :: Python :: 3.5"
-  ]
+        "Programming Language :: Python :: 3.5",
+    ],
 )
diff --git a/tests/test-hello.py b/tests/test-hello.py
@@ -1,26 +1,39 @@
+import os.path as pth
 import unittest
 from docxpy import DOCReader
 
+test_file = pth.join(pth.split(__file__)[0], "Hello.docx")
+
 
 class Test(unittest.TestCase):
     def setUp(self):
-        self.file = DOCReader('Hello.docx')
+        self.file = DOCReader(test_file)
         self.file.process()
 
     def test_file_data(self):
         self.assertIsInstance(self.file.data, dict)
-        self.assertTrue('header' in self.file.data)
-        self.assertTrue('footer' in self.file.data)
-        self.assertTrue('document' in self.file.data)
+        self.assertTrue("header" in self.file.data)
+        self.assertTrue("footer" in self.file.data)
+        self.assertTrue("document" in self.file.data)
 
     def test_hyperlinks(self):
-        links = self.file.data['links']
-        self.assertEqual(links, [('This is a hyperlink.'.encode('utf-8'), 'https://www.google.com/')])
+        links = self.file.data["links"]
+        self.assertEqual(
+            links, [("This is a hyperlink.".encode("utf-8"), "https://www.google.com/")]
+        )
+
+    def test_text_fobject(self):
+        # DOCReader also accepts an open binary file object instead of a path.
+        with open(test_file, "rb") as fobj:
+            reader = DOCReader(fobj)
+            reader.process()
+        text = reader.data["document"].replace("\n", "")
+        self.assertEqual(text, "TitleThis is a hyperlink.")
 
     def test_text(self):
-        text = self.file.data['document'].replace('\n', '')
-        self.assertEqual(text, 'TitleThis is a hyperlink.')
+        text = self.file.data["document"].replace("\n", "")
+        self.assertEqual(text, "TitleThis is a hyperlink.")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()