Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docxpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .docxreader import process_args
from .docxreader import DOCReader

VERSION = '0.8.5'
VERSION = "0.8.5"
93 changes: 56 additions & 37 deletions docxpy/docxreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,20 @@


def process_args():
parser = argparse.ArgumentParser(description='A pure python-based utility '
'to extract text and images '
'from docx files.')
parser = argparse.ArgumentParser(
description="A pure python-based utility "
"to extract text and images "
"from docx files."
)
parser.add_argument("docx", help="path of the docx file")
parser.add_argument('-i', '--img_dir', help='path of directory '
'to extract images')
parser.add_argument(
"-i", "--img_dir", help="path of directory " "to extract images"
)

args = parser.parse_args()

if not os.path.exists(args.docx):
print('File {} does not exist.'.format(args.docx))
print("File {} does not exist.".format(args.docx))
sys.exit(1)

if args.img_dir is not None:
Expand All @@ -39,27 +42,30 @@ def qn(tag):
example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``.
Source: https://github.com/python-openxml/python-docx/
"""
nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
prefix, tagroot = tag.split(':')
nsmap = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
prefix, tagroot = tag.split(":")
uri = nsmap[prefix]
return '{{{}}}{}'.format(uri, tagroot)
return "{{{}}}{}".format(uri, tagroot)


class DOCReader(object):
def __init__(self, docx, img_dir=None):
    """Open a docx archive and collect its external hyperlink targets.

    Args:
        docx: Path to the docx file (a string or an ``os.PathLike``
            object), or a file-like object exposing ``read``.
        img_dir: Optional image directory; stored as ``self.img_dir``.

    Raises:
        Exception: If ``docx`` is a path that does not exist, or if it is
            neither a path nor a file-like object.
    """
    # Objects implementing __fspath__ (e.g. pathlib.Path) are paths too.
    is_path = isinstance(docx, str) or hasattr(docx, "__fspath__")
    if is_path:
        if not os.path.exists(docx):
            raise Exception("Can not find document: %s" % (docx,))
    elif not hasattr(docx, "read"):
        # 1-tuple keeps %-formatting safe when docx is itself a tuple.
        raise Exception(" %s is not a valid path or file object" % (docx,))

    self.file = docx
    self.img_dir = img_dir
    self.data = {"links": []}  # save header, footer, document, links
    self.links = {}

    # read file
    self.zipf = zipfile.ZipFile(self.file)
    self.filelist = self.zipf.namelist()

    # parse hyperlinks (the relationships part is optional in the archive)
    hyperlink_document = "word/_rels/document.xml.rels"
    if hyperlink_document in self.filelist:
        self.process_hyperlink(self.zipf.read(hyperlink_document))

Expand All @@ -69,8 +75,8 @@ def process_hyperlink(self, doc):
"""
root = ET.fromstring(doc)
nodes = [node.attrib for node in root]
nodes = filter(lambda x: x.get('TargetMode', '') == 'External', nodes)
self.links = {node['Id']: node['Target'] for node in nodes}
nodes = filter(lambda x: x.get("TargetMode", "") == "External", nodes)
self.links = {node["Id"]: node["Target"] for node in nodes}

def xml2text(self, xml):
    """Convert the XML of a document part into plain text.

    Every element of the parsed tree is visited: ``w:t`` contributes its
    text, ``w:tab`` a tab, ``w:br`` and ``w:cr`` a newline, and ``w:p``
    two newlines. While walking, any element that has an attribute whose
    name ends in ``id`` and whose value is a key of ``self.links`` is
    appended to ``self.data["links"]`` as a pair of (UTF-8 encoded text
    of the element, link target).
    Adapted from: https://github.com/python-openxml/python-docx/

    Args:
        xml: XML content accepted by ``ET.fromstring``.

    Returns:
        The extracted text as one string.
    """
    # Qualified tag names are loop-invariant; resolve them once.
    text_tag = qn("w:t")
    tab_tag = qn("w:tab")
    break_tags = (qn("w:br"), qn("w:cr"))
    paragraph_tag = qn("w:p")

    pieces = []  # joined once at the end instead of repeated +=
    root = ET.fromstring(xml)
    for child in root.iter():
        for k, v in child.attrib.items():
            if k.endswith("id") and v in self.links:
                self.data["links"].append(
                    (
                        ET.tostring(child, encoding="utf-8", method="text"),
                        self.links[v],
                    )
                )
        tag = child.tag
        if tag == text_tag:
            # An empty <w:t/> has text None and contributes nothing.
            if child.text is not None:
                pieces.append(child.text)
        elif tag == tab_tag:
            pieces.append("\t")
        elif tag in break_tags:
            pieces.append("\n")
        elif tag == paragraph_tag:
            pieces.append("\n\n")
    return "".join(pieces)

def process(self):
text = u''
text = ""
# get header text
# there can be 3 header files in the zip
header_xmls = re.compile('word/header[0-9]*.xml')
self.data['header'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if header_xmls.match(fname)]
text += '\n'.join(self.data['header'])
header_xmls = re.compile("word/header[0-9]*.xml")
self.data["header"] = [
self.xml2text(self.zipf.read(fname))
for fname in self.filelist
if header_xmls.match(fname)
]
text += "\n".join(self.data["header"])

# get main text
doc_xml = 'word/document.xml'
self.data['document'] = self.xml2text(self.zipf.read(doc_xml))
text += self.data['document']
doc_xml = "word/document.xml"
self.data["document"] = self.xml2text(self.zipf.read(doc_xml))
text += self.data["document"]

# get footer text
# there can be 3 footer files in the zip
footer_xmls = re.compile('word/footer[0-9]*.xml')
self.data['footer'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if footer_xmls.match(fname)]
text += '\n'.join(self.data['footer'])
footer_xmls = re.compile("word/footer[0-9]*.xml")
self.data["footer"] = [
self.xml2text(self.zipf.read(fname))
for fname in self.filelist
if footer_xmls.match(fname)
]
text += "\n".join(self.data["footer"])

if self.img_dir is not None:
# extract images
Expand All @@ -134,7 +153,7 @@ def process(docx, img_dir=None):
return res


if __name__ == "__main__":
    args = process_args()
    text = process(args.docx, args.img_dir)
    # Write the UTF-8 bytes themselves: print() on a bytes object shows a
    # b'...' repr on Python 3. Python 2's sys.stdout has no ``buffer``
    # attribute and accepts the encoded bytes directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(text.encode("utf-8") + b"\n")
26 changes: 13 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,31 @@
from docxpy import VERSION

# get all of the scripts
scripts = glob.glob('bin/*')
scripts = glob.glob("bin/*")


def read(fname):
    """Return the contents of ``fname``, looked up next to this file."""
    path = os.path.join(os.path.dirname(__file__), fname)
    # Context manager closes the handle instead of leaking it.
    with open(path) as handle:
        return handle.read()


setup(
    name="docxpy",
    packages=["docxpy"],
    version=VERSION,
    # The trailing space stops the two adjacent literals from fusing into
    # "imagesfrom" when Python concatenates them.
    description="A pure python-based utility to extract text, hyperlinks and images "
    "from docx files.",
    # Resolve README.rst next to setup.py through the read() helper.
    long_description=read("README.rst"),
    author="Ankush Shah, Yalei Du",
    author_email="yaleidu@163.com",
    url="https://github.com/badbye/docxpy",
    keywords=["python", "docx", "text", "links", "images", "extract"],
    scripts=scripts,
    test_suite="nose.collector",
    tests_require=["nose"],
    classifiers=[
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3.3",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
    ],
)
29 changes: 20 additions & 9 deletions tests/test-hello.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
import os.path as pth
import unittest
from docxpy import DOCReader

test_file = pth.join(pth.split(__file__)[0], "Hello.docx")


class Test(unittest.TestCase):
    # Exercises DOCReader against the Hello.docx fixture named by test_file.

    def setUp(self):
        # Each test gets a freshly processed reader opened from the path.
        self.file = DOCReader(test_file)
        self.file.process()

    def test_file_data(self):
        self.assertIsInstance(self.file.data, dict)
        # assertIn reports the missing key and the container on failure.
        for key in ("header", "footer", "document"):
            self.assertIn(key, self.file.data)

    def test_hyperlinks(self):
        links = self.file.data["links"]
        self.assertEqual(
            links, [("This is a hyperlink.".encode("utf-8"), "https://www.google.com/")]
        )

    def test_text_fobject(self):
        # Same check as test_text, but the reader is given an open binary
        # file object; the with-block closes the handle once processing ends.
        with open(test_file, "rb") as fobj:
            reader = DOCReader(fobj)
            reader.process()
        text = reader.data["document"].replace("\n", "")
        self.assertEqual(text, "TitleThis is a hyperlink.")

    def test_text(self):
        text = self.file.data["document"].replace("\n", "")
        self.assertEqual(text, "TitleThis is a hyperlink.")


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()