From d784086c05a34b1a994d1bc77406cc031c3e9dae Mon Sep 17 00:00:00 2001 From: Gonzalo Fernandez-Victorio Date: Sun, 16 May 2021 14:03:26 +0100 Subject: [PATCH 1/3] Move all the structures This creates a new package xml that deals with the actual marshalling and unmarshalling, allowing more possibilities expanding the API --- README.md | 6 +- docxlib.go | 46 ++++++------ getstructure/main.go | 54 -------------- inline.go | 10 +++ link.go | 9 +++ main/main.go | 57 +++++++------- para.go | 44 +++++++++++ text.go | 21 ++++++ apilink.go => xml/apilink.go | 4 +- apipara.go => xml/apipara.go | 6 +- apirun.go => xml/apirun.go | 2 +- empty.go => xml/empty.go | 6 +- empty_constants.go => xml/empty_constants.go | 2 +- xml/libxml.go | 78 ++++++++++++++++++++ pack.go => xml/pack.go | 4 +- structdoc.go => xml/structdoc.go | 2 +- structdoc_test.go => xml/structdoc_test.go | 2 +- structnodes.go => xml/structnodes.go | 4 +- structrel.go => xml/structrel.go | 2 +- structrun.go => xml/structrun.go | 2 +- unpack.go => xml/unpack.go | 6 +- 21 files changed, 236 insertions(+), 131 deletions(-) delete mode 100644 getstructure/main.go create mode 100644 inline.go create mode 100644 link.go create mode 100644 para.go create mode 100644 text.go rename apilink.go => xml/apilink.go (91%) rename apipara.go => xml/apipara.go (73%) rename apirun.go => xml/apirun.go (97%) rename empty.go => xml/empty.go (93%) rename empty_constants.go => xml/empty_constants.go (99%) create mode 100644 xml/libxml.go rename pack.go => xml/pack.go (93%) rename structdoc.go => xml/structdoc.go (97%) rename structdoc_test.go => xml/structdoc_test.go (99%) rename structnodes.go => xml/structnodes.go (98%) rename structrel.go => xml/structrel.go (97%) rename structrun.go => xml/structrun.go (99%) rename unpack.go => xml/unpack.go (95%) diff --git a/README.md b/README.md index 02cf46e..736bd44 100644 --- a/README.md +++ b/README.md @@ -51,9 +51,11 @@ Now trying to read it We've found a new hyperlink with ref 
http://google.com and the text google End of main ``` -You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file(-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go) +You can also increase the log level (-logtostderr=true -v=0). + +And you can just dump a specific file(-file /tmp/new-file.docx -ro) ``` -$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx +$ go build -o docxlib ./main && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx -ro I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] diff --git a/docxlib.go b/docxlib.go index db93c23..abf3527 100644 --- a/docxlib.go +++ b/docxlib.go @@ -1,24 +1,22 @@ package docxlib import ( - "archive/zip" - "errors" "io" + + "github.com/gonfva/docxlib/xml" ) // DocxLib is the structure that allow to access the internal represntation // in memory of the doc (either read or about to be written) type DocxLib struct { - Document Document - DocRelation Relationships - - rId int + lib *xml.LibXML } // New generates a new empty docx file that we can manipulate and // later on, save func New() *DocxLib { - return emptyFile() + lib := xml.New() + return &DocxLib{lib: lib} } // Parse generates a new docx file in memory from a reader @@ -49,30 +47,31 @@ func New() *DocxLib { // docxlib.Parse(file, handler.Size) // } func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) { - zipReader, err := zip.NewReader(reader, size) - if err != nil { - return nil, err - } - doc, err = unpack(zipReader) + libxml, err := xml.Parse(reader, size) + if err != nil { + return nil, err // never hand back a DocxLib that wraps a nil LibXML + } + doc = &DocxLib{lib: libxml} return } // Write allows to save a docx to a writer func (f *DocxLib) Write(writer io.Writer) (err error) { - zipWriter := zip.NewWriter(writer) - defer zipWriter.Close() - - return f.pack(zipWriter) + return 
f.lib.Write(writer) } -// References gets the url for a reference -func (f *DocxLib) References(id string) (href string, err error) { - for _, a := range f.DocRelation.Relationships { - if a.ID == id { - href = a.Target - return - } +func (f *DocxLib) Paragraphs() []*Paragraph { + pars := make([]*Paragraph, 0) + for _, p := range f.lib.Document.Body.Paragraphs { + pars = append(pars, &Paragraph{pxml: p, lib: f.lib}) } - err = errors.New("id not found") - return + + return pars +} + +// AddParagraph adds a new paragraph +func (f *DocxLib) AddParagraph() *Paragraph { + p := f.lib.AddParagraph() + para := &Paragraph{pxml: p, lib: f.lib} + return para } diff --git a/getstructure/main.go b/getstructure/main.go deleted file mode 100644 index db70522..0000000 --- a/getstructure/main.go +++ /dev/null @@ -1,54 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "os" - - "github.com/golang/glog" - "github.com/gonfva/docxlib" -) - -var fileLocation *string - -func init() { - fileLocation = flag.String("file", "/tmp/new-file.docx", "file location") - flag.Parse() -} - -func main() { - //Now let's try to read the file - readFile, err := os.Open(*fileLocation) - if err != nil { - panic(err) - } - fileinfo, err := readFile.Stat() - if err != nil { - panic(err) - } - size := fileinfo.Size() - doc, err := docxlib.Parse(readFile, int64(size)) - if err != nil { - panic(err) - } - for _, para := range doc.Paragraphs() { - glog.Infoln("There is a new paragraph", para) - for _, child := range para.Children() { - if child.Run != nil && child.Run.Text != nil { - fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) - } - if child.Link != nil { - id := child.Link.ID - text := child.Link.Run.InstrText - link, err := doc.References(id) - if err != nil { - fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) - } else { - fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) - } - - } - } - } - 
fmt.Println("End of main") -} diff --git a/inline.go b/inline.go new file mode 100644 index 0000000..eaba9dc --- /dev/null +++ b/inline.go @@ -0,0 +1,10 @@ +package docxlib + +import "github.com/gonfva/docxlib/xml" + +type Inline struct { + childXML *xml.ParagraphChild + Kind string + Link + Text +} diff --git a/link.go b/link.go new file mode 100644 index 0000000..cef7510 --- /dev/null +++ b/link.go @@ -0,0 +1,9 @@ +package docxlib + +import "github.com/gonfva/docxlib/xml" + +type Link struct { + Target string + Text string + pxml *xml.Hyperlink +} diff --git a/main/main.go b/main/main.go index f129a9c..930abea 100644 --- a/main/main.go +++ b/main/main.go @@ -9,35 +9,39 @@ import ( ) var fileLocation *string +var readOnly *bool func init() { fileLocation = flag.String("file", "/tmp/new-file.docx", "file location") + readOnly = flag.Bool("ro", false, "Don't attempt to generate a new file, just read one") flag.Parse() } func main() { - fmt.Printf("Preparing new document to write at %s\n", *fileLocation) + if !*readOnly { + fmt.Printf("Preparing new document to write at %s\n", *fileLocation) - w := docxlib.New() - // add new paragraph - para1 := w.AddParagraph() - // add text - para1.AddText("test") + w := docxlib.New() + // add new paragraph + para1 := w.AddParagraph() + // add text + para1.AddText("test") - para1.AddText("test font size").Size(22) - para1.AddText("test color").Color("808080") - para2 := w.AddParagraph() - para2.AddText("test font size and color").Size(22).Color("ff0000") + para1.AddText("test font size").SetSize(22) + para1.AddText("test color").SetColor("808080") + para2 := w.AddParagraph() + para2.AddText("test font size and color").SetSize(22).SetColor("ff0000") - nextPara := w.AddParagraph() - nextPara.AddLink("google", `http://google.com`) + nextPara := w.AddParagraph() + nextPara.AddLink("google", `http://google.com`) - f, err := os.Create(*fileLocation) - if err != nil { - panic(err) + f, err := os.Create(*fileLocation) + if err != nil { + 
panic(err) + } + defer f.Close() + w.Write(f) + fmt.Println("Document writen. \nNow trying to read it") } - defer f.Close() - w.Write(f) - fmt.Println("Document writen. \nNow trying to read it") // Now let's try to read the file readFile, err := os.Open(*fileLocation) if err != nil { @@ -54,18 +58,13 @@ func main() { } for _, para := range doc.Paragraphs() { for _, child := range para.Children() { - if child.Run != nil { - fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + if child.Kind == "Text" { + fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Text.Content) } - if child.Link != nil { - id := child.Link.ID - text := child.Link.Run.InstrText - link, err := doc.References(id) - if err != nil { - fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) - } else { - fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) - } + if child.Kind == "Link" { + link := child.Link.Target + text := child.Link.Text + fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) } } diff --git a/para.go b/para.go new file mode 100644 index 0000000..0a934c0 --- /dev/null +++ b/para.go @@ -0,0 +1,44 @@ +package docxlib + +import "github.com/gonfva/docxlib/xml" + +type Paragraph struct { + lib *xml.LibXML + pxml *xml.Paragraph +} + +// Children returns the paragraph's text runs and hyperlinks as Inline values, skipping any other kind of child. +func (p *Paragraph) Children() (ret []*Inline) { + ret = make([]*Inline, 0, len(p.pxml.Data)) + for i := range p.pxml.Data { + child := &p.pxml.Data[i] // address the slice element: a shared loop variable would alias every Inline + switch { + case child.Run != nil: + text := Text{pxml: child.Run} + if child.Run.Text != nil { // a run may have no text element + text.Content = child.Run.Text.Text + } + ret = append(ret, &Inline{Kind: "Text", childXML: child, Text: text}) + case child.Link != nil: + target, _ := p.lib.References(child.Link.ID) // an unknown id just leaves Target empty + link := Link{Target: target, Text: child.Link.Run.InstrText} + ret = append(ret, &Inline{Kind: "Link", childXML: child, Link: link}) + } + } + return +} + +// AddLink adds an hyperlink to paragraph +func (p 
*Paragraph) AddLink(text string, link string) *Link { + l := p.pxml.AddLink(text, link) + hyperlink := Link{Target: link, Text: text, pxml: l} + + return &hyperlink +} + +// AddText adds text to paragraph +func (p *Paragraph) AddText(text string) *Text { + t := p.pxml.AddText(text) + txt := Text{Content: text, pxml: t} + return &txt +} diff --git a/text.go b/text.go new file mode 100644 index 0000000..5d1dd26 --- /dev/null +++ b/text.go @@ -0,0 +1,21 @@ +package docxlib + +import "github.com/gonfva/docxlib/xml" + +type Text struct { + Content string + + pxml *xml.Run +} + +// SetColor sets the run color and returns the receiver so calls can be chained. +func (r *Text) SetColor(color string) *Text { + r.pxml.Color(color) + return r +} + +// SetSize sets the run size and returns the receiver so calls can be chained. +func (r *Text) SetSize(size int) *Text { + r.pxml.Size(size) + return r +} diff --git a/apilink.go b/xml/apilink.go similarity index 91% rename from apilink.go rename to xml/apilink.go index 80260ae..a8459ea 100644 --- a/apilink.go +++ b/xml/apilink.go @@ -1,9 +1,9 @@ -package docxlib +package xml import "strconv" // when adding an hyperlink we need to store a reference in the relationship field -func (f *DocxLib) addLinkRelation(link string) string { +func (f *LibXML) addLinkRelation(link string) string { rel := &Relationship{ ID: "rId" + strconv.Itoa(f.rId), Type: REL_HYPERLINK, diff --git a/apipara.go b/xml/apipara.go similarity index 73% rename from apipara.go rename to xml/apipara.go index 6eff353..733603b 100644 --- a/apipara.go +++ b/xml/apipara.go @@ -1,7 +1,7 @@ -package docxlib +package xml // AddParagraph adds a new paragraph -func (f *DocxLib) AddParagraph() *Paragraph { +func (f *LibXML) AddParagraph() *Paragraph { p := &Paragraph{ Data: make([]ParagraphChild, 0), file: f, @@ -11,7 +11,7 @@ func (f *DocxLib) AddParagraph() *Paragraph { return p } -func (f *DocxLib) Paragraphs() []*Paragraph { +func (f *LibXML) Paragraphs() []*Paragraph { return f.Document.Body.Paragraphs } diff --git a/apirun.go b/xml/apirun.go similarity index 97% 
rename from apirun.go rename to xml/apirun.go index 25a9110..468c438 100644 --- a/apirun.go +++ b/xml/apirun.go @@ -1,4 +1,4 @@ -package docxlib +package xml // Color allows to set run color func (r *Run) Color(color string) *Run { diff --git a/empty.go b/xml/empty.go similarity index 93% rename from empty.go rename to xml/empty.go index 929e3e9..65acc59 100644 --- a/empty.go +++ b/xml/empty.go @@ -1,4 +1,4 @@ -package docxlib +package xml import "encoding/xml" @@ -23,8 +23,8 @@ func emptyRelationships() []*Relationship { return defaultRel } -func emptyFile() *DocxLib { - docx := &DocxLib{ +func emptyFile() *LibXML { + docx := &LibXML{ Document: Document{ XMLName: xml.Name{ Space: "w", diff --git a/empty_constants.go b/xml/empty_constants.go similarity index 99% rename from empty_constants.go rename to xml/empty_constants.go index ec8a94e..bd0b9e0 100644 --- a/empty_constants.go +++ b/xml/empty_constants.go @@ -1,4 +1,4 @@ -package docxlib +package xml const ( TEMP_REL = ` diff --git a/xml/libxml.go b/xml/libxml.go new file mode 100644 index 0000000..4da605a --- /dev/null +++ b/xml/libxml.go @@ -0,0 +1,78 @@ +// Package xml contains the stuff to read and write the xml structures. +// Office Open XML is basically a zip of xml files +package xml + +import ( + "archive/zip" + "errors" + "io" +) + +type LibXML struct { + Document Document + DocRelation Relationships + + rId int +} + +// New generates a new empty docx file that we can manipulate and +// later on, save +func New() *LibXML { + return emptyFile() +} + +// Parse generates a new docx file in memory from a reader +// You can it invoke from a file +// readFile, err := os.Open(FILE_PATH) +// if err != nil { +// panic(err) +// } +// fileinfo, err := readFile.Stat() +// if err != nil { +// panic(err) +// } +// size := fileinfo.Size() +// doc, err := docxlib.Parse(readFile, int64(size)) +// but also you can invoke from a webform (BEWARE of trusting users data!!!) 
+// +// func uploadFile(w http.ResponseWriter, r *http.Request) { +// r.ParseMultipartForm(10 << 20) +// +// file, handler, err := r.FormFile("file") +// if err != nil { +// fmt.Println("Error Retrieving the File") +// fmt.Println(err) +// http.Error(w, err.Error(), http.StatusBadRequest) +// return +// } +// defer file.Close() +// docxlib.Parse(file, handler.Size) +// } +func Parse(reader io.ReaderAt, size int64) (doc *LibXML, err error) { + zipReader, err := zip.NewReader(reader, size) + if err != nil { + return nil, err + } + doc, err = unpack(zipReader) + return +} + +// Write allows to save a docx to a writer +func (f *LibXML) Write(writer io.Writer) (err error) { + zipWriter := zip.NewWriter(writer) + defer zipWriter.Close() + + return f.pack(zipWriter) +} + +// References gets the url for a reference +func (f *LibXML) References(id string) (href string, err error) { + for _, a := range f.DocRelation.Relationships { + if a.ID == id { + href = a.Target + return + } + } + err = errors.New("id not found") + return +} diff --git a/pack.go b/xml/pack.go similarity index 93% rename from pack.go rename to xml/pack.go index 5569a86..c8efac6 100644 --- a/pack.go +++ b/xml/pack.go @@ -1,4 +1,4 @@ -package docxlib +package xml import ( "archive/zip" @@ -10,7 +10,7 @@ import ( // This receives a zip file writer (word documents are a zip with multiple xml inside) // and writes the relevant files. 
Some of them come from the empty_constants file, // others from the actual in-memory structure -func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { +func (f *LibXML) pack(zipWriter *zip.Writer) (err error) { files := map[string]string{} files["_rels/.rels"] = TEMP_REL diff --git a/structdoc.go b/xml/structdoc.go similarity index 97% rename from structdoc.go rename to xml/structdoc.go index 5d71039..d72689c 100644 --- a/structdoc.go +++ b/xml/structdoc.go @@ -1,4 +1,4 @@ -package docxlib +package xml import "encoding/xml" diff --git a/structdoc_test.go b/xml/structdoc_test.go similarity index 99% rename from structdoc_test.go rename to xml/structdoc_test.go index 869952b..7521b16 100644 --- a/structdoc_test.go +++ b/xml/structdoc_test.go @@ -1,4 +1,4 @@ -package docxlib +package xml import ( "encoding/xml" diff --git a/structnodes.go b/xml/structnodes.go similarity index 98% rename from structnodes.go rename to xml/structnodes.go index a08dbe0..7541f5b 100644 --- a/structnodes.go +++ b/xml/structnodes.go @@ -1,4 +1,4 @@ -package docxlib +package xml import ( "encoding/xml" @@ -17,7 +17,7 @@ type Paragraph struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` Data []ParagraphChild - file *DocxLib + file *LibXML } func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { diff --git a/structrel.go b/xml/structrel.go similarity index 97% rename from structrel.go rename to xml/structrel.go index 8c96b0d..a1d10b3 100644 --- a/structrel.go +++ b/xml/structrel.go @@ -1,4 +1,4 @@ -package docxlib +package xml import "encoding/xml" diff --git a/structrun.go b/xml/structrun.go similarity index 99% rename from structrun.go rename to xml/structrun.go index 0805627..caeb52d 100644 --- a/structrun.go +++ b/xml/structrun.go @@ -1,4 +1,4 @@ -package docxlib +package xml import ( "encoding/xml" diff --git a/unpack.go b/xml/unpack.go similarity index 95% rename from unpack.go rename to xml/unpack.go index 
ba04311..6192d3d 100644 --- a/unpack.go +++ b/xml/unpack.go @@ -1,4 +1,4 @@ -package docxlib +package xml // This contains internal functions needed to unpack (read) a zip file import ( @@ -13,7 +13,7 @@ import ( // and parses the files that are relevant for us: // 1.-Document // 2.-Relationships -func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) { +func unpack(zipReader *zip.Reader) (docx *LibXML, err error) { var doc *Document var relations *Relationships for _, f := range zipReader.File { @@ -30,7 +30,7 @@ func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) { } } } - docx = &DocxLib{ + docx = &LibXML{ Document: *doc, DocRelation: *relations, } From cf29eb3b43e532969ff0383cd564c6ee04c481d1 Mon Sep 17 00:00:00 2001 From: Gonzalo Fernandez-Victorio Date: Mon, 17 May 2021 20:09:54 +0100 Subject: [PATCH 2/3] Small tweak README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 736bd44..8057e2a 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,8 @@ You can also increase the log level (-logtostderr=true -v=0). And you can just dump a specific file(-file /tmp/new-file.docx -ro) ``` -$ go build -o docxlib ./main && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx -ro +$ go build -o docxlib ./main +$ ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx -ro I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] 
I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] From 8194effd25f19f99adc5be7f475d091525c51969 Mon Sep 17 00:00:00 2001 From: Gonzalo Fernandez-Victorio Date: Mon, 17 May 2021 20:18:02 +0100 Subject: [PATCH 3/3] Badges --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 8057e2a..8679b41 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Docx library +[![Go](https://github.com/gonfva/docxlib/actions/workflows/build.yml/badge.svg)](https://github.com/gonfva/docxlib/actions/workflows/build.yml) [![CodeQL](https://github.com/gonfva/docxlib/actions/workflows/codeql-analysis.yml/badge.svg?branch=master)](https://github.com/gonfva/docxlib/actions/workflows/codeql-analysis.yml) + Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go. ## Introduction