dergachev · yelidrissi · Aug 9, 2018 · Aug 9, 2018 · Aug 22, 2018 · Aug 22, 2018
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM  ubuntu:trusty
+FROM  ubuntu:bionic
 MAINTAINER Alex Dergachev <alex@evolvingweb.ca>
 
 # check if the docker host is running squid-deb-proxy, and use it
@@ -9,14 +9,15 @@ RUN echo "HEAD /" | nc `cat /tmp/host_ip.txt` 8000 | grep squid-deb-proxy && (ec
 RUN apt-get update -y && apt-get install -y curl wget git fontconfig make vim
 
 RUN echo 'LC_ALL="en_US.UTF-8"' > /etc/default/locale
-RUN apt-get install -y ruby1.9.3
+RUN apt-get install -y ruby2.5
 
 # get pandocfilters, a helper library for writing pandoc filters in python
 RUN apt-get -y install python-pip
 RUN pip install pandocfilters
 
 # latex tools
-RUN apt-get update -y && apt-get install -y texlive-latex-base texlive-xetex latex-xcolor texlive-math-extra texlive-latex-extra texlive-fonts-extra rubber latexdiff
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y apt-utils && apt-get install -y texlive-latex-base texlive-xetex texlive-pstricks texlive-science texlive-latex-extra texlive-fonts-extra rubber latexdiff
 
 # greatly speeds up nokogiri install
 # dependencies for nokogiri gem
@@ -28,10 +29,14 @@ RUN (gem list bundler | grep bundler) || gem install bundler
 # install gems
 ADD Gemfile /tmp/
 ADD Gemfile.lock /tmp/
-RUN cd /tmp && bundle config build.nokogiri --use-system-libraries && bundle install
+RUN apt-get install -y build-essential patch ruby-dev zlib1g-dev liblzma-dev
+RUN cd /tmp && bundle config build.nokogiri --use-system-libraries --with-xml2-include=/usr/include/libxml2 --with-xml2-lib=/usr/lib/ && bundle install
 
 # install pandoc 1.12 by from manually downloaded trusty deb packages (saucy only has 1.11, which is too old)
-RUN apt-get install -y pandoc
+#RUN apt-get install -y pandoc
+RUN mkdir -p /tmp/debs/ && cd /tmp/debs && \
+    wget https://github.com/jgm/pandoc/releases/download/2.2.3.2/pandoc-2.2.3.2-1-amd64.deb && \
+    dpkg -i *.deb
 
 EXPOSE 12736
 WORKDIR /var/gdocs-export/
diff --git a/Gemfile b/Gemfile
@@ -11,3 +11,5 @@ gem "nokogiri"
 
 gem "rspec", ">=3.1"
 gem "rspec_junit_formatter", :group => :development
+
+gem "css_parser", "~> 1.6"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -7,6 +7,8 @@ GEM
       extlib (>= 0.9.15)
       multi_json (>= 1.0.0)
     builder (3.2.2)
+    css_parser (1.6.0)
+      addressable
     diff-lcs (1.2.5)
     extlib (0.9.16)
     faraday (0.8.11)
@@ -25,11 +27,13 @@ GEM
       multi_json (>= 1.5)
     launchy (2.4.3)
       addressable (~> 2.3)
-    mini_portile2 (2.0.0)
+    mini_portile2 (2.3.0)
     multi_json (1.11.2)
     multipart-post (1.2.0)
-    nokogiri (1.6.7.1)
-      mini_portile2 (~> 2.0.0.rc2)
+    nokogiri (1.8.4)
+      mini_portile2 (~> 2.3.0)
+    nokogiri (1.8.4-x64-mingw32)
+      mini_portile2 (~> 2.3.0)
     rspec (3.4.0)
       rspec-core (~> 3.4.0)
       rspec-expectations (~> 3.4.0)
@@ -56,8 +60,10 @@ GEM
 
 PLATFORMS
   ruby
+  x64-mingw32
 
 DEPENDENCIES
+  css_parser (~> 1.6)
   google-api-client (= 0.6.4)
   jwt (~> 0.1.4)
   nokogiri
@@ -66,4 +72,4 @@ DEPENDENCIES
   thor
 
 BUNDLED WITH
-   1.15.4
+   1.16.3
diff --git a/Makefile b/Makefile
@@ -48,12 +48,12 @@ latex:
 	cp $(input_file) $(OUTPUT)/in.html
 
 	bundle exec ruby -C$(OUTPUT) "$$PWD/lib/pandoc-preprocess.rb" in.html > $(OUTPUT)/preprocessed.html
-	pandoc --parse-raw $(OUTPUT)/preprocessed.html -t json > $(OUTPUT)/pre.json
+	pandoc $(OUTPUT)/preprocessed.html -f html+raw_html -t json > $(OUTPUT)/pre.json
 	cat $(OUTPUT)/pre.json | ./lib/pandoc-filter.py > $(OUTPUT)/post.json
 
 	# use pandoc to create metadata.tex, main.tex (these are included by ew-template.tex)
-	pandoc $(OUTPUT)/post.json --no-wrap -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
-	pandoc $(OUTPUT)/post.json --chapters --no-wrap -t latex > $(OUTPUT)/main.tex
+	pandoc $(OUTPUT)/post.json --wrap=none -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
+	pandoc $(OUTPUT)/post.json --top-level-division=chapter --wrap=none -t latex > $(OUTPUT)/main.tex
 
 	# must use -o with docx output format, since its binary
 	pandoc $(OUTPUT)/post.json -s -t docx -o $(OUTPUT)/$(name).docx
@@ -64,13 +64,13 @@ pdf:
 	echo "Created $(OUTPUT)/$(name).tex, compiling into $(name).pdf"
 	# rubber will set output PDF filename based on latex input filename
 	cp -f $(OUTPUT)/template.tex $(OUTPUT)/$(name).tex
-	( cd $(OUTPUT); latexmk -pdf $(name))
+	( cd $(OUTPUT); rubber --pdf $(name))
 
 convert: latex pdf
 
 diff:
+	# run latexdiff (with --flatten) on the $(before) .tex and the current .tex, writing diff.tex
 	/usr/bin/perl "`which latexdiff`" --flatten $(outdir)/$(before)/$(before).tex $(OUTPUT)/$(name).tex > $(OUTPUT)/diff.tex
-	(cd $(OUTPUT); latexmk -pdf diff)
+	# use && rather than ; so rubber is not run from the wrong directory when cd fails
+	(cd $(OUTPUT) && rubber --pdf diff)
 
 
 #===============================================================================

diff --git a/assets/default/template.tex b/assets/default/template.tex
@@ -69,7 +69,8 @@
 \else\Gin@nat@width\fi}
 \makeatother
 \let\Oldincludegraphics\includegraphics
-\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
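+% Disabled: the redefinition below interferes with the image size information that pandoc emits for images
+% (presumably the optional [width=...,height=...] argument of \includegraphics -- confirm).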
+%\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
 
 % Uncomment to make embedded links print out HREF in footnotes, good for printing
 % \renewcommand{\href}[2]{#2\footnote{\url{#1}}}

diff --git a/lib/include/preprocess.rb b/lib/include/preprocess.rb
@@ -1,13 +1,19 @@
 require 'nokogiri'
 require 'open-uri'
+require 'css_parser'
 
 class PandocPreprocess
   attr_reader :doc, :downloads
   def initialize(html)
     @source = html
     @doc = Nokogiri::HTML(html)
-    doc.encoding = 'UTF-8'
+    @doc.encoding = 'UTF-8'
+    # Text of the first <style> element. A document without one yields an
+    # empty stylesheet instead of raising NoMethodError on nil.
+    style_node = @doc.at_css("style")
+    @style_sheet = style_node ? style_node.inner_text : ""
     @downloads = {}
+
+    # Published as globals because GdocTable#initialize reads
+    # $parsed_style_sheet when it looks up per-class cell widths.
+    $style_sheet = @style_sheet
+    $parsed_style_sheet = CssParser::Parser.new
+    $parsed_style_sheet.load_string!(@style_sheet)
   end
 
   def download_resources
@@ -30,6 +36,8 @@ def process
     fixup_empty_headers
     fixup_page_breaks
     fixup_lists
+    fixup_image_attributes
+    add_colgroup_to_tables
   end
 
   # Replace remote with local images
@@ -181,4 +189,103 @@ def validate_colspan
       @errors << "Colspan > 1 for \"#{short}\""
     end
   end
+  # Add width and height attributes to images.
+  def fixup_image_attributes
+    # Copies the pixel height/width found in each <img>'s inline style into
+    # plain height/width attributes on the same element.
+    doc.css("img").each do |img|
+      style = img.attr('style')
+      # An image without an inline style has no size to copy; skip it
+      # instead of failing on nil.
+      next if style.nil?
+      %w[height width].each do |att|
+        # The lookbehind stops "line-height:" / "max-width:" from matching.
+        # String#[] returns nil when the property is absent or not in px.
+        val = style[/(?<![\w-])#{att}\s*:\s*([\d.]+)px/, 1]
+        img.set_attribute(att, val) if val
+      end
+    end
+  end
+  # Adds a colgroup that includes col tags with a relative width attribute, to all tables. Necessary in order to be parsed by Pandoc.
+  def add_colgroup_to_tables
+    # Wrap every <table> first, then prepend a <colgroup> to each one.
+    gdoc_tables = @doc.css("table").map { |table_node| GdocTable.new(table_node) }
+    gdoc_tables.each { |gdoc_table| gdoc_table.prepend_colgroup }
+  end
+end
+
+# Class to simplify dealing with HTML tables
+class HtmlTable
+  # table: the wrapped table node.
+  # index: hash of [row, column] => cell node (zero-based positions).
+  # size:  [number of rows, number of cells in the widest row].
+  attr_accessor :table, :index, :size
+  # Wraps html_table and indexes its cells straight away.
+  def initialize(html_table)
+    @table = html_table
+    table_cells_index
+  end
+  # Walks every "tr" found under the table and every "td" found under each
+  # row, recording each cell under its [row, column] position.
+  # Sets @size and @index; returns the index hash.
+  def table_cells_index
+    cells = {}
+    widest = 0
+    row_count = 0
+    @table.search("tr").each_with_index do |tr, i|
+      tds = tr.search("td")
+      tds.each_with_index { |td, j| cells[[i, j]] = td }
+      widest = [widest, tds.length].max
+      row_count = i + 1
+    end
+    @size = [row_count, widest]
+    @index = cells
+  end
+  # Maps each [row, column] position to the cell's class attribute value,
+  # or nil when the cell has no class attribute (looking the attribute up
+  # through #attributes raised NoMethodError for such cells).
+  def css_classes_index
+    @css_classes_index ||= @index.map { |pos, cell| [pos, cell["class"]] }.to_h
+  end
+  # Distinct class attribute values used by the cells; cells without a
+  # class are left out.
+  def css_classes
+    @css_classes ||= css_classes_index.values.compact.uniq
+  end
+  # Converts a {[row, column] => value} hash into a nested array addressed
+  # as out[row][column]. Rows are not padded, so the result can be ragged
+  # and can contain nil for positions that were never set.
+  def self.h_to_a(h)
+    h.each_with_object([]) do |((i, j), v), out|
+      (out[i] ||= [])[j] = v
+    end
+  end
+end
+
+# More specialized HTML tables class to deal with how Google-Docs formats tables
+class GdocTable < HtmlTable
+  # Parsed stylesheet used to look up the width declared for each cell class.
+  attr_reader :parsed_style_sheet
+  # Indexes the table (see HtmlTable) and captures the global
+  # $parsed_style_sheet as it is at construction time.
+  def initialize(html_table)
+    super
+    @parsed_style_sheet = $parsed_style_sheet
+  end
+  # Maps each cell class to the width captured from the first rule set found
+  # for ".<class>", as the matched numeric string, or 0 when the class has no
+  # rule or the rule declares no matching width.
+  # NOTE(review): px and pt values are used as-is, without unit conversion.
+  def width_by_class
+    @width_by_class ||= css_classes.map do |c|
+      # The number must be at least two characters long.
+      # NOTE(review): presumably so that one-digit "border-*-width: 1pt"
+      # declarations are not mistaken for the cell width -- confirm.
+      width_regex = /width:\s*([\d.]{2,})(px|pt)/
+      rule_set = parsed_style_sheet.find_by_selector(".#{c}").first
+      # to_s turns a missing rule (nil) into "", and String#[] returns nil
+      # when nothing matches, so the fallback to 0 is actually reachable.
+      width = rule_set.to_s[width_regex, 1] || 0
+      [c, width]
+    end.to_h
+  end
+  # Maps each [row, column] position to the width of that cell's class
+  # (nil for a class with no entry in width_by_class).
+  def width_by_cell
+    @width_by_cell ||= self.css_classes_index.map do |k, v|
+      [k, width_by_class[v]]
+    end.to_h
+  end
+  # Largest cell width found in each column, as floats.
+  def width_by_column
+    @width_by_column ||= begin
+      # A row with no cells comes back from h_to_a as nil.
+      rows = HtmlTable::h_to_a(width_by_cell).map { |row| row || [] }
+      column_count = rows.map(&:length).max || 0
+      # Array#transpose raises IndexError on rows of different lengths, so
+      # pad short rows with nil (nil.to_f is 0.0 and never wins the max).
+      padded = rows.map { |row| row + [nil] * (column_count - row.length) }
+      padded.transpose.map { |col| col.map { |x| x.to_f }.max }
+    end
+  end
+  # Each column's share of the total width, as fractions that sum to 1.
+  # When no column has a usable width the columns get equal shares, rather
+  # than the NaN that dividing by a zero total would produce.
+  def relative_width_by_column
+    @relative_width_by_column ||= begin
+      total = width_by_column.sum
+      if total > 0
+        width_by_column.map { |x| x / total }
+      else
+        width_by_column.map { 1.0 / width_by_column.length }
+      end
+    end
+  end
+  # Builds the <colgroup> markup: one <col> per column, with its relative
+  # width as a percentage rounded to two decimals.
+  def colgroup_statement
+    cols = relative_width_by_column.map do |w|
+      "<col width=\"#{sprintf('%.2f', w*100)}%\" />"
+    end
+    "<colgroup>" + cols.join + "</colgroup>"
+  end
+  # Inserts str at the very start of the table's inner HTML.
+  def prepend(str)
+    @table.inner_html = str + @table.inner_html
+  end
+  # Prepends the generated <colgroup> to the table.
+  def prepend_colgroup
+    prepend(self.colgroup_statement)
+  end
 end
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,3 +11,5 @@ gem "nokogiri"

		gem "rspec", ">=3.1"
		gem "rspec_junit_formatter", :group => :development

		gem "css_parser", "~> 1.6"