Move the build image to Ubuntu bionic with pandoc 2.2.3.2, and emit table colgroups.

Summary of changes:
- Dockerfile: base image trusty -> bionic, ruby2.5, updated texlive package
  list, nokogiri build dependencies, pandoc installed from the upstream .deb.
- Gemfile / Gemfile.lock: add css_parser; bump nokogiri and mini_portile2.
- Makefile: use pandoc 2.x option names; build PDFs with rubber, not latexmk.
- assets/default/template.tex: stop overriding \includegraphics.
- lib/include/preprocess.rb: copy image sizes from inline styles into
  width/height attributes, and prepend a <colgroup> with relative column
  widths to every table.

Review notes:
- Every new `apt-get install` layer runs `apt-get update` itself, so it does
  not depend on a package index cached by an earlier layer.
- The downloaded pandoc .deb is removed in the layer that installs it.
- preprocess.rb no longer raises NoMethodError when the document has no
  <style> element, an <img> has no px size in its style, or a cell class has
  no width rule; an all-zero table falls back to equal column shares.
- colgroup_statement now emits <colgroup>/<col> markup instead of an empty
  string.
- Blank context lines and leading indentation in the hunks below were
  restored from the hunk line counts; verify them against the target tree
  before applying. The `index` blob ids are those of the original patch.

diff --git a/Dockerfile b/Dockerfile
index dc97e7c..06a82c7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:trusty
+FROM ubuntu:bionic
 MAINTAINER Alex Dergachev
 
 # check if the docker host is running squid-deb-proxy, and use it
@@ -9,14 +9,15 @@ RUN echo "HEAD /" | nc `cat /tmp/host_ip.txt` 8000 | grep squid-deb-proxy && (ec
 RUN apt-get update -y && apt-get install -y curl wget git fontconfig make vim
 
 RUN echo 'LC_ALL="en_US.UTF-8"' > /etc/default/locale
-RUN apt-get install -y ruby1.9.3
+RUN apt-get update -y && apt-get install -y ruby2.5
 
 # get pandocfilters, a helper library for writing pandoc filters in python
 RUN apt-get -y install python-pip
 RUN pip install pandocfilters
 
 # latex tools
-RUN apt-get update -y && apt-get install -y texlive-latex-base texlive-xetex latex-xcolor texlive-math-extra texlive-latex-extra texlive-fonts-extra rubber latexdiff
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y apt-utils && apt-get install -y texlive-latex-base texlive-xetex texlive-pstricks texlive-science texlive-latex-extra texlive-fonts-extra rubber latexdiff
 
 # greatly speeds up nokogiri install
 # dependencies for nokogiri gem
@@ -28,10 +29,14 @@ RUN (gem list bundler | grep bundler) || gem install bundler
 # install gems
 ADD Gemfile /tmp/
 ADD Gemfile.lock /tmp/
-RUN cd /tmp && bundle config build.nokogiri --use-system-libraries && bundle install
+RUN apt-get update -y && apt-get install -y build-essential patch ruby-dev zlib1g-dev liblzma-dev
+RUN cd /tmp && bundle config build.nokogiri --use-system-libraries --with-xml2-include=/usr/include/libxml2 --with-xml2-lib=/usr/lib/ && bundle install
 
-# install pandoc 1.12 by from manually downloaded trusty deb packages (saucy only has 1.11, which is too old)
-RUN apt-get install -y pandoc
+# install pandoc 2.2.3.2 from the upstream GitHub release .deb
+RUN mkdir -p /tmp/debs/ && cd /tmp/debs && \
+    wget https://github.com/jgm/pandoc/releases/download/2.2.3.2/pandoc-2.2.3.2-1-amd64.deb && \
+    dpkg -i *.deb && \
+    cd / && rm -rf /tmp/debs
 
 EXPOSE 12736
 WORKDIR /var/gdocs-export/
diff --git a/Gemfile b/Gemfile
index b526864..dbe5409 100644
--- a/Gemfile
+++ b/Gemfile
@@ -11,3 +11,5 @@ gem "nokogiri"
 
 gem "rspec", ">=3.1"
 gem "rspec_junit_formatter", :group => :development
+
+gem "css_parser", "~> 1.6"
diff --git a/Gemfile.lock b/Gemfile.lock
index 5cd4a9e..3529680 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -7,6 +7,8 @@ GEM
       extlib (>= 0.9.15)
       multi_json (>= 1.0.0)
     builder (3.2.2)
+    css_parser (1.6.0)
+      addressable
     diff-lcs (1.2.5)
     extlib (0.9.16)
     faraday (0.8.11)
@@ -25,11 +27,13 @@ GEM
       multi_json (>= 1.5)
     launchy (2.4.3)
       addressable (~> 2.3)
-    mini_portile2 (2.0.0)
+    mini_portile2 (2.3.0)
     multi_json (1.11.2)
     multipart-post (1.2.0)
-    nokogiri (1.6.7.1)
-      mini_portile2 (~> 2.0.0.rc2)
+    nokogiri (1.8.4)
+      mini_portile2 (~> 2.3.0)
+    nokogiri (1.8.4-x64-mingw32)
+      mini_portile2 (~> 2.3.0)
     rspec (3.4.0)
       rspec-core (~> 3.4.0)
       rspec-expectations (~> 3.4.0)
@@ -56,8 +60,10 @@ GEM
 
 PLATFORMS
   ruby
+  x64-mingw32
 
 DEPENDENCIES
+  css_parser (~> 1.6)
   google-api-client (= 0.6.4)
   jwt (~> 0.1.4)
   nokogiri
@@ -66,4 +72,4 @@ DEPENDENCIES
   thor
 
 BUNDLED WITH
-   1.15.4
+   1.16.3
diff --git a/Makefile b/Makefile
index f6ceb1f..627c47b 100644
--- a/Makefile
+++ b/Makefile
@@ -48,12 +48,12 @@ latex:
 	cp $(input_file) $(OUTPUT)/in.html
 
 	bundle exec ruby -C$(OUTPUT) "$$PWD/lib/pandoc-preprocess.rb" in.html > $(OUTPUT)/preprocessed.html
-	pandoc --parse-raw $(OUTPUT)/preprocessed.html -t json > $(OUTPUT)/pre.json
+	pandoc $(OUTPUT)/preprocessed.html -f html+raw_html -t json > $(OUTPUT)/pre.json
 	cat $(OUTPUT)/pre.json | ./lib/pandoc-filter.py > $(OUTPUT)/post.json
 
 	# use pandoc to create metadata.tex, main.tex (these are included by ew-template.tex)
-	pandoc $(OUTPUT)/post.json --no-wrap -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
-	pandoc $(OUTPUT)/post.json --chapters --no-wrap -t latex > $(OUTPUT)/main.tex
+	pandoc $(OUTPUT)/post.json --wrap=none -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
+	pandoc $(OUTPUT)/post.json --top-level-division=chapter --wrap=none -t latex > $(OUTPUT)/main.tex
 
 	# must use -o with docx output format, since its binary
 	pandoc $(OUTPUT)/post.json -s -t docx -o $(OUTPUT)/$(name).docx
@@ -64,13 +64,13 @@ pdf:
 	echo "Created $(OUTPUT)/$(name).tex, compiling into $(name).pdf"
 	# rubber will set output PDF filename based on latex input filename
 	cp -f $(OUTPUT)/template.tex $(OUTPUT)/$(name).tex
-	( cd $(OUTPUT); latexmk -pdf $(name))
+	( cd $(OUTPUT); rubber --pdf $(name))
 
 convert: latex pdf
 
 diff:
 	/usr/bin/perl "`which latexdiff`" --flatten $(outdir)/$(before)/$(before).tex $(OUTPUT)/$(name).tex > $(OUTPUT)/diff.tex
-	(cd $(OUTPUT); latexmk -pdf diff)
+	(cd $(OUTPUT); rubber --pdf diff)
 
 #===============================================================================
 
diff --git a/assets/default/template.tex b/assets/default/template.tex
index bc13445..b53cae4 100644
--- a/assets/default/template.tex
+++ b/assets/default/template.tex
@@ -69,7 +69,8 @@
 \else\Gin@nat@width\fi}
 \makeatother
 \let\Oldincludegraphics\includegraphics
-\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
+% Disabled: this override interferes with the image size options that pandoc generates.
+%\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
 
 % Uncomment to make embedded links print out HREF in footnotes, good for printing
 % \renewcommand{\href}[2]{#2\footnote{\url{#1}}}
diff --git a/lib/include/preprocess.rb b/lib/include/preprocess.rb
index 0056766..7a35969 100644
--- a/lib/include/preprocess.rb
+++ b/lib/include/preprocess.rb
@@ -1,13 +1,19 @@
 require 'nokogiri'
 require 'open-uri'
+require 'css_parser'
 
 class PandocPreprocess
   attr_reader :doc, :downloads
   def initialize(html)
     @source = html
     @doc = Nokogiri::HTML(html)
-    doc.encoding = 'UTF-8'
+    @doc.encoding = 'UTF-8'
+    @style_sheet = @doc.at_css("style")&.inner_text.to_s # "" when the document has no <style> element
     @downloads = {}
+
+    $style_sheet = @style_sheet
+    $parsed_style_sheet = CssParser::Parser.new
+    $parsed_style_sheet.load_string!(@style_sheet)
   end
 
   def download_resources
@@ -30,6 +36,8 @@ def process
     fixup_empty_headers
     fixup_page_breaks
     fixup_lists
+    fixup_image_attributes
+    add_colgroup_to_tables
   end
 
   # Replace remote with local images
@@ -181,4 +189,116 @@ def validate_colspan
       @errors << "Colspan > 1 for \"#{short}\""
     end
   end
+  # Add width and height attributes to images.
+  # Copies the px values from each img's inline style into plain attributes;
+  # a dimension that the style does not declare in px is left unset.
+  def fixup_image_attributes
+    doc.css("img").each do |img|
+      style = img.attr('style')
+      %w[height width].each do |att|
+        # style is nil when the img has no style attribute, hence to_s
+        val = style.to_s[/#{att}\s*:\s*([\d.]+)px/, 1]
+        img.set_attribute(att, val) if val
+      end
+    end
+  end
+  # Adds a colgroup that includes col tags with a relative width attribute, to all tables. Necessary in order to be parsed by Pandoc.
+  def add_colgroup_to_tables
+    @doc.css("table").map {|t| GdocTable.new(t)}.each &:prepend_colgroup
+  end
+end
+
+# Class to simplify dealing with HTML tables
+class HtmlTable
+  attr_accessor :table, :index, :size
+  def initialize(html_table)
+    @table = html_table
+    table_cells_index
+  end
+  # Builds @index ({[row, col] => td node}) and @size ([tr count, max td count per row]).
+  def table_cells_index
+    result = Hash.new
+    i = 0
+    width = 0
+    @table.search("tr").each do |tr|
+      j = 0
+      tr.search("td").each do |td|
+        result[[i, j]] = td
+        j += 1
+      end
+      width = [width, j].max
+      i += 1
+    end
+    height = i
+    @size = [height, width]
+    @index = result
+    result
+  end
+  # Maps each [row, col] key to the value of that cell's class attribute.
+  def css_classes_index
+    @css_classes_index ||= @index.map { |k, v| [k, v.attributes["class"].value] }.to_h
+  end
+  # Distinct class attribute values used by the table's cells.
+  def css_classes
+    @css_classes ||= css_classes_index.map { |_, v| v }.uniq
+  end
+  # Converts a {[i, j] => value} hash into a nested array indexed as out[i][j].
+  def self.h_to_a(h)
+    out = []
+    h.each do |k, v|
+      i = k[0]
+      j = k[1]
+      out[i] ||= []
+      out[i][j] = v
+    end
+    out
+  end
+end
+
+# More specialized HTML tables class to deal with how Google-Docs formats tables
+class GdocTable < HtmlTable
+  attr_reader :parsed_style_sheet
+  def initialize(html_table)
+    super
+    @parsed_style_sheet = $parsed_style_sheet
+  end
+  # Maps each cell class to the px/pt width declared for it in the style sheet (0 when none is found).
+  def width_by_class
+    @width_by_class ||= css_classes.map do |c|
+      width_regex = /width:\s*([\d.]{2,})(px|pt)/
+      rule_set = parsed_style_sheet.find_by_selector(".#{c}").first
+      # rule_set is nil when no rule matches the class; to_s lets the 0 fallback apply
+      width = rule_set.to_s[width_regex, 1] || 0
+      [c, width]
+    end.to_h
+  end
+  def width_by_cell
+    @width_by_cell ||= self.css_classes_index.map do |k, v|
+      [k, width_by_class[v]]
+    end.to_h
+  end
+  # Widest cell of each column, as floats.
+  def width_by_column
+    @width_by_column ||= HtmlTable::h_to_a(width_by_cell).transpose.map { |col| col.map{|x| x.to_f}.max }
+  end
+  # Column widths as fractions of the table total; equal shares when no width is known.
+  def relative_width_by_column
+    total = width_by_column.sum
+    @relative_width_by_column ||= width_by_column.map { |x| total.zero? ? 1.0 / width_by_column.size : x / total }
+  end
+  # Builds the colgroup markup, one col per column with a percentage width.
+  # NOTE(review): confirm this width syntax against the pandoc HTML reader in use.
+  def colgroup_statement
+    out = "<colgroup>"
+    relative_width_by_column.each do |w|
+      out += "<col width=\"#{(w * 100).round}%\">"
+    end
+    out += "</colgroup>"
+  end
+  def prepend(str)
+    @table.inner_html = str + @table.inner_html
+  end
+  def prepend_colgroup
+    prepend(self.colgroup_statement)
+  end
 end