Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:trusty
FROM ubuntu:bionic
MAINTAINER Alex Dergachev <alex@evolvingweb.ca>

# check if the docker host is running squid-deb-proxy, and use it
Expand All @@ -9,14 +9,15 @@ RUN echo "HEAD /" | nc `cat /tmp/host_ip.txt` 8000 | grep squid-deb-proxy && (ec
RUN apt-get update -y && apt-get install -y curl wget git fontconfig make vim

RUN echo 'LC_ALL="en_US.UTF-8"' > /etc/default/locale
RUN apt-get install -y ruby1.9.3
RUN apt-get install -y ruby2.5

# get pandocfilters, a helper library for writing pandoc filters in python
RUN apt-get -y install python-pip
RUN pip install pandocfilters

# latex tools
RUN apt-get update -y && apt-get install -y texlive-latex-base texlive-xetex latex-xcolor texlive-math-extra texlive-latex-extra texlive-fonts-extra rubber latexdiff
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y && apt-get install -y apt-utils && apt-get install -y texlive-latex-base texlive-xetex texlive-pstricks texlive-science texlive-latex-extra texlive-fonts-extra rubber latexdiff

# greatly speeds up nokogiri install
# dependencies for nokogiri gem
Expand All @@ -28,10 +29,14 @@ RUN (gem list bundler | grep bundler) || gem install bundler
# install gems
ADD Gemfile /tmp/
ADD Gemfile.lock /tmp/
RUN cd /tmp && bundle config build.nokogiri --use-system-libraries && bundle install
RUN apt-get install -y build-essential patch ruby-dev zlib1g-dev liblzma-dev
RUN cd /tmp && bundle config build.nokogiri --use-system-libraries --with-xml2-include=/usr/include/libxml2 --with-xml2-lib=/usr/lib/ && bundle install

# install pandoc 1.12 from manually downloaded trusty deb packages (saucy only has 1.11, which is too old)
RUN apt-get install -y pandoc
#RUN apt-get install -y pandoc
RUN mkdir -p /tmp/debs/ && cd /tmp/debs && \
wget https://github.com/jgm/pandoc/releases/download/2.2.3.2/pandoc-2.2.3.2-1-amd64.deb && \
dpkg -i *.deb

EXPOSE 12736
WORKDIR /var/gdocs-export/
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ gem "nokogiri"

gem "rspec", ">=3.1"
gem "rspec_junit_formatter", :group => :development

gem "css_parser", "~> 1.6"
14 changes: 10 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ GEM
extlib (>= 0.9.15)
multi_json (>= 1.0.0)
builder (3.2.2)
css_parser (1.6.0)
addressable
diff-lcs (1.2.5)
extlib (0.9.16)
faraday (0.8.11)
Expand All @@ -25,11 +27,13 @@ GEM
multi_json (>= 1.5)
launchy (2.4.3)
addressable (~> 2.3)
mini_portile2 (2.0.0)
mini_portile2 (2.3.0)
multi_json (1.11.2)
multipart-post (1.2.0)
nokogiri (1.6.7.1)
mini_portile2 (~> 2.0.0.rc2)
nokogiri (1.8.4)
mini_portile2 (~> 2.3.0)
nokogiri (1.8.4-x64-mingw32)
mini_portile2 (~> 2.3.0)
rspec (3.4.0)
rspec-core (~> 3.4.0)
rspec-expectations (~> 3.4.0)
Expand All @@ -56,8 +60,10 @@ GEM

PLATFORMS
ruby
x64-mingw32

DEPENDENCIES
css_parser (~> 1.6)
google-api-client (= 0.6.4)
jwt (~> 0.1.4)
nokogiri
Expand All @@ -66,4 +72,4 @@ DEPENDENCIES
thor

BUNDLED WITH
1.15.4
1.16.3
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ latex:
cp $(input_file) $(OUTPUT)/in.html

bundle exec ruby -C$(OUTPUT) "$$PWD/lib/pandoc-preprocess.rb" in.html > $(OUTPUT)/preprocessed.html
pandoc --parse-raw $(OUTPUT)/preprocessed.html -t json > $(OUTPUT)/pre.json
pandoc $(OUTPUT)/preprocessed.html -f html+raw_html -t json > $(OUTPUT)/pre.json
cat $(OUTPUT)/pre.json | ./lib/pandoc-filter.py > $(OUTPUT)/post.json

# use pandoc to create metadata.tex, main.tex (these are included by ew-template.tex)
pandoc $(OUTPUT)/post.json --no-wrap -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
pandoc $(OUTPUT)/post.json --chapters --no-wrap -t latex > $(OUTPUT)/main.tex
pandoc $(OUTPUT)/post.json --wrap=none -t latex --template $(OUTPUT)/template-metadata.tex > $(OUTPUT)/metadata.tex
pandoc $(OUTPUT)/post.json --top-level-division=chapter --wrap=none -t latex > $(OUTPUT)/main.tex

# must use -o with the docx output format, since it's binary
pandoc $(OUTPUT)/post.json -s -t docx -o $(OUTPUT)/$(name).docx
Expand All @@ -64,13 +64,13 @@ pdf:
echo "Created $(OUTPUT)/$(name).tex, compiling into $(name).pdf"
# rubber will set output PDF filename based on latex input filename
cp -f $(OUTPUT)/template.tex $(OUTPUT)/$(name).tex
( cd $(OUTPUT); latexmk -pdf $(name))
( cd $(OUTPUT); rubber --pdf $(name))

convert: latex pdf

diff:
/usr/bin/perl "`which latexdiff`" --flatten $(outdir)/$(before)/$(before).tex $(OUTPUT)/$(name).tex > $(OUTPUT)/diff.tex
(cd $(OUTPUT); latexmk -pdf diff)
(cd $(OUTPUT); rubber --pdf diff)


#===============================================================================
Expand Down
3 changes: 2 additions & 1 deletion assets/default/template.tex
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@
\else\Gin@nat@width\fi}
\makeatother
\let\Oldincludegraphics\includegraphics
\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
% The line below breaks the way that pandoc converts image metadata in unexpected ways.
%\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}

% Uncomment to make embedded links print out HREF in footnotes, good for printing
% \renewcommand{\href}[2]{#2\footnote{\url{#1}}}
Expand Down
109 changes: 108 additions & 1 deletion lib/include/preprocess.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
require 'nokogiri'
require 'open-uri'
require 'css_parser'

class PandocPreprocess
attr_reader :doc, :downloads
def initialize(html)
  # Parses the exported HTML and prepares the state used by the fixup steps:
  #   @source      - the raw HTML string as given
  #   @doc         - the Nokogiri document, forced to UTF-8
  #   @style_sheet - text of the first <style> element ("" when there is none)
  #   @downloads   - starts out as an empty hash
  @source = html
  @doc = Nokogiri::HTML(html)
  @doc.encoding = 'UTF-8'
  @downloads = {}

  # at_css returns nil when the document has no <style> element; fall back to
  # an empty stylesheet instead of raising NoMethodError on nil.inner_text.
  style_node = @doc.at_css("style")
  @style_sheet = style_node ? style_node.inner_text : ""

  # These globals are kept as-is: GdocTable reads $parsed_style_sheet to look
  # up the width declared for each cell class.
  $style_sheet = @style_sheet
  $parsed_style_sheet = CssParser::Parser.new
  $parsed_style_sheet.load_string!(@style_sheet)
end

def download_resources
Expand All @@ -30,6 +36,8 @@ def process
fixup_empty_headers
fixup_page_breaks
fixup_lists
fixup_image_attributes
add_colgroup_to_tables
end

# Replace remote with local images
Expand Down Expand Up @@ -181,4 +189,103 @@ def validate_colspan
@errors << "Colspan > 1 for \"#{short}\""
end
end
# Add width and height attributes to images.
def fixup_image_attributes
  # Copies the pixel dimensions found in each <img>'s inline style into
  # explicit `width` and `height` attributes on the same element.
  doc.css("img").each do |img|
    style = img.attr('style')
    # Images without an inline style carry no dimensions to copy.
    next if style.nil?

    %w[height width].each do |att|
      # String#[] with a capture index returns nil (rather than raising) when
      # the style has no "<att>: <number>px" declaration.
      val = style[/#{att}\s*:\s*([\d.]+)px/, 1]
      img.set_attribute(att, val) unless val.nil?
    end
  end
end
# Adds a colgroup that includes col tags with a relative width attribute, to all tables. Necessary in order to be parsed by Pandoc.
def add_colgroup_to_tables
  # Wrap every <table> node first, then ask each wrapper to insert its
  # <colgroup>; the wrappers are all built before any table is modified.
  wrapped_tables = @doc.css("table").map { |table_node| GdocTable.new(table_node) }
  wrapped_tables.each { |gdoc_table| gdoc_table.prepend_colgroup }
end
end

# Class to simplify dealing with HTML tables
class HtmlTable
  # table - the table node handed to the constructor
  # index - Hash mapping [row, column] => <td> node
  # size  - [number of <tr> rows, cell count of the widest row]
  attr_accessor :table, :index, :size

  # html_table: a node responding to `search` (e.g. a Nokogiri <table> node).
  # The cell index is built immediately.
  def initialize(html_table)
    @table = html_table
    table_cells_index
  end

  # Walks every <tr> under the table and every <td> under each row, recording
  # each cell under its zero-based [row, column] position. Sets @index and
  # @size and returns the index hash.
  def table_cells_index
    cells = {}
    row_count = 0
    widest_row = 0
    @table.search("tr").each_with_index do |tr, i|
      cells_in_row = 0
      tr.search("td").each_with_index do |td, j|
        cells[[i, j]] = td
        cells_in_row = j + 1
      end
      widest_row = cells_in_row if cells_in_row > widest_row
      row_count = i + 1
    end
    @size = [row_count, widest_row]
    @index = cells
  end

  # Hash mapping [row, column] => the cell's `class` attribute value.
  # A cell without a class attribute maps to nil instead of raising.
  def css_classes_index
    @css_classes_index ||= @index.map { |position, cell| [position, cell['class']] }.to_h
  end

  # Distinct class attribute values used by the table's cells
  # (cells without a class are ignored).
  def css_classes
    @css_classes ||= css_classes_index.values.compact.uniq
  end

  # Converts a hash keyed by [row, column] into a nested array indexed as
  # out[row][column]. Positions missing from the hash are left as nil.
  def self.h_to_a(h)
    out = []
    h.each do |(i, j), v|
      out[i] ||= []
      out[i][j] = v
    end
    out
  end
end

# More specialized HTML tables class to deal with how Google-Docs formats tables
class GdocTable < HtmlTable
  attr_reader :parsed_style_sheet

  # html_table: the <table> node to wrap. The stylesheet used for width
  # lookups is taken from the $parsed_style_sheet global.
  def initialize(html_table)
    super
    @parsed_style_sheet = $parsed_style_sheet
  end

  # Hash mapping each cell class => the width declared for ".<class>" in the
  # stylesheet (the numeric part as a String), or 0 when the selector is
  # unknown or declares no width.
  def width_by_class
    @width_by_class ||= css_classes.map do |c|
      # The lookbehind keeps `border-*-width:` / `max-width:` declarations
      # from being mistaken for the cell's own `width:`.
      # NOTE(review): px and pt values are treated alike, as before.
      width_regex = /(?<![-\w])width:\s*([\d.]{2,})(px|pt)/
      rule_set = parsed_style_sheet.find_by_selector(".#{c}").first
      # rule_set is nil for an unknown selector and the capture is nil when
      # the regex does not match; both cases fall back to 0.
      width = (rule_set && rule_set[width_regex, 1]) || 0
      [c, width]
    end.to_h
  end

  # Hash mapping [row, column] => width looked up via the cell's class
  # (nil when the cell's class has no entry in width_by_class).
  def width_by_cell
    @width_by_cell ||= self.css_classes_index.map do |k, v|
      [k, width_by_class[v]]
    end.to_h
  end

  # Array with one Float per column: the largest width among that column's
  # cells. Rows shorter than the widest row (or rows without any <td>)
  # contribute 0.0, so ragged tables no longer make `transpose` raise.
  def width_by_column
    @width_by_column ||= begin
      _row_count, column_count = size
      grid = HtmlTable::h_to_a(width_by_cell)
      (0...column_count).map do |j|
        grid.map { |row| (row && row[j]).to_f }.max
      end
    end
  end

  # Array with each column's share of the total width (fractions summing
  # to 1). When no width could be determined at all, the columns share the
  # space evenly instead of producing NaN from a division by zero.
  def relative_width_by_column
    @relative_width_by_column ||= begin
      widths = width_by_column
      total = widths.sum
      if total > 0
        widths.map { |w| w / total }
      else
        widths.map { 1.0 / widths.size }
      end
    end
  end

  # Returns the "<colgroup>...</colgroup>" markup with one <col> per column,
  # its width given as a percentage with two decimals.
  def colgroup_statement
    cols = relative_width_by_column.map do |w|
      "<col width=\"#{sprintf('%.2f', w*100)}%\" />"
    end
    "<colgroup>" + cols.join + "</colgroup>"
  end

  # Inserts the given markup at the very beginning of the table's content.
  def prepend(str)
    @table.inner_html = str + @table.inner_html
  end

  # Inserts the generated <colgroup> at the beginning of the table.
  def prepend_colgroup
    prepend(self.colgroup_statement)
  end
end