1#!/usr/local/bin/ruby
2
3#
4#       HTML to LaTeX converter
5#         by A. Ito, 16 June, 1997
6#
7
8require 'kconv'
9
10# configuration
# Converts an image file to EPS by running the external `convert` program.
#
# giffile - path of the source image.
# epsfile - path of the EPS file to create.
#
# The command line is echoed to STDERR before it runs.  The program is
# started with an argument vector rather than a shell command string, so
# file names containing spaces or shell metacharacters (they come straight
# from the SRC attribute of the HTML input) are passed through verbatim and
# can never be interpreted by a shell.
#
# Returns whatever Kernel#system returns (true on success, false on a
# non-zero exit status, nil if the command could not be executed).
def gif2eps(giffile, epsfile)
  argv = ["convert", giffile, epsfile]
  STDERR.print argv.join(" "), "\n"
  system(*argv)
end
16
17###########################################################################
# A single HTML tag: its lower-cased name plus its attributes.
#
# Attribute names are lower-cased.  An attribute written with a value
# (NAME=value, NAME="value" or NAME='value') maps to that value as a String
# with the surrounding quotes removed; an attribute written without a value
# (e.g. BORDER) maps to true.
class Tag
  # One attribute: a name, optionally followed by "=" and a double-quoted,
  # single-quoted or bare value.  Matching on the whole attribute text
  # (instead of splitting at whitespace first) keeps quoted values that
  # contain spaces in one piece and accepts spaces around "=".
  ATTRIBUTE = /([^\s=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S*)))?/

  # The lower-cased tag name, e.g. "img" or "/table".  It is "" when the
  # tag text contains no name at all (e.g. "< >").
  attr_reader :tagname

  # str - the tag text, normally including the enclosing "<" and ">".
  #       Text without the brackets is parsed as it stands.
  def initialize(str)
    str = $1 if str =~ /<(.+)>/
    name, attributes = str.strip.split(/\s+/, 2)
    @tagname = name.to_s.downcase
    @vals = {}
    attributes.to_s.scan(ATTRIBUTE) do |key, double_quoted, single_quoted, bare|
      value = double_quoted || single_quoted
      # A bare value may still carry a stray double quote (e.g. an
      # unterminated NAME="value); strip it from either end.
      value ||= bare.sub(/\A"/, "").sub(/"\z/, "") unless bare.nil?
      @vals[key.downcase] = value.nil? ? true : value
    end
  end

  # Yields every attribute as a (name, value) pair.
  def each
    @vals.each do |name, value|
      yield name, value
    end
  end

  # Returns the value of attribute k (a lower-case name): a String, true
  # for a value-less attribute, or nil when the attribute is absent.
  def switch(k)
    @vals[k]
  end
end
49
# Splits an HTML file into a stream of tokens.
#
# Input is read one line at a time and converted to EUC with Kconv.  #get
# returns, in document order:
#   * a Tag for every "<...>" construct (which may span several lines),
#   * a String for every character entity ("&...;"), already translated to
#     LaTeX where the entity is known,
#   * a String for every other run of text; runs consisting only of
#     whitespace are skipped.
class TokenStream
  TAG_START = "<"
  TAG_END = ">"
  AMP_START = "&"
  AMP_END = ";"

  # LaTeX spelling of the known character entities.  These strings are
  # later passed through Environment#flush, which escapes "$" and "~", so
  # the replacements avoid both characters: \( \) delimit math mode and
  # "\nobreak\ " is what LaTeX's "~" stands for.
  AMP_REPLACE_TABLE = {
    '&amp;'   => '\\&',
    '&gt;'    => '\\(>\\)',
    '&lt;'    => '\\(<\\)',
    '&nbsp;'  => '\\nobreak\\ ',
    '&quot;'  => '"',
  }

  # file - an already opened stream (anything that responds to gets and
  #        eof?, e.g. a File) or the path of the file to open.
  def initialize(file)
    @f = file.respond_to?(:gets) ? file : File.new(file)
    @buf = nil   # current input line, or nil before the first read
    @bpos = 0    # index of the next unread character in @buf
  end

  # Consumes input from the current position up to and including the first
  # occurrence of endsym, reading further lines when necessary.
  #
  # Returns the consumed text with every CR and LF replaced by a space.  If
  # the input ends before endsym is found, the text read so far is returned.
  def read_until(endsym)
    pieces = []
    loop do
      if @buf && @bpos < @buf.size
        stop = @buf.index(endsym, @bpos)
        if stop
          pieces << @buf[@bpos..stop]
          @bpos = stop + 1
          break
        end
        pieces << @buf[@bpos..-1]
      end
      break unless fill_buffer
    end
    pieces.join.tr("\r\n", "  ")
  end

  # Returns the next token (a Tag or a String), or nil once the input is
  # exhausted.
  def get
    loop do
      if @buf.nil? || @bpos >= @buf.size
        return nil unless fill_buffer
      end
      case @buf[@bpos, 1]
      when TAG_START
        return Tag.new(read_until(TAG_END))
      when AMP_START
        return replace_amp(read_until(AMP_END))
      end
      # Plain text: everything up to the next tag or entity on this line.
      stop = @buf.index(/[<&]/, @bpos) || @buf.size
      text = @buf[@bpos, stop - @bpos]
      @bpos = stop
      next if text =~ /^\s+$/
      return text
    end
  end

  # True once the file is at end-of-file AND the buffered line has been
  # consumed.  Looking at the file alone would report the end while the
  # last line is still waiting in the buffer, so its tokens would be lost.
  def eof?
    (@buf.nil? || @bpos >= @buf.size) && @f.eof?
  end

  # Translates a character entity such as "&amp;" to LaTeX.
  #
  # s - the entity text including the leading "&".
  #
  # Known entities are looked up in AMP_REPLACE_TABLE.  Anything else is
  # returned with its leading "&" escaped as "\&", so that it appears
  # literally in the output instead of acting as a LaTeX alignment tab.
  def replace_amp(s)
    return AMP_REPLACE_TABLE[s] if AMP_REPLACE_TABLE.key?(s)
    # Block form: in a replacement *string* "\&" would mean "the match".
    s.sub(/\A&/) { "\\&" }
  end

  private

  # Loads the next input line, converted to EUC, into the buffer and resets
  # the read position.  Returns false (leaving the buffer empty) at end of
  # input, true otherwise.
  def fill_buffer
    line = @f.gets
    @buf = line.nil? ? nil : Kconv.toeuc(line)
    @bpos = 0
    !@buf.nil?
  end
end
142
143
# Writes the fixed LaTeX preamble to standard output: the document style
# line plus the definitions of \hr, the pre environment and the \gt
# fallback that the generated document refers to.  The text starts with an
# empty line.
def print_header
  preamble = <<'PREAMBLE'

\documentstyle[epsf]{jarticle}
\def\hr{\par\hbox to \textwidth{\hrulefill}}
\def\pre{\begin{quote}\def\baselinestretch{0.8}\tt\obeylines}
\def\endpre{\end{quote}}
\makeatletter
\@ifundefined{gt}{\let\gt=\dg}{}
\makeatother
PREAMBLE
  print preamble
end
155
156
# A stack of environments.  Tag lookups start at the innermost (most
# recently pushed) environment and fall back to the outer ones.
class Environ_stack
  # envs - the initial environments, outermost first.
  def initialize(*envs)
    @stack = envs
  end

  # Looks up the action for a tag name.
  #
  # tag - the lower-cased tag name.
  #
  # Names starting with "!" (comments and declarations) always yield the
  # do-nothing action ["", nil].  Otherwise the environments are asked from
  # the top of the stack downwards and the first non-nil answer wins.
  # Returns nil when no environment knows the tag.
  def action(tag)
    return ["", nil] if tag =~ /^!/ # comment
    @stack.reverse_each do |env|
      found = env.action(tag)
      return found unless found.nil?
    end
    nil
  end

  # Removes and returns the top environment.  The bottom environment is
  # never removed (nil is returned instead): an unmatched closing tag in
  # the input must not leave the stack empty, because callers go on using
  # #top afterwards.
  def pop
    @stack.pop if @stack.size > 1
  end

  # Makes env the new top environment.
  def push(env)
    @stack.push(env)
  end

  # Returns the current (innermost) environment.
  def top
    @stack.last
  end

  # Pushes a clone of the top environment.  Note that this deliberately
  # replaces Object#dup: it grows the stack instead of copying it.
  def dup
    @stack.push(top.clone)
  end
end
188
189
# One output environment: the table of tag actions that is currently in
# force plus the output state (muted output, table being collected,
# paragraph alignment) and the methods that tag actions refer to by name.
#
# Output goes to standard output, except while a table is being collected:
# then it is appended to the current cell and written by #endtable.
class Environment
  # LaTeX spelling of the characters that are escaped in running text.
  # "&" is deliberately absent: the tokenizer never leaves one in plain
  # text and hands entities over already escaped (e.g. "\&"), and the
  # earlier gsub(/&/, "\\&") changed nothing, because "\&" in a
  # replacement string stands for the matched text itself.
  ESCAPES = {
    '%' => '\\%',
    '#' => '\\#',
    '$' => '\\$',
    '_' => '\\verb+_+',
    '^' => '\\verb+^+',
    '~' => '\\verb+~+',
  }
  ESCAPE_PATTERN = /[%\#\$_\^~]/

  # LaTeX environment for each supported value of <P ALIGN=...>.
  ALIGN_ENVIRONMENTS = {
    'left'   => 'flushleft',
    'center' => 'center',
    'right'  => 'flushright',
  }

  # interp - Hash mapping tag names to actions.
  def initialize(interp)
    @silent = false
    @in_table = false
    @interp = interp
    @align = nil
  end

  # Returns the action registered for the tag name, or nil.
  def action(tag)
    @interp[tag]
  end

  # Outputs a piece of text.
  #
  # tok - the text.  In a String the characters % # $ _ ^ ~ are escaped for
  #       LaTeX; any other object is output via to_s.
  #
  # Inside a table the text is appended to the current cell.  Otherwise it
  # is printed unless output is muted; inside an aligned paragraph a
  # trailing newline is turned into a LaTeX line break.
  def flush(tok)
    text = tok.kind_of?(String) ? escape(tok) : tok.to_s
    if !@in_table && !@silent && !@align.nil? && text =~ /\n$/
      print text.chop, "\\\\\n"
    else
      emit text
    end
  end

  # Replaces the table of tag actions.
  def set_interp(interp)
    @interp = interp
  end

  # tag processing methods

  # <TITLE>: mutes the output.
  def do_silent(tag)
    @silent = true
  end

  # </TITLE>: turns the output back on.
  def undo_silent(tag)
    @silent = false
  end

  # <IMG>: converts the image to EPS with gif2eps and includes the result.
  # The EPS file is named after the source with its extension replaced by
  # ".eps" (appended when there is no extension).  A tag without a SRC
  # value is ignored.
  def img_proc(tag)
    src = tag.switch('src')
    return unless src.kind_of?(String) && !src.empty?
    newfile = src.sub(/\.[^.\/\\]*\z/, "") + ".eps"
    gif2eps(src, newfile)
    # Emitted unescaped: the file name must reach LaTeX as it is.
    emit "\\epsfile{file=#{newfile}}\n"
  end

  # <TABLE>: starts collecting a table.  The presence of a BORDER
  # attribute selects ruled output.
  def starttable(tag)
    @table = []        # @table[row][col]: cell text, stored at the LAST column a cell covers
    @tablespan = []    # @tablespan[row][col]: width of the cell whose FIRST column is col
    @table_rows = -1
    @table_cols = -1
    @table_cols_max = 0
    @in_table = true
    @table_border = !tag.switch('border').nil?
  end

  # <TR>: starts a new, empty row.
  def start_row(tag)
    @table_rows += 1
    @table[@table_rows] = []
    @tablespan[@table_rows] = []
    @table_cols = -1
    @colspan = 1
  end

  # <TD>: starts a new cell covering COLSPAN columns (default 1).
  def start_col(tag)
    # Tolerate a cell that is not preceded by <TR>.
    start_row(tag) if @table_rows < 0
    colspan = tag.switch('colspan')
    # A width below 1 would keep #endtable from ever advancing.
    @colspan = colspan.nil? ? 1 : [colspan.to_i, 1].max
    @tablespan[@table_rows][@table_cols + 1] = @colspan
    @table_cols += @colspan
    # Start with an empty string so that text can be appended to the cell.
    @table[@table_rows][@table_cols] = ""
    @table_cols_max = @table_cols if @table_cols > @table_cols_max
  end

  # </TABLE>: writes the collected table as a tabular environment.
  def endtable(tag)
    @in_table = false
    head = "\\begin{tabular}{*{" + (@table_cols_max + 1).to_s
    head += @table_border ? "}{|l}|}\n\\hline\n" : "}{l}}\n"
    emit head
    (0..@table_rows).each do |row|
      # Cell text was escaped when it was stored; emit it as it is.
      emit table_row(row) + "\\\\\n"
      emit "\\hline\n" if @table_border
    end
    emit "\\end{tabular}\n"
  end

  # <CENTER>
  def startcenter(tag)
    emit(@in_table ? "\\hfil" : "\\begin{center}\n")
  end

  # </CENTER>
  def endcenter(tag)
    emit(@in_table ? "\\hfil" : "\\end{center}\n")
  end

  # <P>: starts a paragraph.  ALIGN=left/center/right (any letter case)
  # opens the matching LaTeX environment, which #endparagraph closes again;
  # without ALIGN, or with any other value, a plain \par is written.
  def paragraph(tag)
    align = tag.switch('align')
    align = align.kind_of?(String) ? align.downcase : nil
    environment = ALIGN_ENVIRONMENTS[align]
    if environment.nil?
      emit "\\par\n"
      @endparagraph = ""
      @align = nil
    else
      emit "\\begin{#{environment}}\n"
      @endparagraph = "\\end{#{environment}}\n"
      @align = align
    end
  end

  # </P>: closes the environment opened by an aligned <P>.
  def endparagraph(tag)
    unless @align.nil?
      @align = nil
      emit @endparagraph
    end
  end

  private

  # Returns text with the characters listed in ESCAPES replaced.
  def escape(text)
    text.gsub(ESCAPE_PATTERN, ESCAPES)
  end

  # Outputs LaTeX code exactly as given: into the current table cell while
  # a table is being collected, otherwise to standard output unless muted.
  def emit(text)
    if @in_table
      append_to_cell(text.to_s)
    elsif !@silent
      print text
    end
  end

  # Appends text to the cell opened by the latest <TD>.  Text that turns up
  # inside a table before any cell has been opened is dropped.
  def append_to_cell(text)
    return if @table_rows.nil? || @table_rows < 0
    return if @table_cols.nil? || @table_cols < 0
    cells = @table[@table_rows]
    cells[@table_cols] = cells[@table_cols].to_s + text
  end

  # Returns the LaTeX source of one table row (without the row terminator),
  # using that row's own width.
  def table_row(row)
    last_col = @table[row].size - 1
    parts = []
    col = 0
    while col <= last_col
      span = @tablespan[row][col] || 1
      body = @table[row][col + span - 1].to_s
      if span == 1
        parts << body
      else
        form = if !@table_border
                 "l"
               elsif col + span > last_col
                 "|l|"     # last cell of the row: close the right border
               else
                 "|l"
               end
        parts << "\\multicolumn{#{span}}{#{form}}{#{body}}"
      end
      col += span
    end
    parts.join("&")
  end
end
373
374
# Tag action tables.
#
# Each table maps a lower-cased tag name to [action, environment, document]:
#   action      - a String that is output for the tag, or a Symbol naming
#                 the Environment method that is called with the Tag.
#   environment - nil, a Hash (a new environment using that table is pushed
#                 before the action runs) or "pop" (the current environment
#                 is popped after the action has run).
#   document    - when true, \begin{document} is written first if the
#                 document body has not been opened yet.  May be omitted.
#
# Action strings are output through Environment#flush, which escapes the
# characters % # $ _ ^ ~.  They therefore must not contain any of them:
# math mode is written as \( \), and subscript/superscript as \sb / \sp.

# Inside <OL>
enum_interp = {
  'li' => ["\\item ", nil]
}

# Inside <UL>
item_interp = {
  'li' => ["\\item ", nil]
}

# Inside <DL>
desc_interp = {
  'dt' => ["\\item[", nil],
  'dd' => ["]\n", nil]
}

# Inside <TABLE>; header cells are collected like data cells.
table_interp = {
  'tr'  => [:start_row, nil],
  'td'  => [:start_col, nil],
  'th'  => [:start_col, nil],
  '/tr' => ["", nil],
  '/td' => ["", nil],
  '/th' => ["", nil],
}

# Inside an aligned <P>
para_interp = {
  '/p'      => [:endparagraph, "pop", true],
}

# Top level.  A font switch ends with a space so that it cannot run into
# the letters of the text that follows it.
main_interp = {
  'body'    => ["\\begin{document}\n", nil, false],
  '/body'   => ["\\end{document}\n", nil, false],
  'head'    => ["", nil, false],
  '/head'   => ["", nil, false],
  'html'    => ["", nil, false],
  '/html'   => ["", nil, false],
  'title'   => [:do_silent, nil, false],
  '/title'  => [:undo_silent, nil, false],
  '!'       => ["", nil, false],
  'h1'      => ["\\section{", nil, true],
  'h2'      => ["\\subsection{", nil, true],
  'h3'      => ["\\subsubsection{", nil, true],
  'h4'      => ["\\paragraph{", nil, true],
  '/h1'     => ["}\n", nil, true],
  '/h2'     => ["}\n", nil, true],
  '/h3'     => ["}\n", nil, true],
  '/h4'     => ["}\n", nil, true],
  'a'       => ["", nil, true],
  '/a'      => ["", nil, true],
  'center'  => [:startcenter, nil, true],
  '/center' => [:endcenter, nil, true],
  'ol'      => ["\\begin{enumerate}\n", enum_interp, true],
  '/ol'     => ["\\end{enumerate}\n", "pop", true],
  'ul'      => ["\\begin{itemize}\n", item_interp, true],
  '/ul'     => ["\\end{itemize}\n", "pop", true],
  'dl'      => ["\\begin{description}\n", desc_interp, true],
  '/dl'     => ["\\end{description}\n", "pop", true],
  'pre'     => ["\\begin{pre}\n", nil, true],
  '/pre'    => ["\\end{pre}\n", nil, true],
  'p'       => [:paragraph, para_interp, true],
  'br'      => ["\\par ", nil, true],
  'img'     => [:img_proc, nil, true],
  'hr'      => ["\\hr ", nil, true],
  'b'       => ["{\\bf\\gt ", nil, true],
  '/b'      => ["}", nil, true],
  'strong'  => ["{\\bf\\gt ", nil, true],
  '/strong' => ["}", nil, true],
  'dfn'     => ["{\\bf\\gt ", nil, true],
  '/dfn'    => ["}", nil, true],
  'i'       => ["{\\it ", nil, true],
  '/i'      => ["}", nil, true],
  'address' => ["{\\it ", nil, true],
  '/address'=> ["}", nil, true],
  'cite'    => ["{\\it ", nil, true],
  '/cite'   => ["}", nil, true],
  'code'    => ["{\\tt ", nil, true],
  '/code'   => ["}", nil, true],
  'kbd'     => ["{\\tt ", nil, true],
  '/kbd'    => ["}", nil, true],
  'tt'      => ["{\\tt ", nil, true],
  '/tt'     => ["}", nil, true],
  'samp'    => ["{\\tt ", nil, true],
  '/samp'   => ["}", nil, true],
  'em'      => ["{\\em ", nil, true],
  '/em'     => ["}", nil, true],
  # \underline works in text mode, so no math shift is needed.
  'u'       => ["\\underline{", nil, true],
  '/u'      => ["}", nil, true],
  'sub'     => ["\\({}\\sb{\\mbox{", nil, true],
  '/sub'    => ["}}\\)", nil, true],
  'sup'     => ["\\({}\\sp{\\mbox{", nil, true],
  '/sup'    => ["}}\\)", nil, true],
  'table'   => [:starttable, table_interp, true],
  '/table'  => [:endtable, "pop", true],
  'font'    => ["", nil, true],
  '/font'   => ["", nil, true],
}
466
467
468
469
470################################ MAIN ####################################
471
# Converts the HTML file named by the first command line argument and
# writes the LaTeX result to standard output.
if ARGV[0].nil?
  STDERR.print "usage: ", $0, " file.html\n"
  exit 1
end

# True while \begin{document} has been written and \end{document} has not.
$in_document = false
print_header
intp = Environ_stack.new(Environment.new(main_interp))
f = TokenStream.new(ARGV[0])
until f.eof?
  tok = f.get
  if tok.kind_of?(Tag)
    case tok.tagname
    when "body"
      $in_document = true
    when "/body"
      $in_document = false
    end
    act = intp.action(tok.tagname)
    if act.nil?
      STDERR.print "tag ", tok.tagname, " ignored\n"
    else
      # Open the document body on the first tag that produces body output.
      if act[2] && !$in_document
        print "\\begin{document}\n"
        $in_document = true
      end
      # environment push (a <P> opens one only when it carries ALIGN)
      if act[1].kind_of?(Hash) &&
         (tok.tagname != "p" || tok.switch('align') != nil)
        intp.dup
        intp.top.set_interp(act[1])
      end

      if act[0].kind_of?(String)
        intp.top.flush act[0]
      elsif act[0].kind_of?(Symbol)
        # A Symbol names the Environment method that handles the tag.
        intp.top.send(act[0], tok)
      end

      # environment pop
      if act[1] == "pop"
        intp.pop
      end
    end
  elsif !tok.nil?
    intp.top.flush tok
  end
end
# Close the document if the input never did (e.g. no </BODY>).
if $in_document
  print "\\end{document}\n"
end
518