#!/usr/bin/env ruby # = sisu - SiSU information Structuring Universe # # Copyright (c) Ralph Amissah 1997,2004 # # Ralph Amissah mailto:ralph@amissah.com # # * Name: SiSU information Structuring Universe # * Author: Ralph@Amissah.com # * Description: document conversion tool, to sisu from other formats # * License: GPL 3 or later # * Notes: word conversion uses wvWare and wvSiSU.xml (a modified/stripped wvHtml.xml) # * http://wvware.sourceforge.net/ # * http://sourceforge.net/projects/wvware # * |sisu.lnk|@|^| # * module CONVERT class MyOutput def initialize(data, filename, instruct) @data=data.compact @filename=filename @instruct=instruct end def headerBasic <\n" #: <<#{@@html_title}>> data=WareWord97.new(data.collect,@filename,@instruct).strip data=WareWord97.new(data.collect,@filename,@instruct).strip data=WareWord97.new(data.collect,@filename,@instruct).markup_rules data=MyOutput.new(data.collect,@filename,@instruct).hardOutput end def strip data=@data tuned_file=Array.new endnote_no=1 data.each do |para| para.strip! 
# ---------------------------------------------------------------------------
# NOTE(review): reading guide. This comment sits at the first physical line
# break that does not fall inside a string, regex literal or expression; the
# code that follows continues the `data.each` loop of the `markup_rules`
# method opened just above.
#
# State of the text
#   * Statements that belong on separate lines are run together, so a `#`
#     comment swallows every statement after it on the same physical line.
#     The file cannot run in this form.
#   * Some regex literals are empty (`//i`) or broken across physical lines
#     (`(?:` ... `|zZz)`). Presumably markup inside those patterns was lost;
#     restore from a pristine copy rather than guessing - TODO confirm.
#   * `def headerBasic` is followed directly by the tail of a print string and
#     by `WareWord97.new` calls, yet no `class WareWord97` header and no
#     `hardOutput` definition are visible although both are used. Presumably
#     they were lost with the same damage - TODO confirm.
#
# Structure (module CONVERT), as far as it is visible
#   MyOutput   - stores `data.compact`, the file name and the instruction.
#   WareWord97 - `strip` trims each paragraph and deletes closing u/b/i tags;
#                `markup_rules` rewrites paragraphs that start with Chapter,
#                Article, Part, Section, Book or a number and run up to a
#                closing </b> into heading markers such as `1~`, `:C~`, `2~`.
#   Html       - `songsheet` runs space_paragraphs -> multiline ->
#                markup_rules and hands the result to MyOutput#hardOutput.
#                'zZz' is a temporary paragraph-break marker: space_paragraphs
#                inserts it and multiline turns runs of it into "\n\n".
#   Default    - subclass of Html; its `songsheet` runs the same three steps
#                plus `markup_default` before MyOutput#hardOutput.
# Top level: help, do_word, do_html, do_default, cases, then a script body
# that takes the instruction from the first command-line argument and calls
# `cases`.
#
# Things to check when the file is repaired
#   * Html#blockquotes (its call in `multiline` is commented out) appends the
#     result of `gsub!`, which is nil when nothing was replaced; that fits the
#     "some blockquotes go missing" remark in its own comment.
#   * `@@flag_blockquote` is a class variable, so the "inside a blockquote"
#     state is shared by all Html and Default instances.
#   * do_html leaves `file_end` nil when neither .html nor .htm exists.
#   * do_word interpolates the file name into a shell command unquoted.
# ---------------------------------------------------------------------------
para.gsub!(/\s+/,' ') para.gsub!(/^(Chapter|Article)(.+?)<\/b>/i,'1~ \1 \2') #watch case insensitivity para.gsub!(/^(Part|Section|Book)(.+?)<\/b>/i,':C~ \1 \2') #watch case insensitivity para.gsub!(/^(\d+\.\d+\.\d+\.?)(.+?)<\/b>/i,'3~ \1 \2') #numeric, decide what to do, can be different para.gsub!(/^(\d+\.\d+\.?)(.+?)<\/b>/i,'2~ \1 \2') #numeric, decide what to do, can be different para.gsub!(/^(\d.+?)<\/b>/i,'4~ \1 \2') #numeric, decide what to do, can be different #para.gsub!(/^([\d.]+?)<\/b>/i,'4~ \1 \2') #numeric, decide what to do, can be different para.gsub!(/(.+?)<\/u>/,'_{\1}_') para.gsub!(/(.+?)<\/b>/,'!{\1}!') para.gsub!(/(.+?)<\/i>/,'/{\1}/') tuned_file << para unless para == nil end tuned_file end end class Html def initialize(data, filename, instruct) @data=data @filename=filename @instruct=instruct end def songsheet data=@data print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.sst >\n" #: <<#{@@html_title}>> #data=Html.new(data.collect, @filename, @instruct).space_paragraphs #data=Html.new(data.split(''), @filename, @instruct).space_paragraphs data=Html.new(data.join.split(/\n\n+/), @filename, @instruct).space_paragraphs #data=Html.new(data.split("\n"), @filename, @instruct).space_paragraphs #data=Html.new(data.collect.join.split("\n"), @filename, @instruct).space_paragraphs data=Html.new(data.collect, @filename, @instruct).multiline data=Html.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules data=MyOutput.new(data.collect, @filename, @instruct).hardOutput end def space_paragraphs #data=@data.join.split(/\n/) data=@data #p data.length tuned_file=Array.new data.each do |para| para.strip! para.gsub!(/\r/,'') #para.gsub!(/\n/, ' ') #PROBLEM, serious time issues on a few files also for \n (or multiline matches which is less surprising), edit out if necessary para.gsub!(/<\/?p>/i,'zZz') para.gsub!(/<\/?\s*p(?:\s+ALIGN=.+?)?>/i,'zZz') #all manner of

para.gsub!(/<\/?p>/i, "\n\n") para.gsub!(//i,'zZz') # para.gsub!(/<\/p>/i,'zZz') # repeat actually para.gsub!(/<(?:dir|tr|br)>/i,'zZz') # #para.gsub!(/<(?:\/\s*)?(?:dir|tr|br)>/i, "zZz") # para.gsub!(/(<\/center>)/i,'\1zZz') para.gsub!(/(<\/h[1-6]>)/i,'\1zZz') para.gsub!(/ \s+/i,' ') para.gsub!(/(?:\s*zZz\s*)+/i,'zZz') # tuned_file << para unless para == nil end tuned_file end def blockquotes(sub='') # SERIOUS PROBLEM INTRODUCED, some blockquotes go missing !, quite unacceptable, debug, for now not used res=Array.new sub.each do |x| if x=~/(<\/blockquote>)/i m = $1 res << x[/(.+?)#{m}/mi,1].gsub!(/zZz/,'zZz_1 ') if x =~/.+?#{m}/mi res << x[/#{m}(.+)/mi,1] else res << x #[/(.+)/mi,1] end end res.join end def multiline data=@data tuned_file=Array.new data.each do |para| para.gsub!(/\n/,' ') para.gsub!(/ \s+/mi,' ') #ALL HERE could be very time EXPENSIVE but tamed? compromise ... /mi para.gsub!(/<([biu]|h[1-6])>(?:zZz)?([^<]+)?zZz(.+?)<\/\1>/i,'zZz<\1>\2 \3') para.gsub!(/<([biu]|h[1-6])>(?:

|zZz)+(.+?)(?:<\/center>)?zZz(.+?)?<\/\1>/i,'zZz<\1>\2 \3') #para.gsub!(/<([biu]|h[1-6])>(?:
|zZz)+(.+?)<\/center>zZz(.+?)?<\/\1>/i,'zZz<\1>\2 \3') para.gsub!(/<([biu]|h[1-6])>(?:
|zZz)+(.+?)<\/\1>/i,'zZz<\1>\2') para.gsub!(/<(h[1-6])>(.+?)(?:
|zZz)+<\/\1>/i,'zZz<\1>\2zZz') #does catch some h1, h2 etc, too expensive to have biu #para.gsub!(/<([biu]|h[1-6])>(.+?)(?:
|zZz)+<\/\1>/i,'zZz<\1>\2 \3') #may go too far? useful for h1 h2 etc, remove biu? #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i,'zZz<\1>\2 \3') #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i,'zZz<\1>\2 \3') ### SERIOUS PROBLEM INTRODUCED # sub = para.split(/
/i) # para = blockquotes(sub) if sub.length > 0 #check was on >1 could have serious repercussions 2004w29 para.gsub!(/zZz(\s*zZz)*/,"\n\n") tuned_file << para << "\n\n" unless para == nil end tuned_file end def markup_rules @@flag_blockquote=false data=@data tuned_file=Array.new data.each do |para| if para=~//i #p para.grep(//i) #m=$1 #para.gsub!(/(?:<\s*)?#{m}<\/a>(?:\s*>)?\.?/i, "#{m}") para.gsub!(/(?:<\s*)?http:\/\/.+?<\/a>(?:\s*>)?\.?/i,'\1') #risk that url & url are not to match #para.gsub!(/(?:<\s*)?(http:\/\/.+?\/\1)<\/a>(?:\s*>)?\.?/i, "\\2") #does not match end if para=~/
/i @@flag_blockquote=true end if @@flag_blockquote para.gsub!(/^/,'_1 ') unless para.empty? or para =~/^\s*<\/?blockquote?>\s*$/i end if para=~/<\/BLOCKQUOTE>/i @@flag_blockquote=false end para.gsub!(/<\/?blockquote?>/i,'') ### clean para.gsub!(/^\s+/i,'') para.gsub!(/<([bui]|em|su[pb])>\s*<\/\1>/i,'') para.gsub!(/<\/?center>/i,'') para.gsub!(/\s*<\/dir>/i,'') para.gsub!(/
/i,'') para.gsub!(/\s*
\[(\*+)\]<\/a>/i,'^{[\1]}^ ') #other endnote marker para.gsub!(/\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i,'~^ ') #endnote marker para.gsub!(/\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i,'~^ ') #endnote marker para.gsub!(/\s*(<\/a>)?\s*\d+\.?\s*(<\/a>)?\s*/i,'^~ ') #endnote #para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i,'\1~ \2') # para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i,'\1~ \2') # para.gsub!(/^(Chapter|Article)(.+?)<\/b>/i,'4~ \1 \2') #watch case insensitivity para.gsub!(/^(Part|Section|Book)(.+?)<\/b>/i,'3~ \1 \2') #watch case insensitivity para.gsub!(/^(\d+\.\d+\.\d+\.?)(.+?)<\/b>/i,'6~ \1 \2') #numeric, decide what to do, can be different para.gsub!(/^(\d+\.\d+\.?)(.+?)<\/b>/i,'5~ \1 \2') #numeric, decide what to do, can be different para.gsub!(/^(\d+\.?)(.+?)<\/b>/i,'4~ \1 \2') #numeric, decide what to do, can be different # para.gsub!(/^()(?:)?<(?:b|strong)>\s*(.+?)\s*<\/(?:b|strong)>/i,'5~ \2 \1') #watch para.gsub!(/^(<(a name|A NAME)=".+?">)(\s*|<\/[aA]>)?([A-Z][A-Z])+/,'5~ \2 \1') #watch para.gsub!(/^(\s+|

)?()(\s*|<\/a>)?/i,'5~ \2 \1') #watch para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i,'\1~ \2') # para.gsub!(/^\s*(.+?)<\/b>\s*(<\/i>\s*)?$/i,'4~ \1\2') # wish it all were less messy para.gsub!(/^\s*([^"(].+?)<\/i>\s*(<\/b>\s*)?$/i,'5~ \1\2') # wish it all were less messy para.gsub!(/<\/?[biu]>/i,'') if para =~/[1-6]\{/ para.gsub!(/\s*(.+?)\s*<\/u>/i,'_{\1}_') para.gsub!(/<(b|strong)>\s*(.+?)\s*<\/\1>/i,'*{\2}*') para.gsub!(/<(i|em)>\s*(.+?)\s*<\/\1>/i,'/{\2}/') para.gsub!(/\s*(.+?)\s*<\/sup>/i,'^{\1}^') para.gsub!(/(([\/\*!_])\{.+?\}\2)\s\s+/i,'\1 ') para.gsub!(/(([\/\*!_])\{.+?\}\2)\s+([.,;?\)])\s+/i,'\1\3 ') para.gsub!(/(([\/\*!_])\{.+?\}\2)(["'])\s+/i,'\1\3 ') para.gsub!(/(([\/\*!_])\{.+?\}\2)\s*([a-z0-9])/i,'\1 \3') para.gsub!(/(([\/\*_])\{.+?\}\2)\s*([a-z0-9])/i,'\1 \3') para.gsub!(/([a-z0-9])(([\/\*_])\{.+?\}\3)/i,' \1 \2') #eg this/{problem}/ para.gsub!(/([\/\*_])\{([,.;; ]+)\}\1/i,'\2') #eg /{,}/ or *{ }* etc. para.gsub!(/ \s+/i,' ') #para.gsub!(/\/\{\*\{/i, '*{/{') #para.gsub!(/\}\*\}\//i, '}/}*') para.gsub!(/"/i,'"') para.gsub!(/&/i,'and') para.gsub!(//i,'') para.gsub!(/<\/(?:title)>/i,'') para.gsub!(//i,'#{~title? ') para.gsub!(/<blockquote>(.+?)<\/blockquote>/mi,"\n\n_1 \\1\n\n") para.gsub!(/<div align=.+?>|<\/div>|<font size=.+?>|<\/a><\/em><\/strong>/i,'') para.gsub!(/~^\s+\.\s*/i,'.~^ ') #check vim equiv # %s/\~e\s\+\.\s*/.\~e /c para.gsub!(/\s+~^\s+/i,'~^ ') para.gsub!(/ \s+/i,' ') para.gsub!(/\s+$/i,'') para.gsub!(/^(?:<\/[bi]>)+$/i,'') para.gsub!(/^(?:(?:<i>)+<b>|(?:<b>)+<i>)\s*([^"(].+?)/i,'5~ \1\2') # wish it all were less messy para.gsub!(/^(?:<\/?(?:[ib]|em)>\s*)+$/i,'') # cleaning up left over <i> etc. 
# NOTE(review): the rest of Html#markup_rules uses up leftover <i>/<b> tags,
# maps <dd> items to heading markers, tidies anchors and URLs, and then runs a
# long list of single-character substitutions. As visible here most of those
# replace a character with itself (no effect); presumably the patterns were
# HTML entity names originally - TODO confirm against a pristine copy.
para.gsub!(/<(?:i|em)>\s*(.+)/i,'/{\1}/') # using up left over <i> para.gsub!(/<b>\s*(.+)/i,'*{\1}*') # using up left over <b> para.gsub!(/<dd>([\d.]+)/i,'5~ \1') para.gsub!(/<dd>(?: )+([\d.]+)/i,'6~ \1') para.gsub!(/<dd>(\([a-z]\))/i,'7~ \1') para.gsub!(/^([1-9]~)( .+?)<a name="(\S+?)">(.+?)(<\/a>)/i,'\1\3\2\4') para.gsub!(/^([1-9]~)( .+?)<a name="(\S+?)">/i,'\1\3\2') para.gsub!(/http\/\/(\S+)/i,'http:\/\/\1') para.gsub!(/\s*<a href="\S+?">(http:\/\/\S+?)<\/a>\s*/i,' \1 ') para.gsub!(/([a-zA-Z.,!?;:])([*\/_-]\{)/,'\1 \2') para.gsub!(/^\s*( ){10,12}/i,'_2 ') para.gsub!(/^\s*( ){4,5}/i,'_1 ') para.gsub!(/ /,' ') #check ## glyphs & tildes para.gsub!(/¡/, '¡') #'Inverted exclamation para.gsub!(/¢/, '¢') #'Cent sign ¢ para.gsub!(/£/, '£') #'Pound sign £ para.gsub!(/¤/, '¤') #'General currency sign para.gsub!(/¥/, '¥') #'Yen sign ¥ para.gsub!(/¦/, '¦') #'Broken vertical bar para.gsub!(/§/, '§') #'Section sign § para.gsub!(/¨/, '¨') #'Umlaut para.gsub!(/©/, '©') #'Copyright © para.gsub!(/ª/, 'ª') #'Feminine ordinal ª para.gsub!(/«/, '«') #'Left angle quote « para.gsub!(/¬/, '¬') #'Not sign para.gsub!(/­/, '­') #'Soft hyphen para.gsub!(/®/, '®') #'Registered trademark ® para.gsub!(/¯/, '¯') #'Macron accent para.gsub!(/°/, '°') #'Degree sign ° para.gsub!(/&plusmin;/,'±') #'Plus or minus ± para.gsub!(/²/, '²') #'Superscript 2 ² para.gsub!(/³/, '³') #'Superscript 3 ³ para.gsub!(/´/, '') #'Acute accent para.gsub!(/µ/, 'µ') #'Micro sign (Greek mu) µ para.gsub!(/¶/, '¶') #'Paragraph sign ¶ para.gsub!(/·/, '·') #'Middle dot para.gsub!(/¸/, '¸') #'Cedilla para.gsub!(/¹/, '¹') #'Superscript 1 ¹ para.gsub!(/º/, 'º') #'Masculine ordinal º para.gsub!(/»/, '»') #'Right angle quote para.gsub!(/¼/, '¼') #'Fraction one quarter ¼ para.gsub!(/½/, '½') #'Fraction on half ½ para.gsub!(/¾/, '¾') #'Fraction three quarters ¾ para.gsub!(/¿/, '¿') #'Inverted question mark ¿ para.gsub!(/À/, 'À') #'Capital A, grave accent À para.gsub!(/Á/, 'Á') #'Capital A, acute accent Á para.gsub!(/Â/, 'Â') 
#'Capital A, circumflex accent  para.gsub!(/Ã/, 'Ã') #'Capital A, tilde à para.gsub!(/Ä/, 'Ä') #'Capital A, umlaut Ä para.gsub!(/Å/, 'Å') #'Capital A, ring Å para.gsub!(/Æ/, 'Æ') #'Capital AE ligature Æ para.gsub!(/Ç/, 'Ç') #'Capital C, cedilla Ç para.gsub!(/È/, 'È') #'Capital E, grave accent È para.gsub!(/É/, 'É') #'Capital E, acute accent É para.gsub!(/Ê/, 'Ê') #'Capital E, circumflex accent Ê para.gsub!(/Ë/, 'Ë') #'Capital E, umlaut Ë para.gsub!(/Ì/, 'Ì') #'Capital I, grave accent Ì para.gsub!(/Í/, 'Í') #'Capital I, acute accent Í para.gsub!(/Î/, 'Î') #'Capital I, circumflex accent Î para.gsub!(/Ï/, 'Ï') #'Capital I, umlaut Ï para.gsub!(/Ð/, 'Ð') #'Capital eth, Icelandic para.gsub!(/Ñ/, 'Ñ') #'Capital N, tilde Ñ para.gsub!(/Ò/, 'Ò') #'Capital O, grave accent Ò para.gsub!(/Ó/, 'Ó') #'Capital O, acute accent Ó para.gsub!(/Ô/, 'Ô') #'Capital O, circumflex accent Ô para.gsub!(/Õ/, 'Õ') #'Capital O, tilde Õ para.gsub!(/Ö/, 'Ö') #'Capital O, umlaut Ö para.gsub!(/×/, '×') #'Multiply sign × para.gsub!(/Ø/, 'Ø') #'Capital O, slash Ø para.gsub!(/Ù/, 'Ù') #'Capital U, grave accent Ù para.gsub!(/Ú/, 'Ú') #'Capital U, acute accent Ú para.gsub!(/Û/, 'Û') #'Capital U, circumflex accent Û para.gsub!(/Ü/, 'Ü') #'Capital U, umlaut Ü para.gsub!(/Ý/, 'Ý') #'Capital Y, acute accent Ý para.gsub!(/Þ/, 'Þ') #'Capital thorn, Icelandic Þ para.gsub!(/ß/, 'ß') #'Small sz ligature, German ß para.gsub!(/à/, 'à') #'Small a, grave accent à para.gsub!(/á/, 'á') #'Small a, acute accent á para.gsub!(/â/, 'â') #'Small a, circumflex accent â para.gsub!(/ã/, 'ã') #'Small a, tilde ã para.gsub!(/ä/, 'ä') #'Small a, umlaut ä para.gsub!(/å/, 'å') #'Small a, ring å para.gsub!(/æ/, 'æ') #'Small ae ligature æ para.gsub!(/ç/, 'ç') #'Small c, cedilla ç para.gsub!(/è/, 'è') #'Small e, grave accent è para.gsub!(/é/, 'é') #'Small e, acute accent é para.gsub!(/ê/, 'ê') #'Small e, circumflex accent ê para.gsub!(/ë/, 'ë') #'Small e, umlaut ë para.gsub!(/ì/, 'ì') #'Small i, grave accent ì para.gsub!(/í/, 'í') 
#'Small i, acute accent í para.gsub!(/î/, 'î') #'Small i, circumflex accent î para.gsub!(/ï/, 'ï') #'Small i, umlaut ï para.gsub!(/ð/, 'ð') #'Small eth, Icelandic ð para.gsub!(/ñ/, 'ñ') #'Small n, tilde ñ para.gsub!(/ò/, 'ò') #'Small o, grave accent ò para.gsub!(/ó/, 'ó') #'Small o, acute accent ó para.gsub!(/ô/, 'ô') #'Small o, circumflex accent ô para.gsub!(/õ/, 'õ') #'Small o, tilde õ para.gsub!(/ö/, 'ö') #'Small o, umlaut ö para.gsub!(/÷/, '÷') #'Divide sign ÷ para.gsub!(/ø/, 'ø') #'Small o, slash ø para.gsub!(/ù/, 'ù') #'Small u, grave accent ù para.gsub!(/ú/, 'ú') #'Small u, acute accent ú para.gsub!(/û/, 'û') #'Small u, circumflex accent û para.gsub!(/ü/, 'ü') #'Small u, umlaut ü para.gsub!(/ý/, 'ý') #'Small y, acute accent ý para.gsub!(/þ/, 'þ') #'Small thorn, Icelandic þ para.gsub!(/ÿ/, 'ÿ') #'Smally y, umlaut ÿ ## para.gsub!(/\s\s+/,' ') para.gsub!(/\t+/,' ') #para.gsub!(/ +/,' ') #para.gsub!(/^(?:<(?:\/)?[bi]>)+$/i, '') tuned_file << para unless para == nil end tuned_file end end class Default < Html def initialize(data, filename, instruct) @data=data @filename=filename @instruct=instruct end def songsheet data=@data print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.sst >\n" #: <<#{@@html_title}>> data=Default.new(data.collect, @filename, @instruct).space_paragraphs data=Default.new(data.collect, @filename, @instruct).multiline data=Default.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules data=Default.new(data.collect, @filename, @instruct).markup_default data=MyOutput.new(data.collect, @filename, @instruct).hardOutput end def markup_default data=@data tuned_file=Array.new data.each do |para| para.gsub!(/<i>(Id\.?)(\s|$)/i,'/\{\1\}\2/') para.gsub!(/^(~\{\{ .+?)(<\/LI>\s*|<\/OL>\s*)+$/i,'\1') para.gsub!(/\/\{Id\.\s*<\/LI>\s*\}\//i,'/{Id.}/') tuned_file << para unless para == nil end tuned_file end end end def help puts <<WOK conversion program initial SiSU markup from other file formats zxy_convert --word 
does initial conversion from word97 to sisu markup, expects [filename].doc (can also use --doc) zxy_convert --html does initial conversion from html to sisu markup, expects [filename].html zxy_convert --default does initial conversion from defalt html to sisu markup, expects [filename].html WOK end def do_word(argv, instruct) argv.each do |f| if f =~/.+?\.doc$/ @argv << f[/(.+?)\.doc$/, 1] else print "not .doc? << #{f} >> " end end @argv.each do |filename| system(%{wvWare -x #{@dir.path.home}/.sisu/convert/wvSiSU.xml #{filename}.doc > #{filename}.wv}) file_array=IO.readlines("#{filename}.wv", "") CONVERT::WareWord97.new(file_array, filename, instruct).songsheet # metaverse created here end end def do_html(argv, instruct) argv.each do |f| if f =~/.+?\.html?$/ @argv << f[/(.+?)\.html?$/, 1] else print "not .html? << #{f} >> " end end @argv.each do |filename| file_end=if FileTest.file?("#{filename}.html") 'html' elsif FileTest.file?("#{filename}.htm") 'htm' end file_array=IO.readlines("#{filename}.#{file_end}","\n\r") CONVERT::Html.new(file_array,filename,instruct).songsheet # metaverse created here end end def do_default(argv, instruct) argv.each do |f| if f =~/.+?\.html$/ @argv << f[/(.+?)\.html$/, 1] else print "not .html? << #{f} >> " end end @argv.each do |filename| file_array=IO.readlines("#{filename}.html", "\n\r") CONVERT::Default.new(file_array, filename, instruct).songsheet # metaverse created here end end def cases(argv, instruct) case instruct when/^--(word(97)?|doc)$/i #creates minimal sisu_small.gz package to send do_word(argv, instruct) when/^--(html)$/i #creates sisu.gz package to send do_html(argv, instruct) when/^--(default)$/i #creates sisu.gz package to send do_default(argv, instruct) else help end end $KCODE='u' branch='v2' @argv=Array.new argv=$* SiSU_version_dir=(argv.inspect=~/--v1/) ? 
'v1' : 'v2' SiSU_lib="sisu/#{SiSU_version_dir}" require "#{SiSU_lib}/sysenv" include SiSU_Env @dir=SiSU_Env::Info_env.new instruct = "#{argv[0].to_s}" argv.shift instruct.chomp! instruct = "help" if instruct.nil? or instruct == ""; cases(argv, instruct)