aboutsummaryrefslogtreecommitdiffhomepage
path: root/data/sisu/conf/convert/sisu_convert
diff options
context:
space:
mode:
Diffstat (limited to 'data/sisu/conf/convert/sisu_convert')
-rw-r--r--data/sisu/conf/convert/sisu_convert392
1 files changed, 392 insertions, 0 deletions
diff --git a/data/sisu/conf/convert/sisu_convert b/data/sisu/conf/convert/sisu_convert
new file mode 100644
index 00000000..d7876083
--- /dev/null
+++ b/data/sisu/conf/convert/sisu_convert
@@ -0,0 +1,392 @@
+#!/usr/bin/env ruby
+# = sisu - SiSU information Structuring Universe
+#
+# Copyright (c) Ralph Amissah 1997,2004
+#
+# Ralph Amissah mailto:ralph@amissah.com
+#
+# * Name: SiSU information Structuring Universe
+# * Author: Ralph@Amissah.com
+# * Description: document conversion tool, to sisu from other formats
+# * arch-tag: document conversion tool to sisu markup
+# * $Date: 2004/10/16 15:51:06 $
+# * $Id: sisu_convert,v 1.37 2004/10/16 15:51:06 ralph Exp $
+# * License: GPL 2 or later
+# * Notes: word conversion uses wvWare and wvSiSU.xml (a modified/stripped wvHtml.xml)
+# * http://wvware.sourceforge.net/
+# * http://sourceforge.net/projects/wvware
+# * <url:sisu.lnk>|sisu.lnk|@|^|
+# * <url:sisu>
+# * <url:zxy_param.rb>|zxy_param.rb|@|^|
+module CONVERT
+ class MyOutput
+ def initialize(data, filename, instruct)
+ @data=data.compact
+ @filename=filename
+ @instruct=instruct
+ end
+ def headerBasic
+ <<WOK
+0~title
+
+0~subtitle
+
+0~creator
+
+0~type
+
+0~subject
+
+0~date
+
+0~date.available
+
+0~publisher SiSU
+
+0~rights
+
+0~level
+
+WOK
+ end
+ def headerDefault
+ <<WOK
+0~title
+
+0~subtitle
+
+0~creator
+
+0~type
+
+0~subject
+
+0~date
+
+0~date.available
+
+0~publisher SiSU
+
+0~rights ...
+
+WOK
+ end
+ def hardOutput
+ pre = Array.new
+ case @instruct
+ when /default/
+ pre << headerDefault
+ else
+ pre << headerBasic
+ end
+ @filename_wv=File.new(%{,,#{@filename}.er9}, "w+")
+ @filename_wv << pre
+ @data.each do |x|
+ y = x.split("\n")
+ y.each do |z| # cleaner output this way
+ z.strip!
+ @filename_wv.puts "#{z}\n\n" unless z =~/^$/
+ end
+ end
+ end
+ end
+ class WareWord97
+ def initialize(data, filename, instruct)
+ @data=data
+ @filename=filename
+ @instruct=instruct
+ end
+ def songsheet
+ data=@data
+ print "Convert to SiSU file from Word97 << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
+ data=WareWord97.new(data.collect, @filename, @instruct).strip
+ data=WareWord97.new(data.collect, @filename, @instruct).strip
+ data=WareWord97.new(data.collect, @filename, @instruct).markup_rules
+ data=MyOutput.new(data.collect, @filename, @instruct).hardOutput
+ end
+ def strip
+ data=@data
+ tuned_file=Array.new
+ endnote_no=1
+ data.each do |para|
+ para.strip!
+ para.gsub!(/<u>\s*<\/u>/, '')
+ para.gsub!(/<\/u>\s*<u>/, '')
+ para.gsub!(/<b>\s*<\/b>/, '')
+ para.gsub!(/<\/b>\s*<b>/, '')
+ para.gsub!(/<i>\s*<\/i>/, '')
+ para.gsub!(/<\/i>\s*<i>/, '')
+ tuned_file << para unless para == nil
+ end
+ tuned_file
+ end
+ def markup_rules
+ data=@data
+ tuned_file=Array.new
+ endnote_no=1
+ data.each do |para|
+ para.strip!
+ para.gsub!(/\s+/, ' ')
+ para.gsub!(/^<b>(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity
+ para.gsub!(/^<b>(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity
+ para.gsub!(/^<b>(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different
+ para.gsub!(/^<b>(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different
+ para.gsub!(/^<b>(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different
+ para.gsub!(/<u>(.+?)<\/u>/, "_{\\1}_")
+ para.gsub!(/<b>(.+?)<\/b>/, "*{\\1}*")
+ para.gsub!(/<i>(.+?)<\/i>/, "/{\\1}/")
+ tuned_file << para unless para == nil
+ end
+ tuned_file
+ end
+ end
+ class Html
+ def initialize(data, filename, instruct)
+ @data=data
+ @filename=filename
+ @instruct=instruct
+ end
+ def songsheet
+ data=@data
+ print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
+ #data=Html.new(data.collect, @filename, @instruct).space_paragraphs
+ #data=Html.new(data.split(''), @filename, @instruct).space_paragraphs
+ data=Html.new(data.split("\n"), @filename, @instruct).space_paragraphs
+ #data=Html.new(data.collect.join.split("\n"), @filename, @instruct).space_paragraphs
+ data=Html.new(data.collect, @filename, @instruct).multiline
+ data=Html.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules
+ data=MyOutput.new(data.collect, @filename, @instruct).hardOutput
+ end
+ def space_paragraphs
+ #data=@data.join.split(/\n/)
+ data=@data
+ #p data.length
+ tuned_file=Array.new
+ data.each do |para|
+ para.strip!
+ para.gsub!(/\r/, '')
+ #para.gsub!(/\n/, ' ') #PROBLEM, serious time issues on a few files also for \n (or multiline matches which is less surprising), edit out if necessary
+ para.gsub!(/<\/?p>/i, 'zZz')
+ para.gsub!(/<\/?\s*p(?:\s+ALIGN=.+?)?>/i, "zZz") #all manner of <p> para.gsub!(/<\/?p>/i, "\n\n")
+ para.gsub!(/<p\s+(class|align).+?>/i, "zZz") #
+ para.gsub!(/<\/p>/i, "zZz") # repeat actually
+ para.gsub!(/<(?:dir|tr|br)>/i, "zZz") #
+ #para.gsub!(/<(?:\/\s*)?(?:dir|tr|br)>/i, "zZz") #
+ para.gsub!(/(<\/center>)/i, "\\1zZz")
+ para.gsub!(/(<\/h[1-6]>)/i, "\\1zZz")
+ para.gsub!(/ \s+/i, ' ')
+ para.gsub!(/(?:\s*zZz\s*)+/i, "zZz") #
+ tuned_file << para unless para == nil
+ end
+ tuned_file
+ end
+ def blockquotes(sub='') # SERIOUS PROBLEM INTRODUCED, some blockquotes go missing !, quite unacceptable, debug, for now not used
+ res=Array.new
+ sub.each do |x|
+ if x=~/(<\/blockquote>)/i
+ m = $1
+ res << x[/(.+?)#{m}/mi, 1].gsub!(/zZz/,"zZz_1 ") if x =~/.+?#{m}/mi
+ res << x[/#{m}(.+)/mi, 1]
+ else
+ res << x #[/(.+)/mi, 1]
+ end
+ end
+ res.join
+ end
+ def multiline
+ data=@data
+ tuned_file=Array.new
+ data.each do |para|
+ para.gsub!(/\n/, ' ')
+ para.gsub!(/ \s+/mi, ' ')
+ #ALL HERE could be very time EXPENSIVE but tamed? compromise ... /mi
+ para.gsub!(/<([biu]|h[1-6])>(?:zZz)?([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
+ para.gsub!(/<([biu]|h[1-6])>(?:<center>|zZz)+(.+?)(?:<\/center>)?zZz(.+?)?<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
+ #para.gsub!(/<([biu]|h[1-6])>(?:<center>|zZz)+(.+?)<\/center>zZz(.+?)?<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
+ para.gsub!(/<([biu]|h[1-6])>(?:<center>|zZz)+(.+?)<\/\1>/i, "zZz<\\1>\\2</\\1>")
+ para.gsub!(/<(h[1-6])>(.+?)(?:<center>|zZz)+<\/\1>/i, "zZz<\\1>\\2</\\1>zZz") #does catch some h1, h2 etc, too expensive to have biu
+ #para.gsub!(/<([biu]|h[1-6])>(.+?)(?:<center>|zZz)+<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>") #may go too far? useful for h1 h2 etc, remove biu?
+ #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
+ #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
+ ### SERIOUS PROBLEM INTRODUCED
+ # sub = para.split(/<blockquote>/i)
+ # para = blockquotes(sub) if sub.length > 0 #check was on >1 could have serious repercussions 2004w29
+ para.gsub!(/zZz(\s*zZz)*/, "\n\n")
+ tuned_file << para << "\n\n" unless para == nil
+ end
+ tuned_file
+ end
+ def markup_rules
+ data=@data
+ tuned_file=Array.new
+ data.each do |para|
+ if para=~/<a href="(http:\/\/.+?)">/i
+ #p para.grep(/<a href="(http:\/\/.+?)">/i)
+ #m=$1
+ #para.gsub!(/(?:&lt;\s*)?<a href="#{m}">#{m}<\/a>(?:\s*&gt;)?\.?/i, "#{m}")
+ para.gsub!(/(?:&lt;\s*)?<a href="(http:\/\/.+?)">http:\/\/.+?<\/a>(?:\s*&gt;)?\.?/i, "\\1") #risk that url & url are not to match
+ #para.gsub!(/(?:&lt;\s*)?<a href="(\w+\.html)">(http:\/\/.+?\/\1)<\/a>(?:\s*&gt;)?\.?/i, "\\2") #does not match
+ end
+ ### clean
+ para.gsub!(/^\s+/i, '')
+ para.gsub!(/<([bui]|em|su[pb])>\s*<\/\1>/i, '')
+ para.gsub!(/<\/?center>/i, '')
+ para.gsub!(/\s*<\/dir>/i, '')
+ para.gsub!(/<hr>/i, '')
+ para.gsub!(/\s*<a href=".+?\.html#(?:[a-z_]+)?(?:[a-z0-9_-]|\*)+">\[(\*+)\]<\/a>/i, "^{[\\1]}^ ") #other endnote marker
+ para.gsub!(/<a href=".+?\.html#(?:[a-z_$]+)?[0-9_-]+"(?:\s+name=".+?")?>\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker
+ para.gsub!(/<a name=".+?"\s+href=".+?\.html#(?:[a-z_$]+)?[0-9_-]+"?>\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker
+ para.gsub!(/<a name="(?:[a-z$]+)?[0-9_-]+">\s*(<\/a>)?\s*\d+\.?\s*(<\/a>)?\s*/i, '~{{ ') #endnote
+ #para.gsub!(/<h([1-6])>\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") #
+ para.gsub!(/<h([1-6])(?: align=.+?)?>\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") #
+ para.gsub!(/^<b>(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity
+ para.gsub!(/^<b>(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity
+ para.gsub!(/^<b>(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different
+ para.gsub!(/^<b>(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different
+ para.gsub!(/^<b>(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different
+ #<a name="ii"></a><B>
+ para.gsub!(/^(<a name=".+?">)(?:<small>)?<(?:b|strong)>\s*(.+?)\s*<\/(?:b|strong)>/i, "5{ \\2 \\1") #watch
+ para.gsub!(/^(<(a name|A NAME)=".+?">)(\s*|<\/[aA]>)?([A-Z][A-Z])+/, "5{ \\2 \\1") #watch
+ para.gsub!(/^(\s+|<p>)?(<a name=".+?">)(\s*|<\/a>)?<b>/i, "5{ \\2 \\1") #watch
+ para.gsub!(/<h([1-6])>\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") #
+ para.gsub!(/^<b>\s*(.+?)<\/b>\s*(<\/i>\s*)?$/i, "4{ \\1\\2") # wish it all were less messy
+ para.gsub!(/^<i>\s*([^"(].+?)<\/i>\s*(<\/b>\s*)?$/i, "5{ \\1\\2") # wish it all were less messy
+ para.gsub!(/<\/?[biu]>/i, '') if para =~/[1-6]\{/
+ para.gsub!(/<u>\s*(.+?)\s*<\/u>/i, "_{\\1}_")
+ para.gsub!(/<(b|strong)>\s*(.+?)\s*<\/\1>/i, "*{\\2}*")
+ para.gsub!(/<(i|em)>\s*(.+?)\s*<\/\1>/i, "/{\\2}/")
+ para.gsub!(/<sup>\s*(.+?)\s*<\/sup>/i, "^{\\1}^")
+ para.gsub!(/(([\/\*!_])\{.+?\}\2)\s\s+/i, "\\1 ")
+ para.gsub!(/(([\/\*!_])\{.+?\}\2)\s+([.,;?\)])\s+/i, "\\1\\3 ")
+ para.gsub!(/(([\/\*!_])\{.+?\}\2)(["'])\s+/i, "\\1\\3 ")
+ para.gsub!(/(([\/\*!_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3")
+ para.gsub!(/(([\/\*_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3")
+ para.gsub!(/([a-z0-9])(([\/\*_])\{.+?\}\3)/i, " \\1 \\2") #eg this/{problem}/
+ para.gsub!(/([\/\*_])\{([,.;; ]+)\}\1/i, "\\2") #eg /{,}/ or *{ }* etc.
+ para.gsub!(/ \s+/i, ' ')
+ #para.gsub!(/\/\{\*\{/i, '*{/{')
+ #para.gsub!(/\}\*\}\//i, '}/}*')
+ para.gsub!(/&quot;/i, '"')
+ para.gsub!(/&amp;/i, 'and')
+ para.gsub!(/<!doctype html public .+/i, '')
+ para.gsub!(/<\/?(?:html|head|body|font|small)>/i, '')
+ para.gsub!(/<\/(?:title)>/i, '')
+ para.gsub!(/<title>/i, '#{~title? ')
+ para.gsub!(/<blockquote>(.+?)<\/blockquote>/mi, "\n\n_1 \\1\n\n")
+ para.gsub!(/<div align=.+?>|<\/div>|<font size=.+?>|<\/a><\/em><\/strong>/i, '')
+ para.gsub!(/~e\s+\.\s*/i, ".~e ") #check vim equiv # %s/\~e\s\+\.\s*/.\~e /c
+ para.gsub!(/\s+~e\s+/i, "~e ")
+ para.gsub!(/ \s+/i, ' ')
+ para.gsub!(/\s+$/i, '')
+ para.gsub!(/^(?:<\/[bi]>)+$/i, '')
+ para.gsub!(/^(?:(?:<i>)+<b>|(?:<b>)+<i>)\s*([^"(].+?)/i, "5{ \\1\\2") # wish it all were less messy
+ para.gsub!(/^(?:<\/?(?:[ib]|em)>\s*)+$/i, '') # cleaning up left over <i> etc.
+ para.gsub!(/<(?:i|em)>\s*(.+)/i, "/{\\1}/") # using up left over <i>
+ para.gsub!(/<b>\s*(.+)/i, "*{\\1}*") # using up left over <b>
+ #para.gsub!(/^(?:<(?:\/)?[bi]>)+$/i, '')
+ tuned_file << para unless para == nil
+ end
+ tuned_file
+ end
+ end
+ class Default < Html
+ def initialize(data, filename, instruct)
+ @data=data
+ @filename=filename
+ @instruct=instruct
+ end
+ def songsheet
+ data=@data
+ print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
+ data=Default.new(data.collect, @filename, @instruct).space_paragraphs
+ data=Default.new(data.collect, @filename, @instruct).multiline
+ data=Default.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules
+ data=Default.new(data.collect, @filename, @instruct).markup_default
+ data=MyOutput.new(data.collect, @filename, @instruct).hardOutput
+ end
+ def markup_default
+ data=@data
+ tuned_file=Array.new
+ data.each do |para|
+ para.gsub!(/<i>(Id\.?)(\s|$)/i, "/\{\\1\}\\2/")
+ para.gsub!(/^(~\{\{ .+?)(<\/LI>\s*|<\/OL>\s*)+$/i, "\\1")
+ para.gsub!(/\/\{Id\.\s*<\/LI>\s*\}\//i, '/{Id.}/')
+ tuned_file << para unless para == nil
+ end
+ tuned_file
+ end
+ end
+end
+def help
+ puts <<WOK
+conversion program
+initial SiSU markup from other file formats
+
+ zxy_convert --word does initial conversion from word97 to sisu markup, expects [filename].doc (can also use --doc)
+ zxy_convert --html does initial conversion from html to sisu markup, expects [filename].html
+ zxy_convert --default does initial conversion from defalt html to sisu markup, expects [filename].html
+
+WOK
+end
+def doWord(argv, instruct)
+ argv.each do |f|
+ if f =~/.+?\.doc$/
+ @argv << f[/(.+?)\.doc$/, 1]
+ else
+ print "not .doc? << #{f} >> "
+ end
+ end
+ @argv.each do |filename|
+ system(%{wvWare -x #{@dir.home}/.sisu/convert/wvSiSU.xml #{filename}.doc > #{filename}.wv})
+ file_array=IO.readlines("#{filename}.wv", "")
+ CONVERT::WareWord97.new(file_array, filename, instruct).songsheet # metaverse created here
+ end
+end
+def doHtml(argv, instruct)
+ argv.each do |f|
+ if f =~/.+?\.html$/
+ @argv << f[/(.+?)\.html$/, 1]
+ else
+ print "not .html? << #{f} >> "
+ end
+ end
+ @argv.each do |filename|
+ file_array=IO.readlines("#{filename}.html", "\n\r")
+ CONVERT::Html.new(file_array, filename, instruct).songsheet # metaverse created here
+ end
+end
+def doDefault(argv, instruct)
+ argv.each do |f|
+ if f =~/.+?\.html$/
+ @argv << f[/(.+?)\.html$/, 1]
+ else
+ print "not .html? << #{f} >> "
+ end
+ end
+ @argv.each do |filename|
+ file_array=IO.readlines("#{filename}.html", "\n\r")
+ CONVERT::Default.new(file_array, filename, instruct).songsheet # metaverse created here
+ end
+end
+def cases(argv, instruct)
+ case instruct
+ when/^--(word(97)?|doc)$/i #creates minimal sisu_small.gz package to send
+ doWord(argv, instruct)
+ when/^--(html)$/i #creates sisu.gz package to send
+ doHtml(argv, instruct)
+ when/^--(default)$/i #creates sisu.gz package to send
+ doDefault(argv, instruct)
+ else
+ help
+ end
+end
+require 'zxy_sysenv.rb'
+include SiSU_Env
+@dir=SiSU_Env::Info_dir.new
+@argv=Array.new
+argv=$*
+instruct = "#{argv[0].to_s}"
+argv.shift
+instruct.chomp!
+instruct = "help" if instruct.nil? or instruct == "";
+cases(argv, instruct)