aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/develop/object_munge.rb
diff options
context:
space:
mode:
authorRalph Amissah <ralph@amissah.com>2014-12-31 08:50:09 -0500
committerRalph Amissah <ralph@amissah.com>2015-01-07 22:42:40 -0500
commita25c64083c10dc2b2b02fcee06b1e305a379ce90 (patch)
treed2d8d3fbf9bf37868afaa22a1759e55a8ee86273 /lib/sisu/develop/object_munge.rb
parentd: po4a, reorganize a bit (diff)
d: po4a, continue reorganization (translation request dev stopped)
* --po4a run against different language versions of sisu markup files (representing the same document in different languages) initializes po4a structure to place those documents under future po4a management * includes command line translation request (use command line program 'trans' to pass request to translation.google.com, stopped for now) NOTE in case future development is restarted: to test, remove code line 'auto_translate?(:skip)' that follows 'def auto_translation(src_txt,markup=:src)'; place identical english sisu markup files in en/test.sst fr/test.sst (or under the language code to be tested) and run against en/test.sst fr/test.sst e.g. '--po4a --trans --glob test.sst' this send identical text objects to translate.google using 'trans' and populate po file with results for placing future translation under po4a management, however in initial tests insufficient cleanly translated paragraphs were returned, so stopped development for now. (no attempt at initialization of new non-existent files using --trans-en:es,fr,de format) * [hub_options, if development of trans is pursued later consider the possibility of modifying the --trans string options to add an optional timeout value, &; possibly modifying po4a code so timeout may occur a fixed number of times before skipping translation request attempts for the remainder].
Diffstat (limited to 'lib/sisu/develop/object_munge.rb')
-rw-r--r--lib/sisu/develop/object_munge.rb307
1 files changed, 307 insertions, 0 deletions
diff --git a/lib/sisu/develop/object_munge.rb b/lib/sisu/develop/object_munge.rb
new file mode 100644
index 00000000..0c2b5137
--- /dev/null
+++ b/lib/sisu/develop/object_munge.rb
@@ -0,0 +1,307 @@
+# encoding: utf-8
+=begin
+
+* Name: SiSU
+
+** Description: documents, structuring, processing, publishing, search
+*** object munge
+
+** Author: Ralph Amissah
+ <ralph@amissah.com>
+ <ralph.amissah@gmail.com>
+
+** Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+ 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Ralph Amissah,
+ All Rights Reserved.
+
+** License: GPL 3 or later:
+
+ SiSU, a framework for document structuring, publishing and search
+
+ Copyright (C) Ralph Amissah
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see <http://www.gnu.org/licenses/>.
+
+ If you have Internet connection, the latest version of the GPL should be
+ available at these locations:
+ <http://www.fsf.org/licensing/licenses/gpl.html>
+ <http://www.gnu.org/licenses/gpl.html>
+
+ <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>
+
+** SiSU uses:
+ * Standard SiSU markup syntax,
+ * Standard SiSU meta-markup syntax, and the
+ * Standard SiSU object citation numbering and system
+
+** Hompages:
+ <http://www.jus.uio.no/sisu>
+ <http://www.sisudoc.org>
+
+** Git
+ <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary>
+ <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/develop/html_parts.rb;hb=HEAD>
+
+=end
+module SiSU_Object_Munge
+ def i_src_o_strip_markup(txtobj)
+ txtobj=txtobj.
+ gsub(/#{Mx[:srcrgx_bold_o]}(.+?)#{Mx[:srcrgx_bold_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_italics_o]}(.+?)#{Mx[:srcrgx_italics_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_underscore_o]}(.+?)#{Mx[:srcrgx_underscore_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_cite_o]}(.+?)#{Mx[:srcrgx_cite_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_insert_o]}(.+?)#{Mx[:srcrgx_insert_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_strike_o]}(.+?)#{Mx[:srcrgx_strike_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_superscript_o]}(\d+)#{Mx[:srcrgx_superscript_c]}/m,'[\1]').
+ gsub(/#{Mx[:srcrgx_superscript_o]}(.+?)#{Mx[:srcrgx_superscript_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_subscript_o]}(.+?)#{Mx[:srcrgx_subscript_c]}/m,'\1').
+ gsub(/#{Mx[:srcrgx_hilite_o]}(.+?)#{Mx[:srcrgx_hilite_c]}/m,'\1').
+ gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
+ gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/m,''). # endnote removed
+ gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/m,''). # endnote removed
+ gsub(/(?:#{Mx[:nbsp]})+/,' ').
+ gsub(/(?:#{Mx[:br_nl]})+/,"\n").
+ gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
+ gsub(/(?:#{Mx[:br_line]})+/,"\n").
+ gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
+ gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
+ gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
+ gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
+ gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
+ gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
+ gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
+ gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
+ gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
+ gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
+ gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
+ gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
+ gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
+ gsub(/[ ][ ]s+/,' ').
+ strip
+if txtobj =~/Reading this/
+ puts txtobj
+ if txtobj =~ /#{Mx[:srcrgx_italics_o]}(.+?)#{Mx[:srcrgx_italics_c]}/
+ puts __LINE__
+ puts Mx[:srcrgx_italics_o]
+ puts txtobj
+ end
+end
+; txtobj
+ end
+ def i_ao_o_strip_markup(txtobj)
+ txtobj=txtobj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1').
+ gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1').
+ gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1').
+ gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1').
+ gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1').
+ gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1').
+ gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]').
+ gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1').
+ gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1').
+ gsub(/#{Mx[:fa_hilite_o]}(.+?)#{Mx[:fa_hilite_c]}/,'\1').
+ gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
+ gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,''). # endnote removed
+ gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,''). # endnote removed
+ gsub(/(?:#{Mx[:nbsp]})+/,' ').
+ gsub(/(?:#{Mx[:br_nl]})+/,"\n").
+ gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
+ gsub(/(?:#{Mx[:br_line]})+/,"\n").
+ gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
+ gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
+ gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
+ gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
+ gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
+ gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
+ gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
+ gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
+ gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
+ gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
+ gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
+ gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
+ gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
+ gsub(/[ ][ ]s+/,' ').
+ strip
+ end
+ def i_ao_o_src_markup_restore(txtobj)
+ @txtobj=txtobj
+ def textface_marks
+ @txtobj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'*{\1}*').
+ gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'/{\1}/').
+ gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'_{\1}_').
+ gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'"{\1}"').
+ gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'+{\1}+').
+ gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'-{\1}-').
+ gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'^{[\1]}^').
+ gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'^{\1}^').
+ gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,',{\1},').
+ gsub(/#{Mx[:fa_hilite_o]}(.+?)#{Mx[:fa_hilite_c]}/,'\1').
+ gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
+ gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'~{\1 \2}~').
+ gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,''). # endnote removed
+ gsub(/(?:#{Mx[:nbsp]})+/,' ').
+ gsub(/(?:#{Mx[:br_nl]})+/,"\n").
+ gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
+ gsub(/(?:#{Mx[:br_line]})+/,"\n").
+ gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
+ gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
+ gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
+ gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
+ gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
+ gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
+ gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
+ gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
+ gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
+ gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
+ gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
+ gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
+ gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
+ gsub(/[ ][ ]s+/,' ').
+ strip
+ end
+ def object_marks
+ @txtobj
+ end
+ self
+ end
+ def clean_text(txtobj,markup=:ao)
+ if txtobj.class==String
+ txtobj=if markup ==:ao
+ i_ao_o_strip_markup(txtobj)
+ elsif markup ==:src
+ i_src_o_strip_markup(txtobj)
+ else p __FILE__; p __LINE__
+ end
+ elsif txtobj.class.inspect=~/^SiSU_AO_DocumentStructure::/
+ txtobj.obj=i_ao_o_strip_markup(txtobj.obj)
+ else p 'error'
+ end
+ txtobj
+ end
+ def footnotes_inline(txtobj)
+ end
+ def footnotes_ref_and_note(txtobj)
+ end
+ def src_markup(txtobj)
+ txtobj
+ end
+ def extract_endnotes(doc_obj_txt,endnotes_) #% used for extraction of endnotes from paragraphs
+ if endnotes_ ==:separate
+ notes_a=doc_obj_txt.scan(/#{Mx[:en_a_o]}([\d]+\s+.+?)#{Mx[:en_a_c]}/)
+ ##notes_a=doc_obj_txt.scan(/#{Mx[:en_a_o]}([\d*+]+\s+.+?)#{Mx[:en_a_c]}/)
+ #notes_b=doc_obj_txt.scan(/#{Mx[:en_b_o]}([\d*+]+\s+.+?)#{Mx[:en_b_c]}/)
+ n=[]
+ notes_a.flatten.each do |note| #high cost to deal with <br> appropriately within plaintext, consider
+ note=note.dup.to_s
+ note=note.gsub(/^([\d]+)\s+/,'^~\1 ').
+ gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,
+ ' \\\\\\ ')
+ n << note
+ end
+ notes_a=n.flatten
+ doc_obj_txt=doc_obj_txt.
+ gsub(/#{Mx[:en_a_o]}([\d]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'~^') # endnote marker marked up
+ else
+ doc_obj_txt=doc_obj_txt.
+ gsub(/#{Mx[:en_b_o]}[\d]+\s+(.+?)#{Mx[:en_b_c]}/,
+ '~[ \1 ]~'). # inline endnote with marker marked up
+ gsub(/#{Mx[:en_a_o]}([*+]+)\s+(.+?)#{Mx[:en_a_c]}/,
+ '~{\1 \2 }~'). # inline endnote with marker marked up
+ gsub(/#{Mx[:en_b_o]}([*+]+)\s+(.+?)#{Mx[:en_b_c]}/,
+ '~[\1 \2 ]~') # inline endnote with marker marked up
+ end
+ [doc_obj_txt,notes_a]
+ end
+ def objects #def i_ao_o_src_markup_restore(txtobj)
+ def code_(dob)
+ if dob.is==:code
+ dob.obj=dob.obj.gsub(/(^|[^}])_([<>])/m,'\1\2'). # _> _<
+ gsub(/(^|[^}])_([<>])/m,'\1\2') # _<_<
+ end
+ dob
+ end
+ def block_(dob)
+ dob.obj=if dob.of==:block # watch
+ dob.obj.gsub(/#{Mx[:gl_o]}●#{Mx[:gl_c]}/,"* ").
+ gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n")
+ else dob.obj.gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n\n")
+ end
+ dob
+ end
+ def textface_marks_po4a(dob,endnotes_=:inline)
+ notes=''
+ dob.obj=dob.obj.
+ gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,
+ Mx[:src_bold_o] + '\1' + Mx[:src_bold_c]).
+ gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,
+ Mx[:src_italics_o] + '\1' + Mx[:src_italics_c]).
+ gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,
+ Mx[:src_underscore_o] + '\1' + Mx[:src_underscore_c]).
+ gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,
+ Mx[:src_subscript_o] + '\1' + Mx[:src_subscript_c]).
+ gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,
+ Mx[:src_superscript_o] + '\1' + Mx[:src_superscript_c]).
+ gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,
+ Mx[:src_insert_o] + '\1' + Mx[:src_insert_c]).
+ gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,
+ Mx[:src_cite_o] + '\1' + Mx[:src_cite_c]).
+ gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,
+ Mx[:src_strike_o] + '\1' + Mx[:src_strike_c]).
+ gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,
+ Mx[:src_monospace_o] + '\1' + Mx[:src_monospace_c])
+ unless dob.is==:code
+ dob.obj=dob.obj.
+ gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}#{Mx[:rel_o]}\S+?#{Mx[:rel_c]}/,'\1').
+ gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1').
+ gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,
+ '\1 [link: <\2>]').
+ gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}image/,
+ '\1 [link: local image]').
+ gsub(/#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,'\1')
+ dob.obj,notes=extract_endnotes(dob.obj,endnotes_)
+ dob.obj=dob.obj.
+ gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
+ gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
+ gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
+ gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
+ gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
+ gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
+ gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
+ gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
+ gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
+ gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
+ gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
+ gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
+ gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©')
+ end
+ dob=block_(dob)
+ dob=code_(dob)
+ dob.obj=dob.obj.gsub(/#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}/,''). # remove page breaks
+ gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1').
+ gsub(/#{Mx[:mk_o]}:name#(\S+?)#{Mx[:mk_c]}/,''). # remove name links
+ gsub(/&nbsp;|#{Mx[:nbsp]}/,' '). # decide on
+ gsub(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,
+ ' [ \1 ]'). #"[ #{dir.url.images_local}\/\\1 ]")
+ gsub(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}image/,
+ ' [ \1 ]'). #"[ #{dir.url.images_local}\/\\1 ]")
+ gsub(/(?:^|[^_\\])\{\s*\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,
+ '[image: "\1"]')
+ [dob,notes]
+ end
+ def object_marks
+ @txtobj
+ end
+ self
+ end
+end
+__END__