From d29a3e5469d8468084641c385ebf16948f7c2437 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Tue, 22 Jul 2008 20:00:59 -0400 Subject: sisu-0.68.0 proposed * middle layer document representation changed, (accounting for substantial patch) * texpdf multiple document sizes as specified in config * numerous small fixes [should on the whole be easier to maintain] --- lib/sisu/v0/plaintext.rb | 162 ++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 101 deletions(-) (limited to 'lib/sisu/v0/plaintext.rb') diff --git a/lib/sisu/v0/plaintext.rb b/lib/sisu/v0/plaintext.rb index 2cf26b93..b89a6252 100644 --- a/lib/sisu/v0/plaintext.rb +++ b/lib/sisu/v0/plaintext.rb @@ -68,6 +68,7 @@ module SiSU_Plaintext require "#{SiSU_lib}/plaintext_format" include Format require "#{SiSU_lib}/shared_txt" + require "#{SiSU_lib}/shared_structure" pwd=Dir.pwd @@alt_id_count,@@alt_id_count,@@tablehead,@@number_of_cols=0,0,0,0 @@tablefoot='' @@ -111,54 +112,6 @@ module SiSU_Plaintext end end private - class Split_text_object ).+?<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/ - if /^(([1-6])~(\S+))\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para) - @format,@lev,segname,@text,@ocn=$1,$2,$3,$4,$5 - elsif /^(([1-6])~)\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para) - @format,@lev,@text,@ocn=$1,$2,$3,$4 - elsif /<:(.+?)>\s*(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para) - @format,@text,@ocn=$1,$2,$3 - elsif /^(([1-6])~(\S+))\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;[um]\d+><#@dp:#@dp>$/m.match(@para) - @@alt_id_count+=1 - @format,@lev,segname,@text,@ocn=$1,$2,$3,$4,"x#{@@alt_id_count}" - elsif /^(([1-6])~)\s+(\S.+?)<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/m.match(@para) - @@alt_id_count+=1 - @format,@lev,@text,@ocn=$1,$2,$3,"x#{@@alt_id_count}" - end - else - if /(.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para) - @text,@ocn=$1,$2 - end - if @para !~/<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$|^$/ #added 2002w06 - @text=/(.+?)/m.match(@para)[1] - end - if /^((\d)~(?:~\S+)?)\s+(.+)/m.match(@para) - @format,@lev,@text=$1,$2,$3 - end - end - format=@format.dup - @lev_para_ocn=if @para =~/.+<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/ - Format::Format_text_object.new(format,@text,@ocn) - else - Format::Format_text_object.new(format,@text,"<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>") - end - self - end - end class Scroll \s*)?\d~(?:(\S+))?\s+)?(.+?)\s*<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m # 2004w18 pb pn removal added + @regx=/^(?:(?:#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}\s*)?#{Mx[:lv_o]}\d:(\S*?)#{Mx[:lv_c]})?\s*(.+?)\s*#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/m # 2004w18 pb pn removal added @tab="\t" @br=if md.mod.inspect =~ /--footnote/ \ and md.mod.inspect =~ /--dos/ @@ -198,12 +151,12 @@ module SiSU_Plaintext end # Used for extraction of endnotes from paragraphs def extract_endnotes(para='') - notes=para.scan(/~[{\[]([\d*+]+\s+.+?)\s*<#@dp>[}\]]~/) + notes=para.scan(/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})([\d*+]+\s+.+?)\s*#{Mx[:id_o]}#@dp#{Mx[:id_c]}(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/) @n=[] notes.flatten.each do |n| #high cost to deal with
appropriately within plaintext, consider n=n.dup.to_s - if n =~// - fix = n.split(//) #watch #added + if n =~/#{Mx[:br_line]}|#{Mx[:br_nl]}/ + fix = n.split(/#{Mx[:br_line]}|#{Mx[:br_nl]}/) #watch #added fix.each do |x| unless x.empty?; @n << x end @@ -283,11 +236,11 @@ WOK lv=nil if lv == 0 wrapped=if para[@regx] paragraph=para[@regx,2] - if paragraph =~/<:i([1-9])>/ + if paragraph =~/#{Mx[:pa_o]}:i([1-9])#{Mx[:pa_c]}/ m=$1.to_i - paragraph.gsub!(/<:i#{m}>/,'') + paragraph.gsub!(/#{Mx[:pa_o]}:i#{m}#{Mx[:pa_c]}/,'') util=SiSU_text_utils::Wrap.new(paragraph,78,m*2) - else util=SiSU_text_utils::Wrap.new(paragraph,78,0) + else util=SiSU_text_utils::Wrap.new(paragraph.gsub(/#{Mx[:lv_o]}[1-9]:\S*?#{Mx[:lv_c]}/,''),78,0) end util.line_wrap end @@ -322,83 +275,89 @@ WOK table_message='[table omitted, see other document formats]' fix=[] data.each do |para| - para.gsub!(//,'') # remove dummy headings (used by html) #check - para.gsub!(/_\*\s+/,'* ') # bullet markup, marked down - para.gsub!(/(.+?)<\/sup>/,'^\1^') - para.gsub!(/(.+?)<\/sub>/,'[\1]') - para.gsub!(/(.+?)<\/i>/,'/\1/') - para.gsub!(/(.+?)<\/b>/,'*\1*') - para.gsub!(/(.+?)<\/u>/,'_\1_') - unless para =~/<:code>/ + para.gsub!(/#{Mx[:gr_o]}Th?#{Mx[:tc_p]}.+/um,"#@br#{table_message}") + para.gsub!(/.+?#{Mx[:gl_o]}-##{Mx[:gl_c]}/,'') # remove dummy headings (used by html) #check + para.gsub!(/#{Mx[:gl_bullet]}\s*/,'* ') # bullet markup, marked down + para.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'*\1*') + para.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'/\1/') + para.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'[\1]') + para.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'_\1_') + para.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'^\1^') + para.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'+\1+') + para.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'"\1"') + para.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'-\1-') + unless para =~/#{Mx[:gr_o]}code#{Mx[:gr_c]}/ para.gsub!(/\{(.+?)\}((?:https?|file|ftp):\/\/\S+|image)/,'\1 [link:] \2') - para.gsub!(/(^|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3") + para.gsub!(/(^|#{Mx[:gl_c]}|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3") para.gsub!(/_((?:https?|file|ftp):\/\/\S+)/,'\1') extract_endnotes(para) - para.gsub!(/~[{\[]([\d*+]+)\s+(?:.+?)[}\]]~/,'[^\1]') # endnote marker marked up - para.gsub!(/&/,'&') - para.gsub!(/!/,'!') - para.gsub!(/#/,'#') - para.gsub!(/*/,'*') - para.gsub!(/-/,'-') - para.gsub!(///,'/') - para.gsub!(/_/,'_') - para.gsub!(/{/,'{') - para.gsub!(/}/,'}') - para.gsub!(/~/,'~') - para.gsub!(/©/,'©') + para.gsub!(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'[^\1]') # endnote marker marked up + para.gsub!(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,'[^\1]') # endnote marker marked up + para.gsub!(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<') + para.gsub!(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>') + para.gsub!(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&') + para.gsub!(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!') + para.gsub!(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#') + para.gsub!(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*') + para.gsub!(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-') + para.gsub!(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/') + para.gsub!(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_') + para.gsub!(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{') + para.gsub!(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}') + para.gsub!(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~') + para.gsub!(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©') end - if para =~/<:(?:group|verse|alt|code)(?:-end)?>(?:\s+<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>)?/ - if para =~/<:code>/ #code-block: angle brackets special characters + if para =~/#{Mx[:gr_o]}(?:group|verse|alt|code)(?:-end)?#{Mx[:gr_c]}(?:\s+#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]})?/ ##{Mx[:gr_o]}codeline#{Mx[:gr_c]} + if para =~/#{Mx[:gr_o]}code#{Mx[:gr_c]}/ #code-block: angle brackets special characters para.gsub!(/(^|[^}])_([<>])/m,'\1\2') # _> _< para.gsub!(/(^|[^}])_([<>])/m,'\1\2') # _<_< end - para.gsub!(//,"\n") # watch - para.gsub!(/<:(?:group|verse|alt|code)(?:-end)?>(?:\s+<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>)?/,'') - else para.gsub!(//,"\n\n") # watch introduces a bug + para.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n") # watch + para.gsub!(/#{Mx[:gr_o]}(?:group|verse|alt|code)(?:-end)?#{Mx[:gr_c]}(?:\s+#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]})?/,'') + else para.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n\n") # watch introduces a bug end - para.gsub!(/<:p[bn]>/,'') # remove page breaks - para.gsub!(/^\s*<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/,'') # remove empty lines - check - para.gsub!(/(^|\s)[_\\]((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') + para.gsub!(/#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}/,'') # remove page breaks + para.gsub!(/^\s*#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/,'') # remove empty lines - check + para.gsub!(/(^|#{Mx[:gl_c]}|\s)[_\\]((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') para.gsub!(/(.+?)<\/a>/m,'\1') - para.gsub!(/<:name#\S+?>/,'') # remove name links + para.gsub!(/#{Mx[:mk_o]}:name#(\S+?)#{Mx[:mk_c]}/,'') # remove name links para.gsub!(/ /,' ') # decide on para.gsub!(/(?:^|[^_\\])\{(\S+?\.(?:png|jpg|gif)) .+?\}(?:(?:https?|file|ftp):\/\/\S+|image)/,' [ \1 ]') #"[ #{dir.url.images_local}\/\\1 ]") para.gsub!(/(?:^|[^_\\])\{\s*\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,'[image: "\1"]') #para.gsub!(/^\{\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,'[image: "\1"]') wordlist=para.scan(/\S+/) - if para =~/^0~(\S+)\s+(.+?)\Z/m # for headers + if para =~/^#{Rx[:meta]}\s*(.+?)\Z/m # for headers d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta if d_meta; plaintext_metadata(d_meta) end end - if para !~/(^0~||)/ + if para !~/(^#{Rx[:meta]}|#{Mx[:br_eof]}|#{Mx[:br_endnotes]})/ if para =~@regx #/.+?<~\d+;\w\d+;\w\d+>.*/ #watch change paranum=para[@regx,3] @p_num=Format::Paragraph_number.new(paranum) end - @sto=Split_text_object.new(para).lev_segname_para_ocn + @sto=SiSU_Structure::Split_text_object.new(@md,para).txt ### problem in scroll, it appears tables are getting paragraph numbers - m=/<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/ + m=/#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ if para =~m \ and para=~/\S+/ para=case @sto.format - when /^(1)~(?:(\S+))?/ + when /^(1):(\S*?)/ plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body1 - when /^(2)~(?:(\S+))?/ + when /^(2):(\S*?)/ plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body2 - when /^(3)~(?:(\S+))?/ + when /^(3):(\S*?)/ plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body3 - when /^(4)~(\S+)/ # work on see SiSU_text_parts::Split_text_object + when /^(4):(\S+?)/ # work on see SiSU_text_parts::Split_text_object plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body4 - when /^(5)~(?:(\S+))?/ + when /^(5):(\S*?)/ plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body5 - when /^(6)~(?:(\S+))?/ + when /^(6):(\S*?)/ plaintext_structure(para,$1,@sto.ocn,$2) @sto.lev_para_ocn.heading_body6 #when /^(i1)$/ @@ -427,17 +386,17 @@ WOK elsif para =~/#{table_message}/ @plaintext[:body] << para << @br elsif para =~/(Note|Endnotes?)/ \ - and para !~/<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/ + and para !~/#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ elsif para =~/(MetaData)/ \ - and para =~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ #debug 2003w46 add rc info ####suspect visit + and para =~/#{Mx[:id_o]}~(\d+);[um]\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #debug 2003w46 add rc info ####suspect visit #formatMono=MonoSiSU.new('
MetaData') #para=formatMono.bold_para elsif para.include? 'Owner Details' \ - and para !~/<~(\d+);(?:[oh]|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/ + and para !~/#{Mx[:id_o]}~(\d+);(?:[oh]|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #formatMono=MonoSiSU.new('
Owner Details') #@plaintext[:owner_details]=formatMono.bold_para #para='' - elsif para =~/(¡|(.*)/ one,two=$1,$2 format_text=Format_text_object.new(one,two) @@ -446,7 +405,7 @@ WOK para='' if (para =~// \ and para =~/^(-\{{2}~\d+|)/) # -endnote case para - when /<:i[1-9]>/ + when /#{Mx[:pa_o]}:i[1-9]#{Mx[:pa_c]}/ if para =~/.*<:#>.*$/m format_text=Format_text_object.new(para,'') para=format_text.scr_indent_one_no_paranum @@ -460,6 +419,7 @@ WOK format_text=Format_text_object.new(one,two) para=format_text.center end + para.gsub!(/#{Mx[:id_o]}.+?#{Mx[:id_c]}/,' ') if para ## Clean Prepared Text para.gsub!(//,' ') if para ## Clean Prepared Text para.gsub!(/<:\S+>/,' ') if para ## Clean Prepared Text end -- cgit v1.2.3