about summary refs log tree commit diff homepage
path: root/lib/sisu/v0/plaintext.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v0/plaintext.rb')
-rw-r--r-- lib/sisu/v0/plaintext.rb 162
1 files changed, 61 insertions, 101 deletions
diff --git a/lib/sisu/v0/plaintext.rb b/lib/sisu/v0/plaintext.rb
index 2cf26b93..b89a6252 100644
--- a/lib/sisu/v0/plaintext.rb
+++ b/lib/sisu/v0/plaintext.rb
@@ -68,6 +68,7 @@ module SiSU_Plaintext
require "#{SiSU_lib}/plaintext_format"
include Format
require "#{SiSU_lib}/shared_txt"
+ require "#{SiSU_lib}/shared_structure"
pwd=Dir.pwd
@@alt_id_count,@@alt_id_count,@@tablehead,@@number_of_cols=0,0,0,0
@@tablefoot=''
@@ -111,54 +112,6 @@ module SiSU_Plaintext
end
end
private
- class Split_text_object <Source
- require "#{SiSU_lib}/plaintext_format"
- include SiSU_Viz
- include Format
- @@alt_id_count=0
- @@dp=nil
- attr_reader :format,:lev,:text,:ocn,:lev_para_ocn
- def initialize(para)
- @para=para
- @format,@ocn='null','null'
- @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern
- end
- def lev_segname_para_ocn
- @text=nil
- if @para =~/^(\d~|<:.+?>).+?<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/
- if /^(([1-6])~(\S+))\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para)
- @format,@lev,segname,@text,@ocn=$1,$2,$3,$4,$5
- elsif /^(([1-6])~)\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para)
- @format,@lev,@text,@ocn=$1,$2,$3,$4
- elsif /<:(.+?)>\s*(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para)
- @format,@text,@ocn=$1,$2,$3
- elsif /^(([1-6])~(\S+))\s+(\S.+?)<~(\d+);(?:\w|[0-6]:)\d+;[um]\d+><#@dp:#@dp>$/m.match(@para)
- @@alt_id_count+=1
- @format,@lev,segname,@text,@ocn=$1,$2,$3,$4,"x#{@@alt_id_count}"
- elsif /^(([1-6])~)\s+(\S.+?)<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/m.match(@para)
- @@alt_id_count+=1
- @format,@lev,@text,@ocn=$1,$2,$3,"x#{@@alt_id_count}"
- end
- else
- if /(.+?)<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m.match(@para)
- @text,@ocn=$1,$2
- end
- if @para !~/<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$|^$/ #added 2002w06
- @text=/(.+?)/m.match(@para)[1]
- end
- if /^((\d)~(?:~\S+)?)\s+(.+)/m.match(@para)
- @format,@lev,@text=$1,$2,$3
- end
- end
- format=@format.dup
- @lev_para_ocn=if @para =~/.+<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/
- Format::Format_text_object.new(format,@text,@ocn)
- else
- Format::Format_text_object.new(format,@text,"<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>")
- end
- self
- end
- end
class Scroll <Source
require "#{SiSU_lib}/defaults"
require "#{SiSU_lib}/shared_txt"
@@ -170,7 +123,7 @@ module SiSU_Plaintext
@url_brace=SiSU_Viz::Skin.new.url_decoration
@vz=SiSU_Env::Get_init.instance.skin
@dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern
- @regx=/^(?:(?:<:p[bn]>\s*)?\d~(?:(\S+))?\s+)?(.+?)\s*<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/m # 2004w18 pb pn removal added
+ @regx=/^(?:(?:#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}\s*)?#{Mx[:lv_o]}\d:(\S*?)#{Mx[:lv_c]})?\s*(.+?)\s*#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/m # 2004w18 pb pn removal added
@tab="\t"
@br=if md.mod.inspect =~ /--footnote/ \
and md.mod.inspect =~ /--dos/
@@ -198,12 +151,12 @@ module SiSU_Plaintext
end
# Used for extraction of endnotes from paragraphs
def extract_endnotes(para='')
- notes=para.scan(/~[{\[]([\d*+]+\s+.+?)\s*<#@dp>[}\]]~/)
+ notes=para.scan(/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})([\d*+]+\s+.+?)\s*#{Mx[:id_o]}#@dp#{Mx[:id_c]}(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/)
@n=[]
notes.flatten.each do |n| #high cost to deal with <br> appropriately within plaintext, consider
n=n.dup.to_s
- if n =~/<br(?: \/)?>/
- fix = n.split(/<br(?: \/)?>/) #watch #added
+ if n =~/#{Mx[:br_line]}|#{Mx[:br_nl]}/
+ fix = n.split(/#{Mx[:br_line]}|#{Mx[:br_nl]}/) #watch #added
fix.each do |x|
unless x.empty?; @n << x
end
@@ -283,11 +236,11 @@ WOK
lv=nil if lv == 0
wrapped=if para[@regx]
paragraph=para[@regx,2]
- if paragraph =~/<:i([1-9])>/
+ if paragraph =~/#{Mx[:pa_o]}:i([1-9])#{Mx[:pa_c]}/
m=$1.to_i
- paragraph.gsub!(/<:i#{m}>/,'')
+ paragraph.gsub!(/#{Mx[:pa_o]}:i#{m}#{Mx[:pa_c]}/,'')
util=SiSU_text_utils::Wrap.new(paragraph,78,m*2)
- else util=SiSU_text_utils::Wrap.new(paragraph,78,0)
+ else util=SiSU_text_utils::Wrap.new(paragraph.gsub(/#{Mx[:lv_o]}[1-9]:\S*?#{Mx[:lv_c]}/,''),78,0)
end
util.line_wrap
end
@@ -322,83 +275,89 @@ WOK
table_message='[table omitted, see other document formats]'
fix=[]
data.each do |para|
- para.gsub!(/<!Th?¡.+/um,"#@br#{table_message}")
- para.gsub!(/.+?<-#>/,'') # remove dummy headings (used by html) #check
- para.gsub!(/_\*\s+/,'* ') # bullet markup, marked down
- para.gsub!(/<sup>(.+?)<\/sup>/,'^\1^')
- para.gsub!(/<sub>(.+?)<\/sub>/,'[\1]')
- para.gsub!(/<i>(.+?)<\/i>/,'/\1/')
- para.gsub!(/<b>(.+?)<\/b>/,'*\1*')
- para.gsub!(/<u>(.+?)<\/u>/,'_\1_')
- unless para =~/<:code>/
+ para.gsub!(/#{Mx[:gr_o]}Th?#{Mx[:tc_p]}.+/um,"#@br#{table_message}")
+ para.gsub!(/.+?#{Mx[:gl_o]}-##{Mx[:gl_c]}/,'') # remove dummy headings (used by html) #check
+ para.gsub!(/#{Mx[:gl_bullet]}\s*/,'* ') # bullet markup, marked down
+ para.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'*\1*')
+ para.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'/\1/')
+ para.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'[\1]')
+ para.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'_\1_')
+ para.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'^\1^')
+ para.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'+\1+')
+ para.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'"\1"')
+ para.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'-\1-')
+ unless para =~/#{Mx[:gr_o]}code#{Mx[:gr_c]}/
para.gsub!(/\{(.+?)\}((?:https?|file|ftp):\/\/\S+|image)/,'\1 [link:] \2')
- para.gsub!(/(^|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3")
+ para.gsub!(/(^|#{Mx[:gl_c]}|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3")
para.gsub!(/_((?:https?|file|ftp):\/\/\S+)/,'\1')
extract_endnotes(para)
- para.gsub!(/~[{\[]([\d*+]+)\s+(?:.+?)[}\]]~/,'[^\1]') # endnote marker marked up
- para.gsub!(/&amp;/,'&')
- para.gsub!(/&#033;/,'!')
- para.gsub!(/&#035;/,'#')
- para.gsub!(/&#042;/,'*')
- para.gsub!(/&#045;/,'-')
- para.gsub!(/&#047;/,'/')
- para.gsub!(/&#095;/,'_')
- para.gsub!(/&#123;/,'{')
- para.gsub!(/&#125;/,'}')
- para.gsub!(/&#126;/,'~')
- para.gsub!(/&#169;/,'©')
+ para.gsub!(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'[^\1]') # endnote marker marked up
+ para.gsub!(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,'[^\1]') # endnote marker marked up
+ para.gsub!(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<')
+ para.gsub!(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>')
+ para.gsub!(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&')
+ para.gsub!(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!')
+ para.gsub!(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#')
+ para.gsub!(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*')
+ para.gsub!(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-')
+ para.gsub!(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/')
+ para.gsub!(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_')
+ para.gsub!(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{')
+ para.gsub!(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}')
+ para.gsub!(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~')
+ para.gsub!(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©')
end
- if para =~/<:(?:group|verse|alt|code)(?:-end)?>(?:\s+<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>)?/
- if para =~/<:code>/ #code-block: angle brackets special characters
+ if para =~/#{Mx[:gr_o]}(?:group|verse|alt|code)(?:-end)?#{Mx[:gr_c]}(?:\s+#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]})?/ ##{Mx[:gr_o]}codeline#{Mx[:gr_c]}
+ if para =~/#{Mx[:gr_o]}code#{Mx[:gr_c]}/ #code-block: angle brackets special characters
para.gsub!(/(^|[^}])_([<>])/m,'\1\2') # _> _<
para.gsub!(/(^|[^}])_([<>])/m,'\1\2') # _<_<
end
- para.gsub!(/<br(?: \/)?>/,"\n") # watch
- para.gsub!(/<:(?:group|verse|alt|code)(?:-end)?>(?:\s+<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>)?/,'')
- else para.gsub!(/<br(?: \/)?>/,"\n\n") # watch introduces a bug
+ para.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n") # watch
+ para.gsub!(/#{Mx[:gr_o]}(?:group|verse|alt|code)(?:-end)?#{Mx[:gr_c]}(?:\s+#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]})?/,'')
+ else para.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n\n") # watch introduces a bug
end
- para.gsub!(/<:p[bn]>/,'') # remove page breaks
- para.gsub!(/^\s*<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/,'') # remove empty lines - check
- para.gsub!(/(^|\s)[_\\]((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3')
+ para.gsub!(/#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}/,'') # remove page breaks
+ para.gsub!(/^\s*#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/,'') # remove empty lines - check
+ para.gsub!(/(^|#{Mx[:gl_c]}|\s)[_\\]((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3')
para.gsub!(/<a href=".+?">(.+?)<\/a>/m,'\1')
- para.gsub!(/<:name#\S+?>/,'') # remove name links
+ para.gsub!(/#{Mx[:mk_o]}:name#(\S+?)#{Mx[:mk_c]}/,'') # remove name links
para.gsub!(/&nbsp;/,' ') # decide on
para.gsub!(/(?:^|[^_\\])\{(\S+?\.(?:png|jpg|gif)) .+?\}(?:(?:https?|file|ftp):\/\/\S+|image)/,' [ \1 ]') #"[ #{dir.url.images_local}\/\\1 ]")
para.gsub!(/(?:^|[^_\\])\{\s*\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,'[image: "\1"]')
#para.gsub!(/^\{\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,'[image: "\1"]')
wordlist=para.scan(/\S+/)
- if para =~/^0~(\S+)\s+(.+?)\Z/m # for headers
+ if para =~/^#{Rx[:meta]}\s*(.+?)\Z/m # for headers
d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta
if d_meta; plaintext_metadata(d_meta)
end
end
- if para !~/(^0~|<ENDNOTES>|<EOF>)/
+ if para !~/(^#{Rx[:meta]}|#{Mx[:br_eof]}|#{Mx[:br_endnotes]})/
if para =~@regx #/.+?<~\d+;\w\d+;\w\d+>.*/ #watch change
paranum=para[@regx,3]
@p_num=Format::Paragraph_number.new(paranum)
end
- @sto=Split_text_object.new(para).lev_segname_para_ocn
+ @sto=SiSU_Structure::Split_text_object.new(@md,para).txt
### problem in scroll, it appears tables are getting paragraph numbers
- m=/<~(\d+);(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/
+ m=/#{Mx[:id_o]}~(\d+);(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/
if para =~m \
and para=~/\S+/
para=case @sto.format
- when /^(1)~(?:(\S+))?/
+ when /^(1):(\S*?)/
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body1
- when /^(2)~(?:(\S+))?/
+ when /^(2):(\S*?)/
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body2
- when /^(3)~(?:(\S+))?/
+ when /^(3):(\S*?)/
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body3
- when /^(4)~(\S+)/ # work on see SiSU_text_parts::Split_text_object
+ when /^(4):(\S+?)/ # work on see SiSU_text_parts::Split_text_object
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body4
- when /^(5)~(?:(\S+))?/
+ when /^(5):(\S*?)/
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body5
- when /^(6)~(?:(\S+))?/
+ when /^(6):(\S*?)/
plaintext_structure(para,$1,@sto.ocn,$2)
@sto.lev_para_ocn.heading_body6
#when /^(i1)$/
@@ -427,17 +386,17 @@ WOK
elsif para =~/#{table_message}/
@plaintext[:body] << para << @br
elsif para =~/(Note|Endnotes?)/ \
- and para !~/<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/
+ and para !~/#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/
elsif para =~/(MetaData)/ \
- and para =~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ #debug 2003w46 add rc info ####suspect visit
+ and para =~/#{Mx[:id_o]}~(\d+);[um]\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #debug 2003w46 add rc info ####suspect visit
#formatMono=MonoSiSU.new('<br /><a name="metadata">MetaData</a>')
#para=formatMono.bold_para
elsif para.include? 'Owner Details' \
- and para !~/<~(\d+);(?:[oh]|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/
+ and para !~/#{Mx[:id_o]}~(\d+);(?:[oh]|[0-6]:)\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/
#formatMono=MonoSiSU.new('<br /><a name="owner.details">Owner Details</a>')
#@plaintext[:owner_details]=formatMono.bold_para
#para=''
- elsif para =~/(¡|<!Th?)/u #tables !
+ elsif para =~/(#{Mx[:tc_p]}|#{Mx[:gr_o]}Th?)/u #tables ! check
elsif para =~/(.*)<!#!>(.*)/
one,two=$1,$2
format_text=Format_text_object.new(one,two)
@@ -446,7 +405,7 @@ WOK
para='' if (para =~/<a name="n\d+">/ \
and para =~/^(-\{{2}~\d+|<!e[:_]\d+!>)/) # -endnote
case para
- when /<:i[1-9]>/
+ when /#{Mx[:pa_o]}:i[1-9]#{Mx[:pa_c]}/
if para =~/.*<:#>.*$/m
format_text=Format_text_object.new(para,'')
para=format_text.scr_indent_one_no_paranum
@@ -460,6 +419,7 @@ WOK
format_text=Format_text_object.new(one,two)
para=format_text.center
end
+ para.gsub!(/#{Mx[:id_o]}.+?#{Mx[:id_c]}/,' ') if para ## Clean Prepared Text
para.gsub!(/<!.+!>/,' ') if para ## Clean Prepared Text
para.gsub!(/<:\S+>/,' ') if para ## Clean Prepared Text
end