aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/v0/concordance.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v0/concordance.rb')
-rw-r--r--lib/sisu/v0/concordance.rb49
1 files changed, 36 insertions, 13 deletions
diff --git a/lib/sisu/v0/concordance.rb b/lib/sisu/v0/concordance.rb
index 1b777bb5..f62b20ac 100644
--- a/lib/sisu/v0/concordance.rb
+++ b/lib/sisu/v0/concordance.rb
@@ -183,18 +183,18 @@ WOK
@path="#{@env.path.output}/#{@md.fnb}"
@freq=Hash.new(0)
@dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern
- @rxp_to=Regexp.new("<~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+><#@dp:#@dp>$")
- @rxp_lv1=Regexp.new('^1~') #line start markers removed, ('^1~') for exceptions <!pn!>\n\n4{{{
- @rxp_lv2=Regexp.new('^2~')
- @rxp_lv3=Regexp.new('^3~')
- @rxp_seg=Regexp.new('^4~(.+?)\s+')
- @rxp_title=Regexp.new('^0~title\s*(.+?)\s*$')
+ @rxp_to=Regexp.new("#{Mx[:id_o]}~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}|#{Mx[:id_o]}\S+?#{Mx[:id_c]}$")
+ @rxp_lv1=/^#{Mx[:lv_o]}1:/
+ @rxp_lv2=/^#{Mx[:lv_o]}2:/
+ @rxp_lv3=/^#{Mx[:lv_o]}3:/
+ @rxp_seg=/^#{Mx[:lv_o]}4:(\S+?)#{Mx[:lv_c]}/
+ @rxp_title=Regexp.new("^#{Mx[:meta_o]}title#{Mx[:meta_c]}\s*(.+?)\s*$")
@rxp_t1=Regexp.new('^T1')
@rxp_t2=Regexp.new('^T2')
@rxp_t3=Regexp.new('^T3')
- @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/mi
- @rxp_excluded0=/^(?:to\d+|\d+|&nbsp;|EOF|thumb_\S+|snap_\S+|_+|-+|ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)$/mi #this regex causes and cures a stack dump in ruby 1.9 !!!
- @rgx_scanlist=%r{(?:<i>(?:[a-zA-Z0-9"\s]){2,7}</i>|<b>(?:[a-zA-Z0-9"\s]){2,7}</b>|(?:https?|file)://\S+)|code\{.+?\}code|<\S+?>|\w+}mi
+ @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/
+ @rxp_excluded0=/^(?:#{Mx[:fa_bold_o]}|#{Mx[:fa_italics_o]})?(?:to\d+|\d+|&nbsp;|#{Mx[:br_endnotes]}|EOF|#{Mx[:br_eof]}|thumb_\S+|snap_\S+|_+|-+|[(]?(?:ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx)[).]?|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)(?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})?$/mi #this regex causes and cures a stack dump in ruby 1.9 !!!
+ @rgx_scanlist=%r{#{Mx[:fa_italics_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_italics_c]}|#{Mx[:fa_bold_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_bold_c]}|(?:https?|file)://\S+|#{Mx[:gr_o]}code#{Mx[:gr_o]}.+?#{Mx[:gr_o]}code-end#{Mx[:gr_o]}|<\S+?>|#{Mx[:id_o]}\S+?#{Mx[:id_c]}|\w+|[a-zA-Z]+}mi
rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error
end
end
@@ -231,20 +231,39 @@ WOK
@seg,toy=nil,nil
@word_map={}
@dal_array.each do |line|
- if line !~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ # lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated
- if line =~@rxp_seg; @seg=line[@rxp_seg,1]
+ if line !~/#{Mx[:id_o]}~(\d+);[um]\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated
+ if line =~@rxp_seg; @seg=line[@rxp_seg,1]
end
- if line =~@rxp_to; toy=line[@rxp_to,1]
+ if line =~@rxp_to; toy=line[@rxp_to,1]
end
if toy =~/\d+/ \
and toy !~/^0$/
for word in line.scan(@rgx_scanlist) #%take in word or other match
+ #word.gsub!(@rxp_clean,'')
+ word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,'')
+ word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'')
+ word.gsub!(/#{Mx[:gl_o]}#[a-z]+#{Mx[:gl_c]}/,'')
+ word.gsub!(/#{Mx[:gl_o]}#[0-9]+#{Mx[:gl_c]}/,'')
+ word.gsub!(/^\S$/,'')
+ word=nil if word.empty?
word=nil if word =~@rxp_excluded0 #watch
word=nil if word =~@rxp_excluded1 #watch
+ word=nil if word =~/^\S$/
if word
- #word.gsub!(/<\/?[i]>/,'')
+ word.gsub!(/#{Mx[:br_nl]}|#{Mx[:br_line]}/,' ')
+ word.gsub!(/#{Mx[:lv_o]}\d:\S*?#{Mx[:lv_c]}/,'')
+ word.gsub!(/#{Mx[:pa_o]}:i\d#{Mx[:pa_c]}/,'')
+ word.gsub!(/#{Mx[:id_o]}~\d+;\S+?#{Mx[:id_c]}/,'')
+ word.gsub!(/#{Mx[:fa_o]}[a-z]{1,7}#{Mx[:fa_o_c]}|#{Mx[:fa_c_o]}[a-z]{1,7}#{Mx[:fa_c]}/,'')
+ word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}:[0-9a-f]{32}|[0-9a-f]{64}:[0-9a-f]{64})#{Mx[:mk_c]}/,'')
+ word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}|[0-9a-f]{64})#{Mx[:mk_c]}/,'')
+ word.gsub!(/#{Mx[:en_a_o]}(?:\d|[*+])*|#{Mx[:en_b_o]}(?:\d|[*+])*|#{Mx[:en_a_c]}|#{Mx[:en_b_c]}/mi,'')
+ word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,''); word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'')
+ #word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_c]}/,'') #watch
word.gsub!(/<\/?\S+?>/,'')
+ word.gsub!(/^\@+/,'')
word.strip!
+ word.gsub!(/#{Mx[:tc_p]}.+/,'')
word.gsub!(/[\.,;:"]$/,'')
word.gsub!(/["]/,'')
word.gsub!(/^\s*[\(]/,'')
@@ -252,8 +271,12 @@ WOK
word.gsub!(/^(?:See|e\.?g\.?).+/,'')
word.gsub!(/^\s*[.,;:]\s*/,'')
word.strip!
+ word.gsub!(/^\(?[a-zA-Z]\)$/,'')
word.gsub!(/^\d+(st|nd|rd|th)$/,'')
word.gsub!(/^(\d+\.?)+$/, '')
+ word.gsub(/#{Mx[:mk_o]}|#{Mx[:mk_c]}/,'')
+ word.gsub!(/^\S$/,'')
+ word=nil if word =~/^\S$/
word=nil if word =~/^\s*$/ #watch
if word
unless word =~/[A-Z][A-Z]/ \