From 6a8a7c3cf32fa1920bfcd62c0702b14dd84e6d65 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Sat, 21 Sep 2013 00:05:24 -0400 Subject: v4: dal, objects, store book index as hash (simplify downstream processing) * merge v5 * db, book index, convert hash back to string (associated downstream processing change) --- data/doc/sisu/CHANGELOG_v4 | 4 ++ lib/sisu/v4/dal_doc_str.rb | 54 +++++++++++++++++++++++- lib/sisu/v4/dal_idx.rb | 101 ++++++++++++++------------------------------- lib/sisu/v4/db_import.rb | 30 +++++++++++--- 4 files changed, 112 insertions(+), 77 deletions(-) diff --git a/data/doc/sisu/CHANGELOG_v4 b/data/doc/sisu/CHANGELOG_v4 index 01053457..b1ed1011 100644 --- a/data/doc/sisu/CHANGELOG_v4 +++ b/data/doc/sisu/CHANGELOG_v4 @@ -30,6 +30,10 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_4.2.4.orig.tar.xz sisu_4.2.4.orig.tar.xz sisu_4.2.4-1.dsc +* dal, objects, store book index as hash (simplify downstream processing), and + associated downstream processing change from v5 + * db, book index, convert hash back to string + * minor cleaning %% 4.2.3.orig.tar.xz (2013-09-07:35/6) diff --git a/lib/sisu/v4/dal_doc_str.rb b/lib/sisu/v4/dal_doc_str.rb index 2a3e6b83..e4a0be2b 100644 --- a/lib/sisu/v4/dal_doc_str.rb +++ b/lib/sisu/v4/dal_doc_str.rb @@ -168,6 +168,55 @@ module SiSU_DAL_DocumentStructureExtract end [str,tags] end + def rgx_idx_ocn_seg + @rgx_idx_ocn_seg=/(.+?)\s*[+](\d+)/ + end + def construct_idx_array_and_hash(idxraw) + idx_array_raw=idxraw.scan(/[^;]+/) + idx_hash,idx_array,idx_lst={},[],[] + idx_array_raw.each do |idx| + idx_lst=case idx + when /\S+?\s*:/ + idx_couplet_tmp=[] + idx_couplet=idx.scan(/\s*[^:]+\s*/) + if idx_couplet[1] =~/[|]/ + idx_couplet_tmp << idx_couplet[0] << idx_couplet[1].scan(/\s*[^|]+\s*/) + else + idx_couplet_tmp << idx_couplet[0] << [idx_couplet[1]] + end + idx_couplet=idx_couplet_tmp + else [idx] + end + term_nodes=[] + idx_lst.each do |term_node| + case term_node + when String + term_node=term_node[0].chr.capitalize + term_node[1,term_node.length] + term_node=(term_node =~/.+?[+]\d+/) \ + ? term_node + : (term_node + '+0') + term_nodes << term_node + use,plus=rgx_idx_ocn_seg.match(term_node)[1,2] + @use=use.strip + idx_hash[@use]={ sub: [], plus: plus } unless idx_hash[@use] and defined? idx_hash[@use] + when Array + subterm_nodes=[] + term_node.each do |subterm_node| + subterm_node=(subterm_node =~/.+?[+]\d+/) \ + ? subterm_node + : (subterm_node + '+0') + subterm_nodes << subterm_node + sub,sub_plus=rgx_idx_ocn_seg.match(subterm_node)[1,2] + idx_hash[@use]={ sub: [], plus: 0 } unless idx_hash[@use] and defined? idx_hash[@use] + idx_hash[@use][:sub] << {sub.strip => { plus: sub_plus }} + end + term_nodes << subterm_nodes + end + end + idx_array << term_nodes + end + { hash: idx_hash, array: idx_array } + end def identify_parts tuned_file=[] @tuned_block,@tuned_code=[],[] @@ -205,9 +254,10 @@ module SiSU_DAL_DocumentStructureExtract idx=if t_o=~/^=\{(.+)\}\s*$\Z/m; m=$1 m=m.split(/\n/).join(' '). gsub(/\s+([|:;])\s+/,'\1'). - gsub(/\s+([+])\s+/,'\1') + gsub(/\s+([+]\d+)\s+/,'\1') t_o=t_o.gsub(/\n=\{.+\}\s*$\Z/m,'') - m + idx_array_and_hash=construct_idx_array_and_hash(m) + idx_array_and_hash[:hash] else nil end end diff --git a/lib/sisu/v4/dal_idx.rb b/lib/sisu/v4/dal_idx.rb index 96486858..5fbfe5df 100644 --- a/lib/sisu/v4/dal_idx.rb +++ b/lib/sisu/v4/dal_idx.rb @@ -84,13 +84,11 @@ module SiSU_DAL_BookIndex @seg=dob.name end if defined? dob.idx \ - and dob.idx.is_a?(String) \ - and not dob.idx.empty? - idx_array << "#{dob.idx}~#{dob.ocn}~#{@seg}" + and dob.idx.is_a?(Hash) + idx_array << {idx: dob.idx, ocn: dob.ocn, seg: @seg } end tuned_file << dob if dob end - idx_array=construct_idx_array(idx_array) if idx_array.length > 0 if idx_array.length > 0 the_idx=construct_book_index(idx_array) sisu_markup_idx_rel,sisu_markup_idx_rel_html_seg,html_idx,xhtml_idx=nil,nil,nil,nil @@ -101,61 +99,28 @@ module SiSU_DAL_BookIndex end [tuned_file,sisu_markup_idx_rel,sisu_markup_idx_rel_html_seg,html_idx,xhtml_idx] end - def construct_idx_array(idx_array) - idx_lst=[] - idx_array.each do |idx| - idx_list,ocn,seg=@rgx_idx_ocn_seg.match(idx)[1..3] - idx_lst <<=if idx_list =~/;/ - g=idx_list.scan(/[^;]+/) - idxl=[] - g.each do |i| - i=i.strip - idxl << { rough_idx: i, ocn: ocn, seg: seg } - end - idxl - else { rough_idx: idx_list, ocn: ocn, seg: seg } - end - end - idx_lst=idx_lst.flatten - end def construct_book_index(idx_array) the_idx={} idx_array.each do |idx| - if idx[:rough_idx] =~/[|]/ \ - && idx[:rough_idx] !~/[:]/ - if @md.opt.cmd =~/[MVv]/ - p 'book index error? --> ' + idx[:rough_idx] - end - else - idx_lst=idx[:rough_idx].scan(/[^|:]+/) - idx_lst[0]=idx_lst[0].strip - if idx_lst[0] =~/.+?\+\d+/ - use=/(.+?)\+(?:\d+)/.match(idx_lst[0])[1] - else use=idx_lst[0] - end - use=use[0].chr.capitalize + use[1,use.length] - the_idx[use]={} unless the_idx[use] and defined? the_idx[use] - idx_lst.each do |i| - i=i.strip - i,r=/(.+?)\+(\d+)/.match(i)[1,2] if i =~/.+?\+\d+/ - x=if idx_lst.length==1 or idx_lst[0].gsub(/\+\d+/,'')==i - the_idx[use]['term_node_lev1']=[] unless the_idx[use]['term_node_lev1'] and defined? the_idx[use]['term_node_lev1'] - x=if r - the_idx[use]['term_node_lev1'] << { ocn: idx[:ocn], range: "#{idx[:ocn]}-#{idx[:ocn].to_i+r.to_i}", seg: idx[:seg] } - "#{i} #{idx[:ocn]}-#{idx[:ocn].to_i+r.to_i}" - else - the_idx[use]['term_node_lev1'] << { ocn: idx[:ocn], seg: idx[:seg] } - "#{i} #{idx[:ocn]}" - end - else - the_idx[use]['term_node_lev2']={} unless the_idx[use]['term_node_lev2'] and defined? the_idx[use]['term_node_lev2'] - the_idx[use]['term_node_lev2'][i]=[] unless the_idx[use]['term_node_lev2'][i] and defined? the_idx[use]['term_node_lev2'][i] - x=if r - the_idx[use]['term_node_lev2'][i] << { ocn: idx[:ocn], range: "#{idx[:ocn]}-#{idx[:ocn].to_i+r.to_i}", seg: idx[:seg] } - "#{idx_lst[0]}:#{i} #{idx[:ocn]}-#{idx[:ocn].to_i+r.to_i}" - else - the_idx[use]['term_node_lev2'][i] << { ocn: idx[:ocn], seg: idx[:seg] } - "#{idx_lst[0]}:#{i} #{idx[:ocn]}" + idx[:idx].each_pair do |term,term_info| + location=(term_info[:plus].to_i > 0) \ + ? (%{#{idx[:ocn]}-#{idx[:ocn].to_i + term_info[:plus].to_i}}) + : idx[:ocn].to_s + the_idx[term]={} unless the_idx[term] and defined? the_idx[term] + the_idx[term]['node_0_terms']=[] unless the_idx[term]['node_0_terms'] and defined? the_idx[term]['node_0_terms'] + the_idx[term]['node_0_terms'] << { ocn: idx[:ocn], range: location, seg: idx[:seg] } + if term_info[:sub].is_a?(Array) \ + and term_info[:sub].length > 0 + term_info[:sub].each do |y| + y.each_pair do |subterm,subterm_info| + location=(subterm_info[:plus].to_i > 0) \ + ? (%{#{idx[:ocn]}-#{idx[:ocn].to_i + subterm_info[:plus].to_i}}) + : idx[:ocn].to_s + the_idx[term]={} unless the_idx[term] and defined? the_idx[term] + the_idx[term]['node_0_terms']=[] unless the_idx[term]['node_0_terms'] and defined? the_idx[term]['node_0_terms'] + the_idx[term]['node_1_subterms']={} unless the_idx[term]['node_1_subterms'] and defined? the_idx[term]['node_1_subterms'] + the_idx[term]['node_1_subterms'][subterm]=[] unless the_idx[term]['node_1_subterms'][subterm] and defined? the_idx[term]['node_1_subterms'][subterm] + the_idx[term]['node_1_subterms'][subterm] << { ocn: idx[:ocn], range: location, seg: idx[:seg] } end end end @@ -230,8 +195,8 @@ module SiSU_DAL_BookIndex p 'array error? -->' print x elsif x.is_a?(Hash) - if x['term_node_lev1'].is_a?(Array) - x['term_node_lev1'].each do |a| + if x['node_0_terms'].is_a?(Array) + x['node_0_terms'].each do |a| if a[:range] idx[:sst_rel_html_seg][@o]=idx[:sst_rel_html_seg][@o] + %{#{Mx[:lnk_o]}#{a[:range]}#{Mx[:lnk_c]}#{Mx[:rel_o]}/#{a[:seg]}.html##{a[:ocn]}#{Mx[:rel_c]}, } idx[:sst_rel][@t]=idx[:sst_rel][@t] + %{#{Mx[:lnk_o]}#{a[:range]}#{Mx[:lnk_c]}#{Mx[:rel_o]}#{a[:ocn]}#{Mx[:rel_c]}, } @@ -250,11 +215,9 @@ module SiSU_DAL_BookIndex idx[:html][@q]=idx[:html][@q] + '

' idx[:xhtml][@r]=idx[:xhtml][@r] + '

' end - if x['term_node_lev2'] - m=x['term_node_lev2'] - m=m.sort - m.each do |k,y| - if k !~/term_node_lev1/ + if x['node_1_subterms'] + x['node_1_subterms'].sort.each do |k,y| + if k !~/node_0_terms/ idx[:sst_rel_html_seg][@o]=idx[:sst_rel_html_seg][@o] + %{#{k}, } idx[:sst_rel][@t]=idx[:sst_rel][@t] + %{#{k}, } idx[:html][@q]=idx[:html][@q] + %{\n

#{k}, } @@ -298,8 +261,8 @@ module SiSU_DAL_BookIndex p 'array error? -->' print x elsif x.is_a?(Hash) - if x['term_node_lev1'].is_a?(Array) - x['term_node_lev1'].each do |a| + if x['node_0_terms'].is_a?(Array) + x['node_0_terms'].each do |a| if a[:range] print a[:range] + ', ' elsif a[:ocn] @@ -308,11 +271,9 @@ module SiSU_DAL_BookIndex end end end - if x['term_node_lev2'] - m=x['term_node_lev2'] - m=m.sort - m.each do |k,y| - if k !~/term_node_lev1/ + if x['node_1_subterms'] + x['node_1_subterms'].sort.each do |k,y| + if k !~/node_0_terms/ print "\n\t" + k + ', ' y.each do |z| if z[:range] diff --git a/lib/sisu/v4/db_import.rb b/lib/sisu/v4/db_import.rb index 240efd67..0384795a 100644 --- a/lib/sisu/v4/db_import.rb +++ b/lib/sisu/v4/db_import.rb @@ -209,6 +209,26 @@ module SiSU_DbImport end def pf_db_import_transaction_close end + def book_idx_hash_to_str(book_idx) + book_idx=book_idx ? book_idx : '' + book_idx_str,book_subidx_part='','' + if not book_idx.empty? + book_idx_str='' + book_idx.each_pair do |k0,v0| + book_idx_str << %{#{k0}+#{v0[:plus]}} + book_subidx_part='' + if v0[:sub].length > 0 + v0[:sub].each do |subterms| + subterms.each_pair do |k1,v1| + book_subidx_part << %{\n #{k1}+#{v1[:plus]} | } + end + end + book_idx_str=book_idx_str + ':' + book_subidx_part + end + end + end + book_idx_str + end def db_import_metadata #% import documents - populate database print %{ #{@cX.grey}import documents dbi_unit #{@cX.off} } if @opt.cmd =~/vVM/ @tp={} @@ -285,7 +305,7 @@ module SiSU_DbImport plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text(plaintext) - book_idx=data.idx ? data.idx : '' + book_idx=book_idx_hash_to_str(data.idx) @col[:book_idx]=clean_searchable_text(book_idx) if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last end @@ -321,7 +341,7 @@ module SiSU_DbImport plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text(plaintext) - book_idx=data.idx ? data.idx : '' + book_idx=book_idx_hash_to_str(data.idx) @col[:book_idx]=clean_searchable_text(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @@ -353,7 +373,7 @@ module SiSU_DbImport plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text(plaintext) - book_idx=data.idx ? data.idx : '' + book_idx=book_idx_hash_to_str(data.idx) @col[:book_idx]=clean_searchable_text(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @@ -383,7 +403,7 @@ module SiSU_DbImport plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text(plaintext) - book_idx=data.idx ? data.idx : '' + book_idx=book_idx_hash_to_str(data.idx) @col[:book_idx]=clean_searchable_text(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @@ -441,7 +461,7 @@ module SiSU_DbImport plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text(plaintext) - book_idx=data.idx ? data.idx : '' + book_idx=book_idx_hash_to_str(data.idx) @col[:book_idx]=clean_searchable_text(book_idx) t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint) @tuple_array << t.tuple -- cgit v1.2.3