about summary refs log tree commit diff homepage
path: root/lib/sisu/v5
diff options
context:
space:
mode:
author Ralph Amissah <ralph@amissah.com> 2014-09-28 21:46:45 -0400
committer Ralph Amissah <ralph@amissah.com> 2014-09-28 22:09:38 -0400
commit 33b94cc15e25dedcc6fb93d00942b97823090a4b (patch)
tree 5b801f54842151af111e1cffc6a57e1f04ef84c4 /lib/sisu/v5
parent v5 v6: manifest, renamed html_manifest (diff)
v5 v6: sql, clean searchable text
* update for (ao/dal) text representation, fix legacy action
Diffstat (limited to 'lib/sisu/v5')
-rw-r--r-- lib/sisu/v5/db_columns.rb 2
-rw-r--r-- lib/sisu/v5/db_import.rb 26
-rw-r--r-- lib/sisu/v5/db_sqltxt.rb 28
3 files changed, 38 insertions, 18 deletions
diff --git a/lib/sisu/v5/db_columns.rb b/lib/sisu/v5/db_columns.rb
index 44d45e95..15341042 100644
--- a/lib/sisu/v5/db_columns.rb
+++ b/lib/sisu/v5/db_columns.rb
@@ -81,7 +81,7 @@ module SiSU_DbColumns
@sisutxt=special_character_escape(src)
else @sisutxt=''
end
- @fulltext=clean_searchable_text(txt_arr)
+ @fulltext=clean_searchable_text_from_document_objects(txt_arr)
else @sisutxt,@fulltext='',''
end
end
diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb
index a7f33939..8a500f8a 100644
--- a/lib/sisu/v5/db_import.rb
+++ b/lib/sisu/v5/db_import.rb
@@ -292,7 +292,7 @@ module SiSU_DbImport
src=txt_arr.join("\n")
src=special_character_escape(src)
@tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', "
- txt=clean_searchable_text(txt_arr)
+ txt=clean_searchable_text_from_document_source(txt_arr)
#txt=special_character_escape(txt)
@tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "
end
@@ -374,9 +374,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last
end
if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last
@@ -425,9 +425,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -470,9 +470,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -501,9 +501,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -532,9 +532,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -603,9 +603,9 @@ module SiSU_DbImport
@col[:body]=special_character_escape(body)
plaintext=@col[:body].dup
plaintext=strip_markup(plaintext)
- @col[:plaintext]=clean_searchable_text(plaintext)
+ @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
book_idx=book_idx_hash_to_str(data.idx)
- @col[:book_idx]=clean_searchable_text(book_idx)
+ @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint)
@tuple_array << t.tuple
@en,@en_ast,@en_pls=[],[],[]
diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb
index 99d417e1..adb2b0f0 100644
--- a/lib/sisu/v5/db_sqltxt.rb
+++ b/lib/sisu/v5/db_sqltxt.rb
@@ -71,7 +71,27 @@ module SiSU_DbText
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
end
- def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
+ def clean_searchable_text_from_document_objects(arr)
+ txt_arr,en=[],[]
+ arr=(arr.is_a?(String)) ? [ arr ] : arr
+ arr.each do |s|
+ s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
+ gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+ gsub(/<br>/m,' ')
+ en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+ s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
+ gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+ gsub(/ \s+/m,' ')
+ #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
+ s
+ end
+ txt_arr << arr << en
+ #txt_arr=txt_arr.flatten
+ txt=txt_arr.flatten.join("\n")
+ txt=special_character_escape(txt)
+ txt
+ end
+ def clean_searchable_text_from_document_source(arr)
txt_arr,en=[],[]
arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
arr.each do |s|
@@ -107,14 +127,14 @@ module SiSU_DbText
gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
gsub(/<br>/m,' ')
- en << s.scan(/~\{\s*(.+?)\s*\}~/m)
+ #en << s.scan(/~\{\s*(.+?)\s*\}~/m)
s=s.gsub(/~\{.+?\}~/m,'').
gsub(/ \s+/m,' ')
- #special_character_escape(s)
+ ##special_character_escape(s)
+ #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
s
end
txt_arr << arr << en
- #txt_arr=txt_arr.flatten
txt=txt_arr.flatten.join("\n")
txt=special_character_escape(txt)
txt