# encoding: utf-8
=begin

* Name: SiSU

** Description: documents, structuring, processing, publishing, search
*** system environment, resource control and configuration details

** Author: Ralph Amissah
  [ralph@amissah.com]
  [ralph.amissah@gmail.com]

** Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
  2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Ralph Amissah,
  All Rights Reserved.

** License: GPL 3 or later:

  SiSU, a framework for document structuring, publishing and search

  Copyright (C) Ralph Amissah

  This program is free software: you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the Free
  Software Foundation, either version 3 of the License, or (at your option)
  any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along with
  this program. If not, see [http://www.gnu.org/licenses/].

  If you have Internet connection, the latest version of the GPL should be
  available at these locations:
  [http://www.fsf.org/licensing/licenses/gpl.html]
  [http://www.gnu.org/licenses/gpl.html]

** SiSU uses:
  * Standard SiSU markup syntax,
  * Standard SiSU meta-markup syntax, and the
  * Standard SiSU object citation numbering and system

** Homepages:
  [http://www.jus.uio.no/sisu]
  [http://www.sisudoc.org]

** Git
  [http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary]
  [http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/db_sqltxt.rb;hb=HEAD]

=end
module SiSU_DbText
  # Prepares document text for insertion into SQL tables: strips internal
  # markup markers (looked up in the Mx table, which is defined outside this
  # file) and escapes characters that are special inside SQL string literals.
  class Prepare
    # Escapes +str+ for use inside a single-quoted SQL string and reduces
    # link/image/tag markup to plain text.
    #
    # str:: any object; it is converted with +to_s+ first.
    # Returns a new String.
    def special_character_escape(str)
      # Notes kept from earlier attempts at backslash handling:
      #   doubling the backslash (current rule) is ok but gives warnings on sqlite;
      #   gsub(/[\\]/m,'\\x5C') was ok with warnings, but not for sqlite;
      #   gsub(/(\\)/m,'\1') was ok for sqlite, not for pgsql.
      # NOTE(review): line-break markers are replaced by two consecutive
      # newlines as written here; confirm that this is the intended
      # representation of a line break in the stored text.
      str.to_s.
        gsub(/'/m,"''").
        gsub(/(\\)/m,'\1\1').
        gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"\n\n").
        gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check
        gsub(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/m,'[image: \1] \2').
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/m,'\1\2').
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/m,'\1')
    end

    # Builds searchable text from processed document objects: font-attribute
    # markers and endnote markup are removed from the body, and the text of
    # the "a" style endnotes is appended after the body, one entry per line.
    #
    # arr:: a String or an Array of Strings.
    # Returns the SQL-escaped String (see special_character_escape).
    def clean_searchable_text_from_document_objects(arr)
      objects=arr.is_a?(String) ? [arr] : arr
      endnotes=[]
      bodies=objects.map do |obj|
        txt=obj.
          gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
          gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
          gsub(/\n/m,' ')
        # collect the endnote text before it is removed from the body
        endnotes << txt.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
        txt.
          gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
          gsub(/ \s+/m,' ')
      end
      special_character_escape((bodies + endnotes).flatten.join("\n"))
    end

    # Builds the stored body text from processed document objects: each "a"
    # style endnote is replaced in the body by its leading number, "b" style
    # endnotes are dropped, and the endnote texts are appended after the body.
    # Entries are separated by two newlines.
    #
    # arr:: a String or an Array of Strings.
    # Returns the SQL-escaped String (see special_character_escape).
    def clean_document_objects_body(arr)
      objects=arr.is_a?(String) ? [arr] : arr
      endnotes=[]
      bodies=objects.map do |obj|
        endnotes << obj.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
        obj.
          gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'\1').
          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
          gsub(/ \s+/m,' ')
      end
      # make sure exactly one space follows the endnote number
      numbered=endnotes.flatten.map { |note| note.sub(/^(\d+)\s*/,'\1 ') }
      special_character_escape((bodies + numbered).flatten.join("\n\n"))
    end

    # Builds searchable text from SiSU markup source: emphasis wrappers,
    # block delimiters, header lines, indent and heading markers, comment
    # lines and inline endnotes are removed line by line.
    #
    # The cleaned lines are collected with +map+. Iterating with +each+, as
    # this method previously did, discards the value of the block, so the
    # untouched input was what got joined and returned.
    #
    # arr:: a String (split on runs of newlines) or an Array of Strings.
    # Returns the SQL-escaped String (see special_character_escape).
    def clean_searchable_text_from_document_source(arr)
      lines=arr.is_a?(String) ? arr.split(/\n+/m) : arr
      cleaned=lines.map do |line|
        txt=line.
          gsub(/([*\/_-])\{(.+?)\}\1/m,'\2').
          gsub(/^(?:block|group|poem|code)\{/m,'').
          gsub(/^\}(?:block|group|poem|code)/m,'').
          gsub(/\A(?:@\S+:\s+.+)\Z/m,'')
        txt=fill_in_heading_metadata(txt) if txt =~/^:A~/
        txt.
          gsub(/^(?:_[1-9]\*?|_\*)\s+/m,'').
          gsub(/^(?:[1-9]\~(\S+)?)\s+/m,'').
          gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
          gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
          gsub(/\n/m,' ').
          gsub(/~\{.+?\}~/m,'').
          gsub(/ \s+/m,' ')
      end
      special_character_escape(cleaned.join("\n"))
    end

    # Reduces marked-up text to plain text. #define rules, make same as in dal clean
    #
    # str:: a String.
    # Returns a new String with markup removed, runs of whitespace collapsed
    # to one space, and leading/trailing whitespace stripped.
    def strip_markup(str)
      str.
        gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]').
        gsub(/(?: \\;|#{Mx[:nbsp]})+/,' ').
        gsub(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u,'\1'). #tables
        gsub(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u,' '). #tables
        gsub(/#{Mx[:tc_p]}/u,' '). #tables tidy later
        gsub(/<.+?>/,'').
        # The scheme separator is "://", as in special_character_escape; the
        # previous pattern had it transposed ("//:") and so never matched a
        # file:// or ftp:// image link.
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /,' [image] '). # else image names found in search
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,' [image]'). # else image names found in search
        gsub(/\s\s+/,' ').
        strip
    end

    # Returns the distinct words of +str+, sorted and joined by single spaces.
    # A word is a run of two or more ASCII letters, digits, backslashes,
    # slashes, underscores or hyphens.
    def unique_words(str)
      str.scan(/[a-zA-Z0-9\\\/_-]{2,}/).uniq.sort.join(' ')
    end

    private

    # Substitutes the @author and @title placeholders of a top-level heading
    # line with values read from @md, warning when a value is missing.
    #
    # @md is not assigned anywhere in this class; when it is nil the line is
    # returned unchanged rather than raising NoMethodError on nil.
    def fill_in_heading_metadata(line)
      return line if @md.nil?
      if defined?(@md.creator) \
      && defined?(@md.creator.author) \
      && !@md.creator.author.empty?
        line=line.gsub(/@author/,@md.creator.author)
      else
        warn_missing_metadata('WARNING Document Author information missing; provide @creator: :author:')
      end
      if defined?(@md.title) \
      && defined?(@md.title.full) \
      && !@md.title.full.empty?
        line=line.gsub(/@title/,@md.title.full)
      else
        warn_missing_metadata('WARNING Document Title missing; provide @title:')
      end
      line
    end

    # Emits +message+ through SiSU_Screen unless the quiet option is on.
    def warn_missing_metadata(message)
      return if @md.opt.act[:quiet][:set]==:on
      SiSU_Screen::Ansi.new(
        'v',
        message,
        @md.fnb
      ).warn
    end
  end
end
__END__