# coding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
#___#

 * Author: Ralph Amissah

 * Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   2007, 2008, 2009, 2010 Ralph Amissah All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:

 * Download:

 * Ralph Amissah

 ** Description: preparation of clean, escaped plain text from document
    markup (module SiSU_DB_text)

=end
module SiSU_DB_text
  # Text clean-up helpers that turn marked-up document text into plain text.
  #
  # All three methods edit the String objects they are given in place
  # (gsub!/strip!) and also return the result.
  #
  # Mx is referenced but not defined in this file; it is indexed with symbols
  # and its values are interpolated unescaped into regular expressions
  # (presumably the internal markup delimiter strings -- confirm at its
  # definition).
  class Prepare
    # Escapes single quotes by doubling them and reduces line-break, tag and
    # link markup to plain text.
    #
    # @param str [String] text to clean; modified in place
    # @return [String] the same (mutated) string object
    def special_character_escape(str)
      str.gsub!(/'/, "''") # earlier variants tried: "\047" and "\\'"
      # NOTE(review): the replacement is two newlines, as found in the source;
      # an HTML line-break tag may have been lost from this literal -- confirm
      # against upstream.
      str.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/, "\n\n")
      str.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/, '') #check
      # image link -> "[image: <file name>] <rest of link text>"
      str.gsub!(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/, '[image: \1] \2')
      # file:// or ftp:// link -> link text, keeping trailing punctuation/space
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/, '\1\2')
      # link with a delimited url -> link text only
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/, '\1')
      str
    end

    # Produces clean, searchable plain text from document source.
    #
    # Each element of +arr+ is stripped of source markup in place; the text of
    # inline notes written as ~{ ... }~ is collected and appended after the
    # body text. The pieces are joined with newlines and the result is passed
    # through #special_character_escape.
    #
    # @param arr [Array<String>] document source objects; elements are
    #   modified in place
    # @return [String] the joined, escaped plain text
    def clean_searchable_text(arr)
      txt_arr, en = [], []
      arr.each do |s|
        s.gsub!(/([*\/_-])\{(.+?)\}\1/, '\2') # *{..}* /{..}/ _{..}_ -{..}- -> inner text
        s.gsub!(/^(?:group|poem|code)\{/, '')
        s.gsub!(/^\}(?:group|poem|code)/, '')
        s.gsub!(/\A(?:@\S+:\s+.+)\Z/m, '') # drop objects of the form "@name: ..."
        if s =~ /^:A~/
          # NOTE(review): @md is not assigned anywhere in the visible class; if
          # it is nil this branch raises NoMethodError -- confirm it is set
          # elsewhere.
          s.gsub!(/@author/, @md.creator.author)
          s.gsub!(/@title/, @md.title.full)
        end
        s.gsub!(/^(?:_[1-9]\*?|_\*)\s+/, '')
        s.gsub!(/^(?:[1-9]\~(\S+)?)\s+/, '')
        s.gsub!(/^(?::?[A-C]\~(\S+)?)\s+/, '')
        s.gsub!(/^%{1,3} .+/, '') #removed even if contained in code block
        # NOTE(review): matches a newline, as found in the source; this pattern
        # may originally have been an HTML line-break tag -- confirm against
        # upstream.
        s.gsub!(/\n/, ' ')
        en << s.scan(/~\{\s*(.+?)\s*\}~/) # keep note text before removing the notes
        s.gsub!(/~\{.+?\}~/, '')
        s.gsub!(/ \s+/, ' ')
        #special_character_escape(s)
        s
      end
      txt_arr << arr << en
      txt = txt_arr.flatten.join("\n") # body text first, then collected notes
      special_character_escape(txt)
    end

    # Strips remaining markup from a string: superscript numbers become [n],
    # table delimiters and non-breaking-space markers become spaces,
    # <...> tags are removed, image links become "[image]" and whitespace is
    # squeezed and trimmed.
    #
    # @param str [String] text to strip; modified in place
    # @return [String] the same (mutated) string object
    def strip_markup(str) #define rules, make same as in dal clean
      str.gsub!(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/, '[\1]')
      str.gsub!(/(?: \\;|#{Mx[:nbsp]})+/, ' ')
      str.gsub!(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u, '\1') #tables
      str.gsub!(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u, ' ') #tables
      str.gsub!(/#{Mx[:tc_p]}/u, ' ') #tables tidy later
      str.gsub!(/<.+?>/, '')
      # Image links are collapsed, else image names are found in search.
      # The scheme separator is "://" (it was previously written "//:", which
      # could never match a file:// or ftp:// target).
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /, ' [image] ')
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/, ' [image]')
      str.gsub!(/\s\s+/, ' ')
      str.strip!
      str
    end
  end
end