aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/v4/db_sqltxt.rb
blob: 53c15ed34d22d8c75077f05502a9e47f70ccf16c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# encoding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search

 * Author: Ralph Amissah

 * Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   2007, 2008, 2009, 2010, 2011, 2012 Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have an Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.sisudoc.org/sisu/en/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: preparation of document text for SQL database import (escaping, markup stripping)

=end
module SiSU_DbText
  # Helpers that turn document text into strings suitable for storing in, and
  # searching from, an SQL database (the inline notes mention sqlite and pgsql).
  #
  # Several methods interpolate entries of the Mx marker table into their
  # regular expressions; Mx is defined outside this file.
  class Prepare
    # Escapes quote and backslash characters in +str+ and reduces internal
    # line-break, tag and link markers to plain text.
    #
    # str:: String to process (it is not modified in place).
    #
    # Returns a new String.
    def special_character_escape(str)
      str.gsub(/'/,"''").                                   # double every single quote
        gsub(/(\\)/m,'\1\1').                               # double every backslash (original note: ok but with warnings on sqlite)
        gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"<br />\n").   # break markers become an html break plus newline
        gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'').          # drop tag markers together with their content (original note: check)
        gsub(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/,'[image: \1] \2'). # image link -> "[image: file] rest"
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').       # file/ftp link -> link text, keeping trailing punctuation
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')                  # url link -> link text
    end

    # Produces clean, searchable plain text from document source.
    #
    # arr:: either a String (split into lines on runs of newlines) or an
    #       Array of Strings.
    #
    # Each line has its inline markup removed; the contents of ~{ ... }~
    # spans are collected separately and appended after the cleaned lines.
    # The result is joined with newlines and passed through
    # special_character_escape.
    #
    # NOTE(review): @md is read below but is not assigned anywhere in this
    # class as shown here; presumably it is provided by a subclass or by the
    # caller's context - confirm before relying on the ":A~" branch.
    #
    # Returns a String.
    def clean_searchable_text(arr)
      endnotes=[]
      lines=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
      # map (not each): the cleaned value of every line has to be kept. With
      # each, the reassigned block-local was discarded and the raw source
      # lines were emitted instead.
      cleaned=lines.map do |s|
        s=s.gsub(/([*\/_-])\{(.+?)\}\1/m,'\2').             # *{x}* /{x}/ _{x}_ -{x}- -> x
          gsub(/^(?:block|group|poem|code)\{/m,'').         # opening block{ group{ poem{ code{
          gsub(/^\}(?:block|group|poem|code)/m,'').         # closing }block }group }poem }code
          gsub(/\A(?:@\S+:\s+.+)\Z/m,'')                    # a string that is entirely an "@name: value" header
        if s =~/^:A~/
          if defined?(@md.creator) \
          && defined?(@md.creator.author) \
          && !@md.creator.author.empty?
            s=s.gsub(/@author/,@md.creator.author)
          else
            SiSU_Screen::Ansi.new('v','WARNING Document Author information missing; provide @creator: :author:',@md.fnb).warn unless @md.opt.cmd.inspect =~/q/
          end
          if defined?(@md.title) \
          && defined?(@md.title.full) \
          && !@md.title.full.empty?
            s=s.gsub(/@title/,@md.title.full)
          else
            SiSU_Screen::Ansi.new('v','WARNING Document Title missing; provide @title:',@md.fnb).warn unless @md.opt.cmd.inspect =~/q/
          end
        end
        s=s.gsub(/^(?:_[1-9]\*?|_\*)\s+/m,'').              # leading "_1" .. "_9", "_1*" or "_*" prefixes
          gsub(/^(?:[1-9]\~(\S+)?)\s+/m,'').                # leading "1~" .. "9~" with optional name
          gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').              # leading "A~" .. "C~" / ":A~" .. ":C~" with optional name
          gsub(/^%{1,3} .+/m,'').                           # lines starting with 1-3 "%" and a space; removed even if contained in code block
          gsub(/<br>/m,' ')
        endnotes << s.scan(/~\{\s*(.+?)\s*\}~/m)            # keep the text of every ~{ ... }~ span
        s.gsub(/~\{.+?\}~/m,'').                            # then remove the spans from the running text
          gsub(/ \s+/m,' ')                                 # collapse whitespace runs that start with a space
      end
      txt=[cleaned,endnotes].flatten.join("\n")
      special_character_escape(txt)
    end

    # Strips markup from +str+ (original note: define rules, make same as in
    # dal clean).
    #
    # str:: String to process (it is not modified in place).
    #
    # Returns a new String with superscript markers rewritten as "[n]",
    # non-breaking spaces and table markers replaced by spaces, anything of
    # the form <...> removed, image links replaced by "[image]", whitespace
    # runs collapsed and surrounding whitespace stripped.
    def strip_markup(str)
      str.gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]').
        gsub(/(?:&nbsp\\;|#{Mx[:nbsp]})+/,' ').
        gsub(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u,'\1').         #tables
        gsub(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u,' ').                          #tables
        gsub(/#{Mx[:tc_p]}/u,' ').                                                     #tables tidy later
        gsub(/<.+?>/,'').
        # scheme separator is "://" (was written "//:" and so never matched a file/ftp url)
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /,' [image] '). # else image names found in search
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,' [image]'). # else image names found in search
        gsub(/\s\s+/,' ').
        strip
    end

    # Builds a space-separated list of the distinct words found in +str+.
    #
    # A word is a run of two or more characters from a-z, A-Z, 0-9,
    # backslash, slash, underscore and hyphen. Words are de-duplicated
    # (case-sensitively) and sorted.
    #
    # Returns a String.
    def unique_words(str)
      str.scan(/[a-zA-Z0-9\\\/_-]{2,}/).uniq.sort.join(' ')
    end
  end
end
__END__