From 65477054fd798728bf186aa2938727ddddbe86a5 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Tue, 22 May 2007 02:06:46 +0100 Subject: Imported upstream version 0.52.7 --- lib/sisu/0.52/concordance.rb | 311 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 lib/sisu/0.52/concordance.rb (limited to 'lib/sisu/0.52/concordance.rb') diff --git a/lib/sisu/0.52/concordance.rb b/lib/sisu/0.52/concordance.rb new file mode 100644 index 00000000..5f251830 --- /dev/null +++ b/lib/sisu/0.52/concordance.rb @@ -0,0 +1,311 @@ +=begin + * Name: SiSU information Structuring Universe - Structured information, Serialized Units + * Author: Ralph Amissah + * http://www.jus.uio.no/sisu + * http://www.jus.uio.no/sisu/SiSU/download.html + + * Description: concordance file (html concordance, wordmap, linked index of words in document) + + * Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah + + * License: GPL 2 or later + + Summary of GPL 2 + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + If you have Internet connection, the latest version of the GPL should be + available at these locations: + http://www.fsf.org/licenses/gpl.html + http://www.gnu.org/copyleft/gpl.html + http://www.jus.uio.no/sisu/gpl2.fsf + + SiSU was first released to the public on January 4th 2005 + + SiSU uses: + + * Standard SiSU markup syntax, + * Standard SiSU meta-markup syntax, and the + * Standard SiSU object citation numbering and system + + © Ralph Amissah 1997, current 2007. + All Rights Reserved. + + * Ralph Amissah: ralph@amissah.com + ralph.amissah@gmail.com +=end +module SiSU_Concordance + require SiSU_lib + '/param' + require SiSU_lib + '/sysenv' + require SiSU_lib + '/defaults' + require SiSU_lib + '/dal' + include SiSU_Param + include SiSU_Env + include SiSU_Viz + require SiSU_lib + '/html_format_css' + include SiSU_HTML_Format + class Source + def initialize(opt) + @opt=opt + end + def read + begin + @md=SiSU_Param::Parameters.new(@opt).get + @env=SiSU_Env::Info_env.new(@md.fns) + loc=@env.url.output_tell + tool=if @md.cmd =~/[MVv]/; "#{@env.program.web_browser} #{loc}/#{@md.fnb}/#{@md.fn[:concordance]}" + else '' + end + tell=SiSU_Screen::Ansi.new(@md.cmd,"Concordance",tool) + tell.grey_title_hi unless @md.cmd =~/q/ + wordmax=200000 + unless @md.wc_words.nil? 
+ if @md.wc_words < wordmax + SiSU_Concordance::Source::Words.new(@md).songsheet + else + tell=SiSU_Screen::Ansi.new(@md.cmd,"concordance skipped, large document has over #{wordmax} words (#{@md.wc_words})") + tell.warn unless @md.cmd =~/q/ + end + else + tell=SiSU_Screen::Ansi.new(@md.cmd,"wc (word count) is off, concordance will be processed for all files including those over the max set size of: #{wordmax} words") + tell.warn unless @md.cmd =~/q/ + SiSU_Concordance::Source::Words.new(@md).songsheet + end + rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error + ensure + end + end + private + class Doc_title + require SiSU_lib + '/param' + include SiSU_Param + include SiSU_Viz + def initialize(lnk,md) + @md=md + @vz=SiSU_Env::Get_init.instance.skin + file_array=IO.readlines(@md.fns,'') + txt_path=%{#{@md.dir_out}} + SiSU_Env::Info_skin.new(@md).select + @md_title=@md.title + @fnb=@md.fnb + @env=SiSU_Env::Info_env.new + @lex_button=%{SiSU home -->} + @lnk=lnk + @doc_details =< 

Manifest #{@md.title}

            TOC TOC - table of contents for individual articles

            Full Text Full text (with indexed table of contents)

            PDF portrait pdf version of the document (portrait)

            PDF landscape pdf version of the document (landscape)

Word index links are to html versions of the text: the segmented version, followed by the scroll (single document) version.
[For segmented text, references [T1], [T2] or [T3] appearing without a link indicate that the word appears in a title (or subtitle) of the text (that is identifiable by the appended object citation number).]

+WOK + end + def create + < + + + + SiSU created WordIndex for: #{@md.dc_title} + + + + + + + + + #{@vz.js_head} + + + #{@vz.js_top} + + + +
+ #{@vz.banner_home_button_only} + + #{@env.widget_static.search_form} +
+ #@doc_details

(The word listing/index is Case sensitive: Capitalized words appear before lower case)

+

+ word (number of occurrences)
linked references to word within document
+ [if number of occurrences exceeds number of references - word occurs more than once in at least one reference. Footnotes/endnotes are either assigned to the paragraph from which they are referenced or ignored, so it is relevant to check the footnotes referenced from within a paragraph as well.] +

+

+ (After the page is fully loaded) you can jump directly to a word by appending a hash (#) and the word to the url for this text, (do not forget that words are case sensitive, and may be listed twice (starting with and without an upper case letter)), #your_word # [ http://[web host]/#@fnb/concordance.html#your_word ] +

+WOK + end + end + class Word + @@word_previous='' + def initialize(word,freq) + @word,@freq=word,freq + end + def html + w=if @word.capitalize==@@word_previous + %{\n

#@word

(#@freq)

\n\t

} + else n=@word.strip.gsub(/\s+/,'_') #also need to convert extended character set to html + %{\n

#@word

(#@freq)

\n\t

} + end + @@word_previous=@word.capitalize + w + end + end + class Words + require SiSU_lib + '/defaults' + require SiSU_lib + '/param' + include SiSU_Viz + include SiSU_Param + require SiSU_lib + '/html_format_css' + include SiSU_HTML_Format + require SiSU_lib + '/vocab' + require SiSU_lib + '/sysenv' + include SiSU_Screen + include Wordlists + @@dp=nil + def initialize(md) + begin + @vz=SiSU_Env::Get_init.instance.skin + @md=md + @env=SiSU_Env::Info_env.new(@md.fns) + @path="#{@env.path.output}/#{@md.fnb}" + @dal_array=SiSU_DAL::Source.new(@md).get # dal file drawn here + @freq=Hash.new(0) + @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern + @rxp_to=Regexp.new("<~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+><#@dp:#@dp>$") + @rxp_lv1=Regexp.new('^1~') #line start markers removed, ('^1~') for exceptions \n\n4{{{ + @rxp_lv2=Regexp.new('^2~') + @rxp_lv3=Regexp.new('^3~') + @rxp_seg=Regexp.new('^4~(.+?)\s+') + @rxp_title=Regexp.new('^0~title\s*(.+?)\s*$') + @rxp_t1=Regexp.new('^T1') + @rxp_t2=Regexp.new('^T2') + @rxp_t3=Regexp.new('^T3') + @rxp_excluded1=Regexp.new(/(?:https?|ftp):\/\/\S+/mi) + @rxp_excluded0=Regexp.new(/^(?:to\d+|\d+| \s*| |EOF|thumb_\S+|snap_\S+|_+|-+|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|ii|iii|iv|vi|vii|viii|ix|xi|xii|xiii|xiv|xv|xvi|xvii|xviii|xix|xx|#@dp|[0-9a-f]{24,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)$/mi) + rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error + end + end + def songsheet + begin + File.mkpath(@path) unless FileTest.directory?(@path) + @file_index_all=File.open("#@path/#{@md.fn[:concordance]}",'w') + map_para + rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error + ensure + @file_index_all.close + end + end + protected + def location_scroll(wordlocation,show) + @wordlocation=wordlocation + %{#@wordlocation; } + end + def location_seg(wordlocation,show) + @wordlocation,@show=wordlocation,show + @sfx='.html' #used for hardlinks, previous setting @sfx='', web server 
takes care of suffix + @word_location_seg=wordlocation.gsub(/(.+?)\#(\d+)/,"#{@md.fnl[:pre]}\\1#{@md.fnl[:mid]}#@sfx#{@md.fnl[:post]}#\\2") unless wordlocation.nil? + case @wordlocation + when @rxp_t1 + %{[H]#@show, } + when @rxp_t2 + %{[H]#@show, } + when @rxp_t3 + %{[H]#@show, } + else %{#@show, } + end + end + def map_para + @seg,toy=nil,nil + @word_map={} + wordlist=Wordlists::Lists.new + lesser,greater,scanlist,special=wordlist.lesser,wordlist.greater,wordlist.scanlist,wordlist.special + #lesser,greater,scanlist=wordlist.lesser,wordlist.greater,wordlist.scanlist + @dal_array.each do |line| + if line !~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ # lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated + if line =~/^0~vocabulary\s+(.+)/ + vocab=$1 + unless vocab =~/none/ + load SiSU_lib + "/vocab_#{vocab}.rb" + lesser,greater,scanlist=wordlist.lesser,wordlist.greater,wordlist.scanlist + #special=wordlist.special ##KEEP did two loops + tell=SiSU_Screen::Ansi.new(@md.cmd,"\tloaded vocabulary: #{vocab}") + tell.puts_grey unless @md.cmd =~/q/ #check + end + end + if line =~@rxp_seg; @seg=line[@rxp_seg,1] + end + if line =~@rxp_to; toy=line[@rxp_to,1] + end + if toy =~/\d+/ and toy !~/^0$/ + for word in line.scan(scanlist) #%take in word or other match + #for word in line.scan(special) #%take in word or other match #KEEP was second loop + word=nil if word =~@rxp_excluded0 #watch + word=nil if word =~@rxp_excluded1 #watch + if word + #word.gsub!(/<\/?[i]>/,'') + word.gsub!(/<\/?\S+?>/,'') + word.strip! + word.gsub!(/[\.,;:"]$/,'') + word.gsub!(/["]/,'') + word.gsub!(/^\s*[\(]/,'') + word.gsub!(/[\(]\s*$/,'') + word.gsub!(/^(?:See|e\.?g\.?).+/,'') + word.gsub!(/^\s*[.,;:]\s*/,'') + word.strip! + word.gsub!(/^\d+(st|nd|rd|th)$/,'') + word.gsub!(/^(\d+\.?)+$/, '') + word = nil if word =~/^\s*$/ #watch + if word + word.capitalize! unless word =~/[A-Z][A-Z]/ or word =~/\w+\s\w+/ + #word.downcase! if word =~lesser + #word.capitalize! 
if word =~greater + @freq[word] +=1 + @word_map[word] ||= [] + if line !~@rxp_lv1 and line !~@rxp_lv2 and line !~@rxp_lv3 + @word_map[word] << location_seg("#@seg\##{toy}",toy) + else + @word_map[word] << case line + when @rxp_lv1; location_seg('T1',toy) + when @rxp_lv2; location_seg('T2',toy) + when @rxp_lv3; location_seg('T3',toy) + end + end + end + end + end + end + end + end + scr='Full Text scroll: doc#  ' + seg='' + @file_index_all << SiSU_Concordance::Source::Doc_title.new('toc',@md).create + for word in @freq.keys.sort! {|a,b| a.downcase<=>b.downcase} + keyword=SiSU_Concordance::Source::Word.new(word,@freq[word]).html + if keyword !~ @rxp_excluded0 + if @word_map[word][0] =~ /\d+/ + wm=[] + @file_index_all << %{#{keyword}#{seg}#{@word_map[word].uniq.compact}} + end + @file_index_all << '

' + end + # special cases endnotes and header levels 1 - 3 + end + credits=@vz.credits_splash + @file_index_all << "#{credits}\n" # footer + tell=SiSU_Screen::Ansi.new(@md.cmd,@md.fns,"#{@env.path.output_tell}/#{@md.fn[:concordance]}") + tell.flow if @md.cmd =~/[MV]/ + end + end + end +end +__END__ -- cgit v1.2.3