diff options
Diffstat (limited to 'lib/sisu/v3dv/harvest_topics.rb')
-rw-r--r-- | lib/sisu/v3dv/harvest_topics.rb | 650 |
1 files changed, 0 insertions, 650 deletions
# encoding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
   metadata harvest, extract topics and associated writings from document set
   (topics use topic_register header)

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2012, Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.jus.uio.no/sisu/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: metadata harvest of topics (topic_register header), written
    out as an HTML topic index or printed to the screen

=end
module SiSU_Harvest_Topics
  require 'fileutils'                                      # FileUtils.mkdir_p (OutputIndex#html_file_open)
  require_relative 'author_format'                         # author_format.rb
  include SiSU_Viz

  # Entry point of the topics harvest: reads the header of every source
  # document, builds the nested topic index and writes it out as HTML.
  class Songsheet
    # Shared across all Songsheet instances: the same hash is handed to Index
    # on every #songsheet call, so topics accumulate for the life of the
    # process. NOTE(review): presumably intentional (one harvest per run);
    # confirm before replacing with a per-call hash.
    @@the_idx_topics = {}

    # Header paragraphs whose content is harvested.
    HEADER_WANTED = /^@(?:title|creator|classify):(?:\s|$)/m
    # Any other "@name:" header paragraph (skipped, but scanning continues).
    HEADER_OTHER = /^@\S+?:(?:\s|$)/m
    # Blank paragraphs and "% " comment lines (skipped, scanning continues).
    HEADER_FILLER = /^(?:\s*\n|%+ )/

    # opt:: option object; this class reads +files+, +f_pths+ from it
    # env:: environment object, passed through to Harvest and Index
    def initialize(opt, env)
      @opt, @env = opt, env
      @file_list = opt.files
    end

    # Harvests every file listed in +@opt.f_pths+ (hashes with the keys :f
    # file name, :pth directory and :lng_is language), builds the topic index
    # and writes the HTML topic pages.
    def songsheet
      puts 'topics:'
      idx_array = {}
      @opt.f_pths.each do |y|
        name = y[:f]
        filename = y[:pth] + '/' + y[:f]
        lang = y[:lng_is]
        paragraphs = header_paragraphs(filename)
        next if paragraphs.empty?
        idx_array[lang] ||= []
        idx_array = SiSU_Harvest_Topics::Harvest.new(@opt, @env, paragraphs, filename, name, idx_array, lang).extract_harvest
      end
      the_idx = SiSU_Harvest_Topics::Index.new(@opt, @env, idx_array, @@the_idx_topics).construct_book_topic_index
      SiSU_Harvest_Topics::OutputIndex.new(@opt, the_idx).html_print.html_songsheet
    end

    private

    # Returns the @title/@creator/@classify paragraphs found in the header of
    # +filename+. The file is read paragraph by paragraph (blank-line
    # separated); reading stops at the first paragraph that is neither a
    # header, nor blank, nor a comment, i.e. where the document body starts.
    def header_paragraphs(filename)
      wanted = []
      File.open(filename, 'r') do |file|
        file.each_line("\n\n") do |para|
          if para =~ HEADER_WANTED
            wanted << para
          elsif para =~ HEADER_OTHER || para =~ HEADER_FILLER
            next
          else
            break
          end
        end
      end
      wanted
    end
  end

  # Extracts title, author and topic register from one document's header
  # paragraphs and appends one index entry per topic to the shared list.
  class Harvest
    # Patterns applied to each header paragraph; capture 1 is the value.
    RGX = {
      author:   /^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m,
      title:    /^@title:[ ]+(.+)/,
      subtitle: /^@title:.+?:subtitle:[ ]+(.+?)\n/m,
      idx:      /^@classify:.+?:topic_register:[ ]+(.+?)\n/m
    }.freeze

    # data::      array of header paragraphs (strings)
    # filename::  path of the source document
    # name::      file name of the source document (with .sst/.ssm suffix)
    # idx_array:: hash of language => array of index entries, extended here
    # lang::      language key under which entries are stored
    def initialize(opt, env, data, filename, name, idx_array, lang)
      @opt, @env, @data, @filename, @name, @idx_array, @lang = opt, env, data, filename, name, idx_array, lang
    end

    # Adds one entry per ';'-separated topic of the document's topic register
    # to +idx_array[lang]+ and returns +idx_array+. A document lacking title,
    # author or topic register adds nothing; the omission is reported when the
    # command string contains V or M.
    def extract_harvest
      idx_array, lang = @idx_array, @lang
      @idx_list = @title = @subtitle = @fulltitle = @author_format = nil
      @data.each do |para|
        @idx_list      = para[RGX[:idx], 1]      || @idx_list
        @title         = para[RGX[:title], 1]    || @title
        @subtitle      = para[RGX[:subtitle], 1] || @subtitle
        @author_format = para[RGX[:author], 1]   || @author_format
        # Stop once everything has been found. (This test used to name
        # variables that were never assigned, so it could never succeed.)
        break if @title && @subtitle && @author_format && @idx_list
      end
      # A subtitle without a title leaves the full title unset instead of
      # raising on nil + String.
      @fulltitle = (@title && @subtitle) ? (@title + ' - ' + @subtitle) : @title
      idx_array[lang] ||= []
      if @title && @author_format && @idx_list
        creator = SiSU_FormatAuthor::Author.new(@author_format.strip).author_details
        # File stem: drop the suffix and an optional "~xx" language marker.
        file = @name.sub(/(?:~[a-z]{2,3})?\.ss[mt]$/, '')
        page = if @env.output_dir_structure.by_language_code?
          "#{lang}/sisu_manifest.html"
        else
          "sisu_manifest.#{lang}.html"
        end
        @idx_list.split(';').map(&:strip).reject(&:empty?).each do |topic|
          idx_array[lang] << { filename: @filename, file: file, rough_idx: topic, title: @fulltitle, author: creator, page: page, lang: lang }
        end
      elsif @opt.cmd.inspect =~ /[VM]/
        p "missing required field in #{@filename} - [title]: <<#{@title}>>; [author]: <<#{@author_format}>>; [idx]: <<#{@idx_list}>>"
      end
      idx_array[lang] = idx_array[lang].flatten
      idx_array
    end
  end

  # Turns the flat list of harvested entries into a nested topic tree:
  #   the_idx[lang][topic][:md]           => documents filed under topic
  #   the_idx[lang][topic][subtopic][:md] => documents filed under subtopic
  # and so on, one nesting level per ':'-separated part of a topic register.
  class Index
    # idx_array:: hash of language => array of entries built by Harvest
    # the_idx::   hash that receives the tree (filled in place and returned)
    def initialize(opt, env, idx_array, the_idx)
      @opt, @env, @idx_array, @the_idx = opt, env, idx_array, the_idx
    end

    # Upper-cases the first character of +txt+, leaving the rest untouched.
    def capital(txt)
      return txt if txt.empty?
      txt[0].chr.capitalize + txt[1, txt.length]
    end

    # Appends the document described by +idx+ to +hash+ (a topic's :md list).
    # The author field becomes a run of links into the authors page, each
    # followed by ", ".
    def contents(lang, hash, idx)
      names = ''
      idx[:author][:last_first_format_a].each do |n|
        # Anchor: the part before the first comma, blanks turned into "_".
        s = n.sub(/(.+?)(?:,.+|$)/, '\1').gsub(/\s+/, '_')
        names += if @env.output_dir_structure.by_language_code?
          %{<a href="authors.html##{s}">#{n}</a>, }
        else
          %{<a href="authors.#{lang}.html##{s}">#{n}</a>, }
        end
      end
      hash << { filename: idx[:filename], file: idx[:file], author: names, title: idx[:title], page: idx[:page] }
    end

    # Files every harvested entry into the topic tree and returns the tree.
    # A topic register reads "level0:level1:..."; a level may list
    # alternatives separated by '|'.
    def construct_book_topic_index
      @idx_array.each_pair do |lang, entries|
        @the_idx[lang] ||= {}
        entries.each do |idx|
          unless idx[:rough_idx]
            puts "no topic register in: << #{idx[:filename]} >>"
            next
          end
          levels = idx[:rough_idx].scan(/[^:]+/).map { |lev| lev.scan(/[^|]+/) }
          file_under(lang, levels, idx)
        end
      end
      @the_idx
    end

    private

    # Walks +levels+ down the tree for +lang+, creating missing nodes, and
    # files +idx+ under every term of the last level. Any depth is accepted;
    # the former five-level case statement silently dropped documents with a
    # deeper register.
    def file_under(lang, levels, idx)
      node = @the_idx[lang]
      levels.each_with_index do |terms, depth|
        leaf = (depth == levels.length - 1)
        parent = nil
        terms.each do |term|
          term = capital(term)
          node[term] ||= { md: [] }
          parent = term if terms.length == 1
          contents(lang, node[term][:md], idx) if leaf
        end
        break if leaf
        unless parent
          # Alternatives above the last level give no single node to descend
          # into; this used to raise NoMethodError on nil.
          puts "ambiguous topic register (alternatives above the last level) in: << #{idx[:filename]} >>"
          break
        end
        node = node[parent]
      end
    end
  end

  # Writes the topic tree as HTML pages (#html_print, #html_songsheet) or
  # prints it to the screen (#screen_print / #screen_print_unsorted, #cycle).
  class OutputIndex
    require_relative 'i18n'                                # i18n.rb

    # Letter headings; the leading '9' sorts before 'A' and serves as the
    # starting "current letter" so that 'A' gets a heading as well.
    ALPHABET = %w[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].freeze
    # Keys of a document entry, as opposed to sub-topic names, within a node.
    METADATA_KEYS = [:title, :author, :filename, :file, :rough_idx, :page].freeze
    # One level of indentation for screen output.
    INDENT = ' ' * 4

    # opt::     option object; this class reads +cmd+ and +base_stub+ from it
    # the_idx:: topic tree as returned by Index#construct_book_topic_index
    def initialize(opt, the_idx)
      @opt, @the_idx = opt, the_idx
      @env = SiSU_Env::InfoEnv.new
      @rc = SiSU_Env::GetInit.instance.sisu_yaml.rc
      @vz = SiSU_Env::GetInit.instance.skin
      @output = {}
      @screen_sorted = true
      reset_alphabet
    end

    # ---- HTML output ---------------------------------------------------

    # Selects HTML output; returns self so that calls chain:
    # OutputIndex.new(opt, idx).html_print.html_songsheet
    def html_print
      self
    end

    # Writes the complete topic page(s): head, A-Z bar, body and tail. The
    # output files are closed even when writing fails part-way.
    def html_songsheet
      html_file_open
      begin
        html_head
        html_alph
        html_body
        html_tail
      ensure
        html_file_close
      end
    end

    # Opens one topics file per language and, when the command string
    # contains "-M", a maintenance copy "topics.html" in the working
    # directory. NOTE(review): that maintenance path is the same for every
    # language, so with several languages the copies overwrite each other.
    def html_file_open
      @the_idx.keys.each do |lng|
        @output[lng] ||= {}
        if by_language_code?
          harvest_pth = "#{@env.path.webserv}/#{@opt.base_stub}/#{lng}/manifest"
          file = "#{harvest_pth}/topics.html"
        else # by filetype or by filename
          harvest_pth = "#{@env.path.webserv}/#{@opt.base_stub}/manifest"
          file = "#{harvest_pth}/topics.#{lng}.html"
        end
        FileUtils::mkdir_p(harvest_pth) unless FileTest.directory?(harvest_pth)
        puts "file://#{file}"
        @output[lng][:html] = File.new(file, 'w')
        if @opt.cmd.inspect =~ /-M/
          @output[lng][:html_mnt] = File.new("#{@env.path.pwd}/topics.html", 'w')
        end
      end
    end

    # Closes every file opened by #html_file_open; safe to call when only
    # some (or none) of them were opened.
    def html_file_close
      @the_idx.keys.each do |lng|
        (@output[lng] || {}).each_value { |f| f.close unless f.closed? }
      end
    end

    # Returns the HTML head and page banner for language +lng+. A +type+
    # matching /maintenance/ selects the stylesheet path used by the
    # maintenance copy.
    def html_head_adjust(lng, type = '')
      css_path, authors = '', ''
      if by_language_code?
        css_path = (type !~ /maintenance/) ? '../../_sisu/css/harvest.css' : 'harvest.css'
        authors = 'authors.html'
      elsif @env.output_dir_structure.by_filetype? || @env.output_dir_structure.by_filename?
        css_path = (type !~ /maintenance/) ? '../_sisu/css/harvest.css' : 'harvest.css'
        authors = "authors.#{lng}.html"
      end
      ln = SiSU_i18n::Languages.new.language.list
      harvest_languages = ''
      @the_idx.keys.each do |code|
        file = if by_language_code?
          "../../#{code}/manifest/topics.html"
        else # by filetype or by filename
          "./topics.#{code}.html"
        end
        # Fall back to the bare code when the language list has no entry.
        label = (ln[code] && ln[code][:t]) || code
        harvest_languages += %{<a href="#{file}">#{label}</a> }
      end
      sv = SiSU_Env::InfoVersion.instance.get_version
      <<WOK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>SiSU Metadata Harvest - Topics</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="dc.title" content= "SiSU metadata harvest, Topics - SiSU information Structuring Universe, Structured information Serialised Units" />
<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
<link rel="stylesheet" href="#{css_path}" type="text/css" />
<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
</head>
<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
<a name="top" id="top"></a>
<a name="up" id="up"></a>
<a name="start" id="start"></a>
<h1>SiSU Metadata Harvest - Topics</h1>
<p>[<a href="../index.html"> HOME </a>] also see <a href="#{authors}">SiSU Metadata Harvest - Authors</a></p>
<p>#{@env.widget_static.search_form}</p>
<hr />
<p class="tiny">#{harvest_languages}</p>
<hr />
WOK
    end

    # Writes the page head to every open output file.
    def html_head
      @the_idx.keys.each do |lng|
        # Guarded by the file's existence: head and A-Z bar used to test the
        # command string for "M" while the file is only opened for "-M".
        do_html_maintenance(lng, html_head_adjust(lng, 'maintenance')) if maintenance_file(lng)
        do_html(lng, html_head_adjust(lng))
      end
    end

    # Writes the A-Z navigation bar to every open output file.
    def html_alph
      letters = ALPHABET.reject { |x| x =~ /[0-9]/ }
      bar = '<p>' + letters.map { |x| %{<a href="##{x}">#{x}</a>, } }.join
      @the_idx.keys.each do |lng|
        do_html_maintenance(lng, bar)
        do_html(lng, bar)
      end
    end

    # Writes the closing anchors, credits and end tags to every output file.
    def html_tail
      a = <<WOK
<hr />
<a name="bottom" id="bottom"></a>
<a name="down" id="down"></a>
<a name="end" id="end"></a>
<a name="finish" id="finish"></a>
<a name="stop" id="stop"></a>
<a name="credits"></a>
#{@vz.credits_sisu}
</body>
</html>
WOK
      @the_idx.keys.each do |lng|
        do_html_maintenance(lng, a)
        do_html(lng, a)
      end
    end

    # Writes each language's topic tree, top-level topics in sorted order.
    def html_body
      @the_idx.each_pair do |lng, topics|
        # Letter headings restart for each language; they used to be
        # consumed once for all languages, leaving later pages without them.
        reset_alphabet
        topics.sort.each { |a| do_case(lng, -1, a) }
      end
    end

    # Appends +html+ to the published page for +lng+.
    def do_html(lng, html)
      @output[lng][:html] << html
    end

    # Appends +html+ to the maintenance page for +lng+, if one is open.
    def do_html_maintenance(lng, html)
      mnt = maintenance_file(lng)
      mnt << html if mnt
    end

    # Writes +string+ as a paragraph of class +attrib+ to both pages.
    def do_string(lng, attrib, string)
      html = %{<p class="#{attrib}">#{string}</p>}
      do_html(lng, html)
      do_html_maintenance(lng, html)
    end

    # Writes +string+ as a paragraph of class +attrib+ to the published page.
    def do_string_default(lng, attrib, string)
      do_html(lng, %{<p class="#{attrib}">#{string}</p>})
    end

    # Writes +string+ as a paragraph of class +attrib+ to the maintenance page.
    def do_string_maintenance(lng, attrib, string)
      do_html_maintenance(lng, %{<p class="#{attrib}">#{string}</p>})
    end

    # Writes a top-level topic as a named anchor, preceded by the heading of
    # every letter passed since the previous top-level topic.
    def do_string_name(lng, attrib, string)
      initial = string[/^(\S)/, 1] # nil when there is no leading non-space character
      while initial && @letter < initial && !@alph.empty?
        @letter = @alph.shift
        heading = %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
        do_html_maintenance(lng, heading)
        do_html(lng, heading)
      end
      name = string.strip.gsub(/\s+/, '_')
      html = %{<p class="#{attrib}"><a name="#{name}">#{string}</a></p>}
      do_html(lng, html)
      do_html_maintenance(lng, html)
    end

    # Writes every element of +array+ one level below +lv+.
    def do_array(lng, lv, array)
      lv += 1
      array.each { |b| do_case(lng, lv, b) }
    end

    # Writes one document entry (linked title and authors) to the published
    # page; the link target depends on the output directory structure.
    def do_hash_md(lng, attrib, hash)
      if by_language_code?
        manifest_at = hash[:file] + '.html'
      elsif @env.output_dir_structure.by_filetype?
        manifest_at = hash[:file] + '.' + lng + '.html'
      elsif @env.output_dir_structure.by_filename?
        manifest_at = "../#{hash[:file]}/#{hash[:page]}"
      end
      html = %{<a href="#{manifest_at}">#{hash[:title]}</a> - #{hash[:author]}}
      do_string_default(lng, attrib, html)
    end

    # Writes one document entry, with a link to its source file, to the
    # maintenance page; does nothing when no maintenance page is open.
    def do_hash_md_maintenance(lng, attrib, hash)
      return unless maintenance_file(lng) # should not be run for presentation output
      html = %{[<a href="#{hash[:file]}.sst">src</a>] <a href="file://#{@env.path.output}/#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
      do_string_maintenance(lng, attrib, html)
    end

    # Writes a tree node: documents under :md and a document entry's own
    # line as they are met, then the sub-topics in sorted order.
    def do_hash(lng, lv, hash)
      lv += 1
      topics = []
      hash.each_key do |m|
        if m == :md
          do_case(lng, lv, hash[m])
        elsif m == :title
          do_hash_md(lng, 'work', hash)
          do_hash_md_maintenance(lng, 'work', hash)
        elsif !METADATA_KEYS.include?(m)
          topics << m
        end
      end
      topics.sort.each do |m|
        attrib = "lev#{lv}"
        lv == 0 ? do_string_name(lng, attrib, m) : do_string(lng, attrib, m)
        do_case(lng, lv, hash[m])
      end
    end

    # Dispatches on the kind of tree element: topic name (String), list
    # (Array) or node / document entry (Hash). Anything else is ignored.
    def do_case(lng, lv, a)
      case a
      when String
        attrib = "lev#{lv}"
        lv == 0 ? do_string_name(lng, attrib, a) : do_string(lng, attrib, a)
      when Array
        do_array(lng, lv, a)
      when Hash
        do_hash(lng, lv, a)
      end
    end

    # ---- screen output -------------------------------------------------

    # Selects screen output with the sub-topics of each node sorted; returns
    # self so that calls chain: screen_print.cycle
    def screen_print
      @screen_sorted = true
      self
    end

    # Selects screen output with sub-topics in stored order; returns self.
    def screen_print_unsorted
      @screen_sorted = false
      self
    end

    # Prints the topic tree of every language to standard output, indented
    # four spaces per level.
    def cycle
      @the_idx.keys.each do |lng|
        @the_idx[lng].each { |a| screen_case(lng, -1, a) }
      end
    end

    private

    # True when output is organised in per-language directories.
    def by_language_code?
      @env.output_dir_structure.by_language_code?
    end

    # The open maintenance file for +lng+, or nil when there is none.
    def maintenance_file(lng)
      (@output[lng] || {})[:html_mnt]
    end

    # Restarts the letter headings: +@letter+ is the last heading reached,
    # +@alph+ the headings still to be written.
    def reset_alphabet
      @alph = ALPHABET.dup
      @letter = @alph.shift
    end

    # Prints +string+ indented by +lv+ levels. (The sorted variant used to
    # declare two parameters while being called with three.)
    def screen_string(_lng, lv, string)
      puts INDENT * lv + string
    end

    # Prints every element of +array+ one level below +lv+.
    def screen_array(lng, lv, array)
      lv += 1
      array.each { |b| screen_case(lng, lv, b) }
    end

    # Prints one document entry as "title - authors".
    def screen_hash_md(lng, lv, hash)
      screen_string(lng, lv, "#{hash[:title]} - #{hash[:author]}")
    end

    # Prints a sub-topic name followed by its node.
    def screen_topic(lng, lv, name, node)
      screen_string(lng, lv, name)
      screen_case(lng, lv, node)
    end

    # Prints a tree node; sub-topics follow in sorted order when
    # #screen_print was selected, otherwise as they are met.
    def screen_hash(lng, lv, hash)
      lv += 1
      topics = []
      hash.each_key do |m|
        if m == :md
          screen_case(lng, lv, hash[m])
        elsif m == :title
          screen_hash_md(lng, lv, hash)
        elsif !METADATA_KEYS.include?(m)
          if @screen_sorted
            topics << m
          else
            screen_topic(lng, lv, m, hash[m])
          end
        end
      end
      topics.sort.each { |m| screen_topic(lng, lv, m, hash[m]) }
    end

    # Screen counterpart of #do_case.
    def screen_case(lng, lv, a)
      case a
      when String
        screen_string(lng, lv, a)
      when Array
        screen_array(lng, lv, a)
      when Hash
        screen_hash(lng, lv, a)
      end
    end
  end
end
__END__
terms
|_ t{tl1}
|_ {fa}[fa]{filenames and other details}
  |  |_ {tl2}
|_ {fa}[fa]{filenames and other details}
  |  |  |_{tl3}
|_ {fa}[fa]{filenames and other details}
  |  |  |  |_{tl4}
               {fa}[fa]{filenames and other details}
  |  |  |  |
  |  |  |  |_{tl4a}
               {fa}[fa]{filenames and other details}
  |  |  |  |
  |  |  |  |_{tl4b}
               {fa}[fa]{filenames and other details}
  |  |  |  |
  |  |  |  |_ ...
  |  |  |
  |  |  |_{tl3a}
            {fa}[fa]{filenames and other details}
  |  |
  |  |_{tl2a}
         {fa}[fa]{filenames and other details}
  |
  |_ t{tl1a}
|_ {fa}[fa]{filenames and other details}
  |_ ...