aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/v3dv/harvest_topics.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v3dv/harvest_topics.rb')
-rw-r--r--lib/sisu/v3dv/harvest_topics.rb650
1 files changed, 0 insertions, 650 deletions
diff --git a/lib/sisu/v3dv/harvest_topics.rb b/lib/sisu/v3dv/harvest_topics.rb
deleted file mode 100644
index dd7ec8c2..00000000
--- a/lib/sisu/v3dv/harvest_topics.rb
+++ /dev/null
@@ -1,650 +0,0 @@
-# encoding: utf-8
-=begin
-
- * Name: SiSU
-
- * Description: a framework for document structuring, publishing and search
- metadata harvest, extract topics and associated writings from document set
- (topics use topic_register header)
-
- * Author: Ralph Amissah
-
- * Copyright: (C) 1997 - 2012, Ralph Amissah, All Rights Reserved.
-
- * License: GPL 3 or later:
-
- SiSU, a framework for document structuring, publishing and search
-
- Copyright (C) Ralph Amissah
-
- This program is free software: you can redistribute it and/or modify it
- under the terms of the GNU General Public License as published by the Free
- Software Foundation, either version 3 of the License, or (at your option)
- any later version.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program. If not, see <http://www.gnu.org/licenses/>.
-
- If you have Internet connection, the latest version of the GPL should be
- available at these locations:
- <http://www.fsf.org/licensing/licenses/gpl.html>
- <http://www.gnu.org/licenses/gpl.html>
-
- <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
- <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
- <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>
-
- * SiSU uses:
- * Standard SiSU markup syntax,
- * Standard SiSU meta-markup syntax, and the
- * Standard SiSU object citation numbering and system
-
- * Hompages:
- <http://www.jus.uio.no/sisu>
- <http://www.sisudoc.org>
-
- * Download:
- <http://www.jus.uio.no/sisu/SiSU/download.html>
-
- * Ralph Amissah
- <ralph@amissah.com>
- <ralph.amissah@gmail.com>
-
- ** Description: simple xml representation (sax style)
-
-=end
-module SiSU_Harvest_Topics
- require_relative 'author_format' # author_format.rb
- include SiSU_Viz
- class Songsheet
- @@the_idx_topics={}
- def initialize(opt,env)
- @opt,@env=opt,env
- @file_list=opt.files
- end
- def songsheet
- puts 'topics:'
- idx_array={}
- @opt.f_pths.each do |y|
- lang_hash_file_array={}
- name=y[:f]
- filename=y[:pth] + '/' + y[:f]
- File.open(filename,'r') do |file|
- file.each_line("\n\n") do |line|
- if line =~/^@(?:title|creator|classify):(?:\s|$)/m
- lang_hash_file_array[y[:lng_is]] ||= []
- lang_hash_file_array[y[:lng_is]] << line
- elsif line =~/^@\S+?:(?:\s|$)/m \
- or line =~/^(?:\s*\n|%+ )/
- else break
- end
- end
- end
- lang_hash_file_array.each_pair do |lang,a|
- idx_array[lang] ||= []
- idx_array=SiSU_Harvest_Topics::Harvest.new(@opt,@env,a,filename,name,idx_array,lang).extract_harvest
- end
- end
- the_idx=SiSU_Harvest_Topics::Index.new(@opt,@env,idx_array,@@the_idx_topics).construct_book_topic_index
- SiSU_Harvest_Topics::OutputIndex.new(@opt,the_idx).html_print.html_songsheet
- end
- end
- class Harvest
- def initialize(opt,env,data,filename,name,idx_array,lang)
- @opt,@env,@data,@filename,@name,@idx_array,@lang=opt,env,data,filename,name,idx_array,lang
- end
- def extract_harvest
- data,filename,name,idx_array,lang=@data,@filename,@name,@idx_array,@lang
- @idx_lst,@title,@subtitle,@fulltitle,@author,@author_format=nil,nil,nil,nil,nil,nil
- rgx={}
- rgx[:author]=/^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m
- rgx[:title]=/^@title:[ ]+(.+)/
- rgx[:subtitle]=/^@title:.+?:subtitle:[ ]+(.+?)\n/m
- rgx[:idx]=/^@classify:.+?:topic_register:[ ]+(.+?)\n/m
- data.each do |para|
- if para=~ rgx[:idx]
- @idx_list=rgx[:idx].match(para)[1]
- end
- if para=~ rgx[:title]
- @title=rgx[:title].match(para)[1]
- end
- if para=~ rgx[:subtitle]
- @subtitle=rgx[:subtitle].match(para)[1]
- end
- if para=~ rgx[:author]
- @author_format=rgx[:author].match(para)[1]
- end
- break if @title and @subtitle and @author and @idx_lst
- end
- @fulltitle=@subtitle ? (@title + ' - ' + @subtitle) : @title
- if @title \
- and @author_format \
- and @idx_list
- creator=SiSU_FormatAuthor::Author.new(@author_format.strip).author_details
- @authors,@authorship=creator[:authors],creator[:authorship]
- file=if name=~/~[a-z]{2,3}\.ss[mt]$/
- name.sub(/~[a-z]{2,3}\.ss[mt]$/,'')
- else
- name.sub(/\.ss[mt]$/,'')
- end
- page=if @env.output_dir_structure.by_language_code?
- #fix
- end
- page=if @env.output_dir_structure.by_language_code?
- "#{lang}/sisu_manifest.html"
- else
- "sisu_manifest.#{lang}.html"
- end
- idx_array[lang] <<=if @idx_list =~/;/
- g=@idx_list.scan(/[^;]+/)
- idxl=[]
- g.each do |i|
- i=i.strip
- idxl << { filename: filename, file: file, rough_idx: i, title: @fulltitle, author: creator, page: page, lang: lang }
- end
- idxl
- else { filename: filename, file: file, rough_idx: @idx_list, title: @fulltitle, author: creator, page: page, lang: lang }
- end
- else
- p "missing required field in #{@filename} - [title]: <<#{@title}>>; [author]: <<#{@author_format}>>; [idx]: <<#{@idx_list}>>" if @opt.cmd.inspect =~/[VM]/
- end
- idx_array[lang]=idx_array[lang].flatten
- idx_array
- end
- end
- class Index
- def initialize(opt,env,idx_array,the_idx)
- @opt,@env,@idx_array,@the_idx=opt,env,idx_array,the_idx
- @@the_idx_topics=@the_idx
- end
- def capital(txt)
- txt[0].chr.capitalize + txt[1,txt.length]
- end
- def contents(lang,hash,idx)
- names=''
- idx[:author][:last_first_format_a].each do |n|
- s=n.sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')
- names=if @env.output_dir_structure.by_language_code?
- names += %{<a href="authors.html##{s}">#{n}</a>, }
- else
- names += %{<a href="authors.#{lang}.html##{s}">#{n}</a>, }
- end
- end
- hash << { filename: idx[:filename], file: idx[:file], author: names, title: idx[:title], page: idx[:page] }
- end
- def construct_book_topic_index
- idx_array=@idx_array
- idx_array.each_pair do |lang,idx_array|
- @@the_idx_topics[lang] ||= {}
- idx_array.each do |idx|
- @lv0,@lv1,@lv2,@lv3,@lv4={},{},{},{},{}
- if idx[:rough_idx]
- idx_lst=idx[:rough_idx].scan(/[^:]+/)
- else
- puts "no topic register in: << #{idx[:filename]} >>"
- next
- end
- idx_lst_alt=[]
- idx_lst.each {|lev| idx_lst_alt << lev.scan(/[^|]+/)}
- depth = idx_lst_alt.length - 1
- range = 0..depth
- range.each do |t|
- if idx_lst_alt[t]
- case t
- when 0
- lev0=idx_lst_alt[t]
- lev0.each do |lv0|
- lv0=capital(lv0)
- if @@the_idx_topics[lang][lv0].class==NilClass
- @@the_idx_topics[lang][lv0]={ md: [] }
- end
- @lv0=lv0 if lev0.length==1
- j=@@the_idx_topics[lang][lv0][:md]
- contents(lang,j,idx) if idx_lst_alt.length - 1 == t
- end
- when 1
- lev1=idx_lst_alt[t]
- lev1.each do |lv1|
- lv1=capital(lv1)
- if @@the_idx_topics[lang][@lv0][lv1].class==NilClass
- @@the_idx_topics[lang][@lv0][lv1]={ md: [] }
- end
- @lv1=lv1 if lev1.length==1
- j=@@the_idx_topics[lang][@lv0][lv1][:md]
- contents(lang,j,idx) if idx_lst_alt.length - 1 == t
- end
- when 2
- lev2=idx_lst_alt[t]
- lev2.each do |lv2|
- lv2=capital(lv2)
- if @@the_idx_topics[lang][@lv0][@lv1][lv2].class==NilClass
- @@the_idx_topics[lang][@lv0][@lv1][lv2]={ md: [] }
- end
- @lv2=lv2 if lev2.length==1
- j=@@the_idx_topics[lang][@lv0][@lv1][lv2][:md]
- contents(lang,j,idx) if idx_lst_alt.length - 1 == t
- end
- when 3
- lev3=idx_lst_alt[t]
- lev3.each do |lv3|
- lv3=capital(lv3)
- if @@the_idx_topics[lang][@lv0][@lv1][@lv2][lv3].class==NilClass
- @@the_idx_topics[lang][@lv0][@lv1][@lv2][lv3]={ md: [] }
- end
- @lv3=lv3 if lev3.length==1
- j=@@the_idx_topics[lang][@lv0][@lv1][@lv2][lv3][:md]
- contents(lang,j,idx) if idx_lst_alt.length - 1 == t
- end
- when 4
- lev4=idx_lst_alt[t]
- lev4.each do |lv4|
- lv4=capital(lv4)
- if @@the_idx_topics[lang][@lv0][@lv1][@lv2][@lv3][lv4].class==NilClass
- @@the_idx_topics[lang][@lv0][@lv1][@lv2][@lv3][lv4]={ md: [] }
- end
- @lv4=lv4 if lev4.length==1
- j=@@the_idx_topics[lang][@lv0][@lv1][@lv2][@lv3][lv4][:md]
- contents(lang,j,idx) if idx_lst_alt.length - 1 == t
- end
- end
- end
- end
- end
- end
- @the_idx
- end
- end
- class OutputIndex
- require_relative 'i18n' # i18n.rb
- def initialize(opt,the_idx)
- @opt,@the_idx=opt,the_idx
- @env=SiSU_Env::InfoEnv.new
- @rc=SiSU_Env::GetInit.instance.sisu_yaml.rc
- @alph=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
- @letter=@alph.shift
- @vz=SiSU_Env::GetInit.instance.skin
- end
- def html_file_open
- @the_idx.keys.each do |lng|
- @output ||={}
- @output[lng] ||={}
- harvest_pth,file='',''
- if @env.output_dir_structure.by_language_code?
- harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/#{lng}/manifest"
- file="#{harvest_pth}/topics.html"
- else @env.output_dir_structure.by_filetype?
- harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/manifest"
- file="#{harvest_pth}/topics.#{lng}.html"
- end
- FileUtils::mkdir_p(harvest_pth) unless FileTest.directory?(harvest_pth)
- puts "file://#{file}"
- @output[lng][:html]=File.new(file,'w')
- if @opt.cmd.inspect =~/-M/
- @output[lng][:html_mnt]=File.new("#{@env.path.pwd}/topics.html",'w')
- end
- end
- end
- def html_file_close
- @the_idx.keys.each do |lng|
- @output[lng][:html].close
- @output[lng][:html_mnt].close if @output[lng][:html_mnt].class==File
- end
- end
- def html_print
- def html_songsheet
- html_file_open
- html_head
- html_alph
- html_body
- html_tail
- html_file_close
- end
- def html_head_adjust(lng,type='')
- css_path,authors='',''
- if @env.output_dir_structure.by_language_code?
- css_path=(type !~/maintenance/) \
- ? '../../_sisu/css/harvest.css'
- : 'harvest.css'
- authors='authors.html'
- elsif @env.output_dir_structure.by_filetype?
- css_path=(type !~/maintenance/) \
- ? '../_sisu/css/harvest.css'
- : 'harvest.css'
- authors="authors.#{lng}.html"
- elsif @env.output_dir_structure.by_filename?
- css_path=(type !~/maintenance/) \
- ? '../_sisu/css/harvest.css'
- : 'harvest.css'
- authors="authors.#{lng}.html"
- end
- ln=SiSU_i18n::Languages.new.language.list
- harvest_languages=''
- @the_idx.keys.each do |lng|
- if @env.output_dir_structure.by_language_code?
- harvest_pth="../../#{lng}/manifest"
- file="#{harvest_pth}/topics.html"
- else @env.output_dir_structure.by_filetype?
- harvest_pth='.'
- file="#{harvest_pth}/topics.#{lng}.html"
- end
- l=ln[lng][:t]
- harvest_languages += %{<a href="#{file}">#{l}</a>&nbsp;&nbsp;&nbsp;}
- end
- sv=SiSU_Env::InfoVersion.instance.get_version
- <<WOK
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<title>SiSU Metadata Harvest - Topics</title>
-<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
-<meta name="dc.title" content= "SiSU metadata harvest, Topics - SiSU information Structuring Universe, Structured information Serialised Units" />
-<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
-<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
-<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
-<link rel="stylesheet" href="#{css_path}" type="text/css" />
-<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
-</head>
-<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
-<a name="top" id="top"></a>
-<a name="up" id="up"></a>
-<a name="start" id="start"></a>
-<h1>SiSU Metadata Harvest - Topics</h1>
-<p>[<a href="../index.html">&nbsp;HOME&nbsp;</a>] also see <a href="#{authors}">SiSU Metadata Harvest - Authors</a></p>
-<p>#{@env.widget_static.search_form}</p>
-<hr />
-<p class="tiny">#{harvest_languages}</p>
-<hr />
-WOK
- end
- def html_head
- @the_idx.keys.each do |lng|
- @output[lng][:html_mnt] << html_head_adjust(lng,'maintenance') if @opt.cmd.inspect =~/M/
- @output[lng][:html] << html_head_adjust(lng)
- end
- end
- def html_alph
- a=[]
- a << '<p>'
- @alph.each do |x|
- a << ((x =~/[0-9]/) \
- ? ''
- : %{<a href="##{x}">#{x}</a>,&nbsp;})
- end
- a=a.join
- @the_idx.keys.each do |lng|
- @output[lng][:html_mnt] << a if @opt.cmd.inspect =~/M/
- @output[lng][:html] << a
- end
- end
- def html_tail
- a =<<WOK
-<hr />
-<a name="bottom" id="bottom"></a>
-<a name="down" id="down"></a>
-<a name="end" id="end"></a>
-<a name="finish" id="finish"></a>
-<a name="stop" id="stop"></a>
-<a name="credits"></a>
-#{@vz.credits_sisu}
-</body>
-</html>
-WOK
- @the_idx.keys.each do |lng|
- @output[lng][:html_mnt] << a if @output[lng][:html_mnt].class==File
- @output[lng][:html] << a
- end
- end
- def do_html(lng,html)
- @output[lng][:html] << html
- end
- def do_html_maintenance(lng,html)
- @output[lng][:html_mnt] << html if @output[lng][:html_mnt].class==File
- end
- def do_string(lng,attrib,string)
- html=%{<p class="#{attrib}">#{string}</p>}
- do_html(lng,html)
- do_html_maintenance(lng,html) if @output[lng][:html_mnt].class==File
- end
- def do_string_default(lng,attrib,string)
- html=%{<p class="#{attrib}">#{string}</p>}
- do_html(lng,html)
- end
- def do_string_maintenance(lng,attrib,string)
- html=%{<p class="#{attrib}">#{string}</p>}
- do_html_maintenance(lng,html) if @output[lng][:html_mnt].class==File
- end
- def do_string_name(lng,attrib,string)
- f=/^(\S)/.match(string)[1]
- if @letter < f
- while @letter < f
- if @alph.length > 0
- @letter=@alph.shift
- if @output[lng][:html_mnt].class==File
- @output[lng][:html_mnt] << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
- end
- @output[lng][:html] << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
- else break
- end
- end
- end
- name=string.strip.gsub(/\s+/,'_')
- html=%{<p class="#{attrib}"><a name="#{name}">#{string}</a></p>}
- do_html(lng,html)
- do_html_maintenance(lng,html) if @output[lng][:html_mnt].class==File
- end
- def do_array(lng,lv,array)
- lv+=1
- array.each do |b|
- do_case(lng,lv,b)
- end
- end
- def do_hash_md(lng,attrib,hash)
- if @env.output_dir_structure.by_language_code?
- manifest_at=hash[:file] + '.html'
- elsif @env.output_dir_structure.by_filetype?
- manifest_at=hash[:file] + '.' + lng + '.html'
- elsif @env.output_dir_structure.by_filename?
- manifest_at="../#{hash[:file]}/#{hash[:page]}"
- end
- html=%{<a href="#{manifest_at}">#{hash[:title]}</a> - #{hash[:author]}}
- do_string_default(lng,attrib,html)
- end
- def do_hash_md_maintenance(lng,attrib,hash)
- if @output[lng][:html_mnt].class==File #should not be run for presentation output
- html=%{[<a href="#{hash[:file]}.sst">src</a>]&nbsp;&nbsp;<a href="file://#{@env.path.output}/#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
- do_string_maintenance(lng,attrib,html)
- end
- end
- def do_hash(lng,lv,hash)
- lv+=1
- key=[]
- hash.each_key do |m|
- if m == :md
- do_case(lng,lv,hash[m])
- elsif m != :title \
- and m != :author \
- and m != :filename \
- and m != :file \
- and m != :rough_idx \
- and m != :page
- key << m
- elsif m == :title
- do_hash_md(lng,'work',hash)
- do_hash_md_maintenance(lng,'work',hash)
- end
- end
- if key.length > 0
- key.sort.each do |m|
- attrib="lev#{lv}"
- lv==0 ? do_string_name(lng,attrib,m) : do_string(lng,attrib,m)
- do_case(lng,lv,hash[m])
- end
- end
- end
- def do_case(lng,lv,a)
- y = a.class
- case
- when y==String
- attrib="lev#{lv}"
- lv==0 ? do_string_name(lng,attrib,a) : do_string(lng,attrib,a)
- when y==Array
- do_array(lng,lv,a)
- when y==Hash
- do_hash(lng,lv,a)
- end
- end
- def html_body
- the_idx=@the_idx
- the_idx.each_pair do |lng,lng_array|
- lng_array.sort.each do |a|
- do_case(lng,-1,a)
- end
- end
- end
- self
- end
- def screen_print
- def do_string(lv,string)
- s=' '*4
- puts s*lv + string
- end
- def do_array(lng,lv,array)
- lv+=1
- array.each do |b|
- do_case(lng,lv,b)
- end
- end
- def do_hash_md(lng,lv,hash)
- string=hash[:title] + ' - ' + hash[:author]
- do_string(lng,lv,string)
- end
- def do_hash(lng,lv,hash)
- lv+=1
- key=[]
- hash.each_key do |m|
- if m == :md
- do_case(lng,lv,hash[m])
- elsif m != :title \
- and m != :author \
- and m != :filename \
- and m != :file \
- and m != :rough_idx \
- and m != :page
- key << m
- elsif m == :title
- do_hash_md(lng,lv,hash)
- end
- end
- if key.length > 0
- key.sort.each do |m|
- do_string(lng,lv,m)
- do_case(lng,lv,hash[m])
- end
- end
- end
- def do_case(lng,lv,a)
- s=' '*4
- y = a.class
- case
- when y==String
- do_string(lng,lv,a)
- when y==Array
- do_array(lng,lv,a)
- when y==Hash
- do_hash(lng,lv,a)
- end
- end
- def cycle
- the_idx=@the_idx
- the_idx.keys.each do |lng|
- the_idx[lng].each do |a|
- do_case(lng,-1,a)
- end
- end
- end
- self
- end
- def screen_print_unsorted
- def do_string(lng,lv,string)
- s=' '*4
- puts s*lv + string
- end
- def do_array(lng,lv,array)
- lv+=1
- array.each do |b|
- do_case(lng,lv,b)
- end
- end
- def do_hash_md(lng,lv,hash)
- string=hash[:title] + ' - ' + hash[:author]
- do_string(lng,lv,string)
- end
- def do_hash(lng,lv,hash)
- lv+=1
- hash.each_key do |m|
- if m == :md
- do_case(lng,lv,hash[m])
- else
- if m != :title \
- and m != :author \
- and m != :filename \
- and m != :file \
- and m != :rough_idx \
- and m != :page
- do_string(lng,lv,m)
- do_case(lng,lv,hash[m])
- elsif m == :title
- do_hash_md(lng,lv,hash)
- else
- end
- end
- end
- end
- def do_case(lng,lv,a)
- s=' '*4
- y = a.class
- case
- when y==String
- do_string(lng,lv,a)
- when y==Array
- do_array(lng,lv,a)
- when y==Hash
- do_hash(lng,lv,a)
- end
- end
- def cycle
- the_idx=@the_idx
- the_idx.keys.each do |lng|
- the_idx[lng].each do |a|
- do_case(lng,-1,a)
- end
- end
- end
- self
- end
- end
-end
-__END__
-terms -|_ t{tl1} -|_ {fa}[fa]{filenames and other details}
- | |_ {tl2} -|_ {fa}[fa]{filenames and other details}
- | | |_{tl3} -|_ {fa}[fa]{filenames and other details}
- | | | |_{tl4} - {fa}[fa]{filenames and other details}
- | | | |
- | | | |_{tl4a} - {fa}[fa]{filenames and other details}
- | | | |
- | | | |_{tl4b} - {fa}[fa]{filenames and other details}
- | | | |
- | | | |_ ...
- | | |
- | | |_{tl3a} - {fa}[fa]{filenames and other details}
- | |
- | |_{tl2a} - {fa}[fa]{filenames and other details}
- |
- |_ t{tl1a} -|_ {fa}[fa]{filenames and other details}
- |_ ...