diff options
Diffstat (limited to 'lib/sisu/html_harvest_authors.rb')
-rw-r--r-- | lib/sisu/html_harvest_authors.rb | 466 |
1 files changed, 466 insertions, 0 deletions
diff --git a/lib/sisu/html_harvest_authors.rb b/lib/sisu/html_harvest_authors.rb new file mode 100644 index 00000000..c6895ec8 --- /dev/null +++ b/lib/sisu/html_harvest_authors.rb @@ -0,0 +1,466 @@ +# encoding: utf-8 +=begin + +* Name: SiSU + +** Description: documents, structuring, processing, publishing, search +*** metadata harvest, extract authors and their writings from document set + +** Author: Ralph Amissah + <ralph@amissah.com> + <ralph.amissah@gmail.com> + +** Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, + 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Ralph Amissah, + All Rights Reserved. + +** License: GPL 3 or later: + + SiSU, a framework for document structuring, publishing and search + + Copyright (C) Ralph Amissah + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program. If not, see <http://www.gnu.org/licenses/>. + + If you have Internet connection, the latest version of the GPL should be + available at these locations: + <http://www.fsf.org/licensing/licenses/gpl.html> + <http://www.gnu.org/licenses/gpl.html> + + <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html> + +** SiSU uses: + * Standard SiSU markup syntax, + * Standard SiSU meta-markup syntax, and the + * Standard SiSU object citation numbering and system + +** Hompages: + <http://www.jus.uio.no/sisu> + <http://www.sisudoc.org> + +** Git + <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary> + <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/harvest_authors.rb;hb=HEAD> + +=end +module SiSU_HarvestAuthors + require_relative 'html_harvest_author_format' # html_harvest_author_format.rb + require_relative 'html_parts' # html_parts.rb + class Songsheet + @@the_idx_authors={} + def initialize(opt,env) + @opt,@env=opt,env + @file_list=opt.files + end + def songsheet + idx_array={} + @opt.f_pths.each do |y| + lang_hash_file_array={} + name=y[:f] + filename=y[:pth] + '/' + y[:f] + File.open(filename,'r') do |file| + file.each_line("\n\n") do |line| + if line =~/^@(?:title|creator|date):(?:\s|$)/m + lang_hash_file_array[y[:lng_is]] ||= [] + lang_hash_file_array[y[:lng_is]] << line + elsif line =~/^@\S+?:(?:\s|$)/m \ + or line =~/^(?:\s*\n|%+ )/ + else break + end + end + end + lang_hash_file_array.each_pair do |lang,a| + idx_array[lang] ||= [] + idx_array=SiSU_HarvestAuthors::Harvest.new( + @opt, + @env, + a, + filename, + name, + idx_array, + lang + ).extract_harvest + end + end + the_idx=SiSU_HarvestAuthors::Index.new( + idx_array, + @@the_idx_authors + ).construct_book_author_index + SiSU_HarvestAuthors::OutputIndex.new( + @opt, + the_idx + ).html_print.html_songsheet + end + end + class Harvest + def initialize(opt,env,data,filename,name,idx_array,lang) + @opt, @env,@data,@filename,@name,@idx_array,@lang= + opt,env, data, filename, name, idx_array, lang + end + def extract_harvest + data, filename, name, idx_array, lang = + @data,@filename,@name,@idx_array,@lang + @title=@subtitle=@fulltitle=@author=@author_format=@date=nil + @authors=[] + rgx={} + rgx[:author]=/^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m + rgx[:title]=/^@title:[ ]+(.+)/ + rgx[:subtitle]=/^@title:.+?:subtitle:[ ]+(.+?)\n/m + rgx[:date]=/^@date:(?:[ ]+|.+?:published:[ ]+)(\d{4})/m + data.each do |para| + if para=~ rgx[:title] + @title=rgx[:title].match(para)[1] + end + if para=~ rgx[:subtitle] + @subtitle=rgx[:subtitle].match(para)[1] + end + if para=~ rgx[:author] + @author_format=rgx[:author].match(para)[1] + end + if para=~ rgx[:date] + @date=rgx[:date].match(para)[1] + end + break if @title && @subtitle && @author && @date + end + @fulltitle=@subtitle \ + ? (@title + ' - ' + @subtitle) + : @title + if @title \ + and @author_format + creator=SiSU_FormatAuthor::Author.new(@author_format.strip).author_details + @authors,@authorship=creator[:authors],creator[:authorship] + file=if name=~/~[a-z]{2,3}\.ss[mt]$/ + name.sub(/~[a-z]{2,3}\.ss[mt]$/,'') + else + name.sub(/\.ss[mt]$/,'') + end + page=if @env.output_dir_structure.by? == :language + "#{lang}/sisu_manifest.html" + else + "sisu_manifest.#{lang}.html" + end + idx_array[lang] <<= { + filename: filename, + file: file, + date: @date, + title: @fulltitle, + author: creator, + page: page, + lang: lang + } + else + #p "missing author field: #{@filename} title: #{@title}; author: #{@author_format}" + end + idx_array[lang]=idx_array[lang].flatten + idx_array + end + end + class Index + def initialize(idx_array,the_idx) + @idx_array,@the_idx=idx_array,the_idx + @@the_idx_authors=@the_idx + end + def capital(txt) + txt[0].chr.capitalize + txt[1,txt.length] + end + def construct_book_author_index + idx_array=@idx_array + idx_array.each_pair do |lang,idx_arr| + @@the_idx_authors[lang] ||= {} + idx_arr.each do |idx| + idx[:author][:last_first_format_a].each do |author| + author=author.strip + if @@the_idx_authors[lang][author].is_a?(NilClass) + @@the_idx_authors[lang][author]={ md: [] } + end + @@the_idx_authors[lang][author][:md] << { + filename: idx[:filename], + file: idx[:file], + author: idx[:author], + title: idx[:title], + date: idx[:date], + page: idx[:page], + lang: idx[:lang] + } + end + end + end + @the_idx=@@the_idx_authors + end + end + class OutputIndex + require_relative 'i18n' # i18n.rb + def initialize(opt,the_idx) + @opt,@the_idx=opt,the_idx + @env=SiSU_Env::InfoEnv.new + @rc=SiSU_Env::GetInit.new.sisu_yaml.rc + @alphabet_list=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z] + @alph=@alphabet_list.dup + @letter=@alph.shift + end + def html_file_open + @the_idx.keys.each do |lng| + @output ||={} + @output[lng] ||={} + harvest_pth,file='','' + if @env.output_dir_structure.by? == :language + harvest_pth=@env.path.webserv + '/' \ + + @opt.base_stub + '/' \ + + lng + '/' \ + + 'manifest' + file="#{harvest_pth}/authors.html" + elsif @env.output_dir_structure.by? == :filetype + harvest_pth=@env.path.webserv + '/' \ + + @opt.base_stub + '/' \ + + 'manifest' + file="#{harvest_pth}/authors.#{lng}.html" + elsif @env.output_dir_structure.by? == :filename + harvest_pth=@env.path.webserv + '/' \ + + @opt.base_stub + file="#{harvest_pth}/authors.#{lng}.html" + end + FileUtils::mkdir_p(harvest_pth) \ + unless FileTest.directory?(harvest_pth) + fileinfo=(@opt.act[:verbose][:set]==:on \ + || @opt.act[:verbose_plus][:set]==:on \ + || @opt.act[:urls_selected][:set]==:on \ + || @opt.act[:maintenance][:set]==:on) \ + ? ("file://#{file}") : '' + SiSU_Screen::Ansi.new( + @opt.act[:color_state][:set], + "harvest authors (#{@opt.files.length} files)", + fileinfo + ).dark_grey_title_hi unless @opt.act[:quiet][:set]==:on + @output[lng][:html]=File.new(file,'w') + end + end + def html_file_close + @the_idx.keys.each do |lng| + @output[lng][:html].close + @output[lng][:html_mnt].close \ + if @output[lng][:html_mnt].is_a?(File) + end + end + def html_print + def html_songsheet + html_file_open + html_head + html_alph + html_body + html_tail + html_file_close + end + def html_head_adjust(lng,type='') + css_path,topics='','' + if @env.output_dir_structure.by? == :language + css_path=(type !~/maintenance/) \ + ? '../../_sisu/css/harvest.css' + : 'harvest.css' + topics='topics.html' + elsif @env.output_dir_structure.by? == :filetype + css_path=(type !~/maintenance/) \ + ? '../_sisu/css/harvest.css' + : 'harvest.css' + topics="topics.#{lng}.html" + elsif @env.output_dir_structure.by? == :filename + css_path=(type !~/maintenance/) \ + ? './_sisu/css/harvest.css' + : 'harvest.css' + topics="topics.#{lng}.html" + end + ln=SiSU_i18n::Languages.new.language.list + harvest_languages='' + @the_idx.keys.each do |lg| + if @env.output_dir_structure.by? == :language + harvest_pth="../../#{lg}/manifest" + file="#{harvest_pth}/authors.html" + elsif @env.output_dir_structure.by? == :filetype + harvest_pth='.' + file="#{harvest_pth}/authors.#{lg}.html" + elsif @env.output_dir_structure.by? == :filename + harvest_pth='.' + file="#{harvest_pth}/authors.#{lg}.html" + end + l=ln[lg][:t] + harvest_languages += + %{<a href="#{file}">#{l}</a> } + end + sv=SiSU_Env::InfoVersion.instance.get_version + if @env.output_dir_structure.by? == :language + home_pth='../..' + output_structure_by= + '(output organised by language & filetype)' + elsif @env.output_dir_structure.by? == :filetype + home_pth='..' + output_structure_by= + '(output organised by filetype)' + elsif @env.output_dir_structure.by? == :filename + home_pth='.' + output_structure_by= + '(output organised by filename)' + else + home_pth='.' + output_structure_by='(output organised by ?)' + end + <<WOK +<!DOCTYPE html> +<html> +<head> +<meta charset="utf-8"> +<title>SiSU Metadata Harvest - Authors</title> +<meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> +<meta name="dc.title" content= "SiSU metadata harvest, Authors - SiSU information Structuring Universe, Structured information Serialised Units" /> +<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" /> +<meta name="generator" content="#{sv.project} #{sv.version} of #{sv.date_stamp} (n*x and Ruby!)" /> +<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" /> +<link href="#{css_path}" rel="stylesheet" > +<link rel="shortcut icon" href="../_sisu/image/rb7.ico" /> +</head> +<body lang="en" xml:lang="en"> +<a name="top" id="top"></a> +<a name="up" id="up"></a> +<a name="start" id="start"></a> +<h1>SiSU Metadata Harvest - Authors #{output_structure_by}</h1> +<p>[<a href="#{home_pth}/index.html"> HOME </a>] also see <a href="#{topics}">SiSU Metadata Harvest - Topics</a></p> +<p>#{@env.widget_static.search_form}</p> +<hr /> +<p class="tiny">#{harvest_languages}</p> +<hr /> +WOK + end + def html_head + @the_idx.keys.each do |lng| + @output[lng][:html_mnt] \ + << html_head_adjust(lng,'maintenance') \ + if @opt.act[:maintenance][:set]==:on + @output[lng][:html] \ + << html_head_adjust(lng) + end + end + def html_alph + a=[] + a << '<p>' + @alph.each do |x| + a << ((x =~/[0-9]/) \ + ? '' + : %{<a href="##{x}">#{x}</a>, }) + end + a=a.join + @the_idx.keys.each do |lng| + @output[lng][:html_mnt] << a \ + if @opt.act[:maintenance][:set]==:on + @output[lng][:html] << a + end + end + def html_tail + a =<<WOK +<hr /> +<a name="bottom" id="bottom"></a> +<a name="down" id="down"></a> +<a name="end" id="end"></a> +<a name="finish" id="finish"></a> +<a name="stop" id="stop"></a> +<a name="credits"></a> +#{SiSU_Proj_HTML::Bits.new.credits_sisu} +</body> +</html> +WOK + @the_idx.keys.each do |lng| + @output[lng][:html_mnt] << a \ + if @output[lng][:html_mnt].is_a?(File) + @output[lng][:html] << a + end + end + def do_html(lng,html) + @output[lng][:html_mnt] << html \ + if @output[lng][:html_mnt].is_a?(File) + @output[lng][:html] << html + end + def do_string_name(lng,attrib,string) + f=/^(\S)/.match(string[0])[1] + if @lng != lng + @alph=@alphabet_list.dup + @letter=@alph.shift + @lng = lng + end + if @letter < f + while @letter < f + if @alph.length > 0 + @letter=@alph.shift + if @output[lng][:html_mnt].is_a?(File) + @output[lng][:html_mnt] \ + << %{\n<p class="letter"><a name="#{@letter}"></p>#{@letter}</a><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>} + end + @output[lng][:html] \ + << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>} + else break + end + end + end + end + def html_body + the_idx=@the_idx + the_idx.each_pair do |lng,lng_array| + lng_array.sort.each do |a| + do_string_name(lng,'',a) + name=a[0].sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_') + x = %{<p class="author"><a name="#{name}">#{a[0]}</a></p>} + if @output[lng][:html_mnt].is_a?(File) + @output[lng][:html_mnt] << x + end + @output[lng][:html] << x + lang_code_insert=SiSU_Env::FilenameLanguageCodeInsert.new(@opt,lng).language_code_insert + works=[] + a[1][:md].each do |i| + manifest_at=if @env.output_dir_structure.by? == :language + i[:file] + Sfx[:html] + elsif @env.output_dir_structure.by? == :filetype + i[:file] + lang_code_insert + Sfx[:html] + elsif @env.output_dir_structure.by? == :filename + './' + i[:file] + '/' + i[:page] + else '' #error + end + work=[ + "#{i[:date]} #{i[:title]}", + %{<p class="publication">#{i[:date]} <a href="#{manifest_at}">#{i[:title]}</a>, #{i[:author][:authors_s]}</p>} + ] + works<<=(@output[lng][:html_mnt].is_a?(File)) \ + ? (work.concat([%{<p class="publication">[<a href="#{i[:file]}.sst">src</a>] #{i[:date]} <a href="file://#{manifest_at}">#{i[:title]}</a>, #{i[:author][:authors_s]} -- [<a href="#{i[:file]}.sst">#{i[:file]}.sst</a>]</p>}])) + : work + end + works.sort_by {|y| y[0]}.each do |z| + @output[lng][:html] << z[1] + @output[lng][:html_mnt] << z[2] \ + if @output[lng][:html_mnt].is_a?(File) + end + end + end + end + self + end + def screen_print + def cycle + the_idx=@the_idx + the_idx.sort.each do |a| + puts a[0] + a[1][:md].each do |x| + puts "\t" + x[:file] + end + end + end + self + end + end +end +__END__ |