From 0e6fc15ada3c5d9a86b227163f35a54993b32529 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Tue, 2 Dec 2008 23:54:23 -0500 Subject: sisu harvest, introduce module along with header syntax addition & modification * sisu markup, additional header and new format rule: * @creator: / @author: header field, introduced author name format rules for more usable metadata harvesting: surname comma other names, additional authors separated by semi-colon * param added meta-tag, @topic_register: formatting topic levels are separated from sub-levels by a colon, a semi-colon separates main topics if there are multiple topics at lowest sub-level, a pipe can be used to create multiple headings * harvest module, harvests metadata from document set currently extracts: (i) authors and their writings from document set; (ii) topics and associated writings from document set (topics use topic_register header). harvest (when run against documents common to a directory of a site) extracts metadata and organises the documents on a site by author and topic information provided (there is a new "topic_register" header, with formatting rules similar to those of the book index), results are placed in [output_path]/sisu_site_metadata. sisu --harvest *.sst * by author (see change in param @creator: / @author: header field) * by topic / subject index (see addition in param of @topic_register: header field) initially there should be an example samples here: http://www.jus.uio.no/sisu/sisu_site_metadata/harvest_authors.html http://www.jus.uio.no/sisu/sisu_site_metadata/harvest_topics.html together with update markup source files The authors and their writings list will be made to take on a more biblographical form, with the use of additional fields as required. (concept example, suitable for medium sized sites [to remove size constraint: implement SQL equivalent]) make feature more robust * css, for harvest output added * remote placement of sisu_site_metadata (output produced by metadata harvest) * sisu markup, update document samples accordingly * tidy copyright marks in program headers, remove repetition of dates [version bump because formatting rule introduced to author / creator header - where new site metadata harvest feature is used, (at present changes changes should not be noticed except when using metadata harvest)] --- lib/sisu/v0/param.rb | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'lib/sisu/v0/param.rb') diff --git a/lib/sisu/v0/param.rb b/lib/sisu/v0/param.rb index 6ab30e86..2829abe1 100644 --- a/lib/sisu/v0/param.rb +++ b/lib/sisu/v0/param.rb @@ -14,8 +14,7 @@ SiSU, a framework for document structuring, publishing and search - Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008 Ralph Amissah + Copyright (C) Ralph Amissah This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -119,19 +118,20 @@ module SiSU_Param @doc={ :lv=>[] } @doc[:fns],@doc[:fnb],@doc[:scr_suffix]='','','' @@publisher='SiSU scribe' - attr_accessor :cmd,:mod,:env,:fn,:fns,:fnb,:fnn,:fnt,:fnl,:flv,:fnz,:fnstex,:ocn,:sfx_src,:sfx,:pdf,:file_type,:dir_out,:dir_tex,:dir_lout,:txt_path,:site_skin,:sisu,:sisu_version,:ruby_version,:title,:dc_title,:html_title,:subtitle,:subtitle_tex,:creator_home,:dc_creator,:translator,:illustrator,:prepared_by,:digitized_by,:dc_subject,:dc_description,:dc_publisher,:dc_contributor,:dc_date,:dc_date_created,:dc_date_issued,:dc_date_available,:dc_date_valid,:dc_date_modified,:date_scheme,:date_created_scheme,:date_issued_scheme,:date_available_scheme,:date_valid_scheme,:date_modified_scheme,:dc_type,:dc_format,:dc_identifier,:dc_source,:dc_language,:language_original,:dc_relation,:dc_coverage,:dc_rights,:keywords,:comments,:abstract,:cls_loc,:cls_dewey,:cls_pg,:cls_isbn,:papersize,:papersize_array,:toc,:lv1,:lv2,:lv3,:lv4,:lv5,:lv6,:pagenew,:pagebreak,:num_top,:toc_lev_limit,:flag_endnotes,:flag_auto_endnotes,:flag_separate_endnotes,:flag_separate_endnotes_make,:flag_auto_heading_num,:markup,:markup_instruction,:markup_version,:markup_declared,:make_bold,:make_italic,:flag_tables,:vocabulary,:doc_skin,:doc_css,:yaml,:lnk,:prefix_a,:prefix_b,:suffix,:information,:contact,:icon,:image,:ad_url,:ad_png,:ad_alt,:ad_began,:flag_promo,:promo,:ad_home,:stmp,:stmpd,:sc_filename,:sc_number,:sc_date,:sc_time,:sc_info,:yamladdr,:locale,:wc_lines,:wc_words,:wc_bytes,:file_encoding,:file_size,:user,:home,:hostname,:pwd,:firstseg,:programs,:creator_copymark,:lang,:en,:dgst,:dgst_skin,:generated,:tags,:tag_array,:concord_make,:seg_names,:seg_autoname_safe,:set_header_title,:set_heading_top,:set_heading_seg,:heading_seg_first,:heading_seg_first_flag,:base_program,:man_section,:man_name,:man_synopsis,:ec,:opt,:sem_tag,:book_idx,:doc_cont_idx + attr_accessor :cmd,:mod,:env,:fn,:fns,:fnb,:fnn,:fnt,:fnl,:flv,:fnz,:fnstex,:ocn,:sfx_src,:sfx,:pdf,:file_type,:dir_out,:dir_tex,:dir_lout,:txt_path,:site_skin,:sisu,:sisu_version,:ruby_version,:title,:dc_title,:html_title,:subtitle,:subtitle_tex,:creator_home,:dc_creator,:authors,:authorship,:translator,:illustrator,:prepared_by,:digitized_by,:dc_subject,:dc_description,:dc_publisher,:dc_contributor,:dc_date,:dc_date_created,:dc_date_issued,:dc_date_available,:dc_date_valid,:dc_date_modified,:date_scheme,:date_created_scheme,:date_issued_scheme,:date_available_scheme,:date_valid_scheme,:date_modified_scheme,:dc_type,:dc_format,:dc_identifier,:dc_source,:dc_language,:language_original,:dc_relation,:dc_coverage,:dc_rights,:keywords,:comments,:abstract,:cls_loc,:cls_dewey,:cls_pg,:cls_isbn,:papersize,:papersize_array,:toc,:lv1,:lv2,:lv3,:lv4,:lv5,:lv6,:pagenew,:pagebreak,:num_top,:toc_lev_limit,:flag_endnotes,:flag_auto_endnotes,:flag_separate_endnotes,:flag_separate_endnotes_make,:flag_auto_heading_num,:markup,:markup_instruction,:markup_version,:markup_declared,:make_bold,:make_italic,:flag_tables,:vocabulary,:doc_skin,:doc_css,:yaml,:lnk,:prefix_a,:prefix_b,:suffix,:information,:contact,:icon,:image,:ad_url,:ad_png,:ad_alt,:ad_began,:flag_promo,:promo,:ad_home,:stmp,:stmpd,:sc_filename,:sc_number,:sc_date,:sc_time,:sc_info,:yamladdr,:locale,:wc_lines,:wc_words,:wc_bytes,:file_encoding,:file_size,:user,:home,:hostname,:pwd,:firstseg,:programs,:creator_copymark,:lang,:en,:dgst,:dgst_skin,:generated,:tags,:tag_array,:concord_make,:seg_names,:seg_autoname_safe,:set_header_title,:set_heading_top,:set_heading_seg,:heading_seg_first,:heading_seg_first_flag,:base_program,:man_section,:man_name,:man_synopsis,:ec,:opt,:sem_tag,:book_idx,:topic_register,:original_publication_details def initialize(fns_array,opt) - @env=@fn=@fns=@fnb=@fnn=@fnt=@fnl=@flv=@fnz=@fnstex=@ocn=@sfx_src=@sfx=@pdf=@file_type=@dir_out=@dir_tex=@dir_lout=@txt_path=@flag_endnotes=@flag_auto_endnotes=@flag_separate_endnotes=@flag_separate_endnotes_make=@site_skin=@sisu=@sisu_version=@ruby_version=@title=@dc_title=@html_title=@subtitle=@subtitle_tex=@creator_home=@dc_creator=@translator=@illustrator=@prepared_by=@digitized_by=@dc_subject=@dc_description=@dc_publisher=@dc_contributor=@dc_date=@dc_date_created=@dc_date_issued=@dc_date_available=@dc_date_valid=@dc_date_modified=@date_scheme=@date_created_scheme=@date_issued_scheme=@date_available_scheme=@date_valid_scheme=@date_modified_scheme=@dc_type=@dc_format=@dc_identifier=@dc_source=@dc_language=@language_original=@dc_relation=@dc_coverage=@dc_rights=@keywords=@comments=@abstract=@cls_loc=@cls_dewey=@cls_pg=@cls_isbn=@papersize=@toc=@lv1=@lv2=@lv3=@lv4=@lv5=@lv6=@pagenew=@pagebreak=@num_top=@toc_lev_limit=@flag_auto_heading_num=@make_bold=@make_italic=@flag_tables=@vocabulary=@doc_skin=@doc_css=@yaml=@lnk=@prefix_a=@prefix_b=@suffix=@information=@contact=@icon=@ad_url=@ad_png=@ad_alt=@ad_began=@promo=@ad_home=@stmp=@stmpd=@sc_filename=@sc_number=@sc_date=@sc_time=@sc_info=@yamladdr=@locale=@wc_lines=@wc_words=@wc_bytes=@file_encoding=@file_size=@firstseg=@programs=@creator_copymark=@lang=@en=@dgst=@dgst_skin=@generated=@heading_seg_first=@base_program=@man_synopsis=@doc_cont_idx=nil + @env=@fn=@fns=@fnb=@fnn=@fnt=@fnl=@flv=@fnz=@fnstex=@ocn=@sfx_src=@sfx=@pdf=@file_type=@dir_out=@dir_tex=@dir_lout=@txt_path=@flag_endnotes=@flag_auto_endnotes=@flag_separate_endnotes=@flag_separate_endnotes_make=@site_skin=@sisu=@sisu_version=@ruby_version=@title=@dc_title=@html_title=@subtitle=@subtitle_tex=@creator_home=@dc_creator=@translator=@illustrator=@prepared_by=@digitized_by=@dc_subject=@dc_description=@dc_publisher=@dc_contributor=@dc_date=@dc_date_created=@dc_date_issued=@dc_date_available=@dc_date_valid=@dc_date_modified=@date_scheme=@date_created_scheme=@date_issued_scheme=@date_available_scheme=@date_valid_scheme=@date_modified_scheme=@dc_type=@dc_format=@dc_identifier=@dc_source=@dc_language=@language_original=@dc_relation=@dc_coverage=@dc_rights=@keywords=@comments=@abstract=@cls_loc=@cls_dewey=@cls_pg=@cls_isbn=@papersize=@toc=@lv1=@lv2=@lv3=@lv4=@lv5=@lv6=@pagenew=@pagebreak=@num_top=@toc_lev_limit=@flag_auto_heading_num=@make_bold=@make_italic=@flag_tables=@vocabulary=@doc_skin=@doc_css=@yaml=@lnk=@prefix_a=@prefix_b=@suffix=@information=@contact=@icon=@ad_url=@ad_png=@ad_alt=@ad_began=@promo=@ad_home=@stmp=@stmpd=@sc_filename=@sc_number=@sc_date=@sc_time=@sc_info=@yamladdr=@locale=@wc_lines=@wc_words=@wc_bytes=@file_encoding=@file_size=@firstseg=@programs=@creator_copymark=@lang=@en=@dgst=@dgst_skin=@generated=@heading_seg_first=@base_program=@man_synopsis=@topic_register=@original_publication_details=nil @man_section=1 @man_name='man page "name/whatis" information not provided, set in header @man: name=[whatis information]' @data,@fns,@cmd,@mod,@opt=fns_array,opt.fns,opt.cmd,opt.mod,opt #@data used as data @flag_tables,@set_header_title,@set_heading_top,@set_heading_seg,@heading_seg_first_flag,@flag_promo,@book_idx=false,false,false,false,false,false,false @seg_autoname_safe=true @sem_tag=false - @markup_instruction,@markup_declared,@image='','','' #check which other values should be set to empty rather than nil + @authorship,@markup_instruction,@markup_declared,@image='','','','' #check which other values should be set to empty rather than nil @markup=@markup_instruction #use @markup_instruction @doc,@fn,@make_italic,@make_bold,@tag_hash,@ec={},{},{},{},{},{},{} @flv,@lang,@seg_names,@tags,@tag_array,@tag_a,@ec[:image],@ec[:audio],@ec[:multimedia]=Array.new(9){[]} + @authors=[] @papersize_array=[] @rgx_image=/(?:^|[^_\\])\{\s*(\S+?\.(?:png|jpg|gif))/ @rgx_audio=/\{\s*(\S+?\.(?:mp3|ogg))/ @@ -291,6 +291,38 @@ module SiSU_Param else @dc_creator=/(?:0~|@)(?:creator|author)-?:?\s+(.+?)$/m.match(para)[1] end @dc_creator.strip! + authors=@dc_creator.scan(/[^;]+/) + authors.each do |a| + if a =~/"(.+?)"/ + @authors << { :the => $1 } + else #if a =~/,/ + x=a.scan(/[^,]+/) + if x.length == 1 + @authors << { :the => x[0].strip } + elsif x.length == 2 + @authors << { :the => x[0].strip, :others => x[1].strip } + else #p x.length + end + end + end + l = @authors.length + authorship='' + @authors.each_with_index do |a,i| + authorship += if a[:others] + if (l - i) > 1 + "#{a[:others].strip} #{a[:the].strip}, " + else + "#{a[:others].strip} #{a[:the].strip}" + end + else + if (l - i) > 2 + "#{a[:the].strip}, " + else + "#{a[:the].strip}" + end + end + end + @authorship=@dc_creator=authorship when /^(?:0~(?:translator|translated_by)|@(?:translator|translated_by):)\s+(.+?)$/m #% metainfo @translator=$1 when /^(?:0~(?:illustrator|illustrated_by)|@(?:illustrator|illustrated_by):)\s+(.+?)$/m #% metainfo @@ -517,8 +549,9 @@ module SiSU_Param end when /^(?:0~suffix|@suffix:)\s+(.+?)$/m; @suffix=$1 #% metainfo when /^(?:0~information|@information:)\s+(.+?)$/m; @information=$1 #% metainfo - when /^(?:0~doc_cont(?:ent)?_in?de?x|@doc_cont(?:ent)?_in?de?x:)\s+(.+?)$/m; @doc_cont_idx=$1 #% metainfo, similar syntax to book index, leave out the ={} i.e. use equivalent of ={(.+?)} + when /^(?:0~topic_register|@topic_register:)\s+(.+?)$/m; @topic_register=$1 #% metainfo, similar syntax to book index, leave out the ={} i.e. use equivalent of ={(.+?)} when /^(?:0~contact|@contact:)\s+(.+?)$/m; @contact=$1 #% metainfo + when /^(?:0~original_publication|@original_publication:)\s+(.+?)$/m; @original_publication=$1 #% details of original publication when /^(?:0~icon|@icon:)\s+(.+?)$/m; @icon=$1 #% processing when /^(?:0~promo|@promo:)\s+(.+?)$/m @flag_promo=true -- cgit v1.2.3