diff options
author | Ralph Amissah <ralph.amissah@gmail.com> | 2007-06-02 11:27:06 +0100 |
---|---|---|
committer | Ralph Amissah <ralph.amissah@gmail.com> | 2007-06-02 11:27:06 +0100 |
commit | 26767cc88c0548ad7978021796d0ccc4c9f7ffed (patch) | |
tree | fe225e99e180b5d2925cbf776826f74db27e1888 /lib/sisu/v0/dal.rb | |
parent | restrict use to ruby1.8 branch, i.e. < 1.9 (diff) |
0.53.0, pre-build, see changelog, library naming changed for scm, placed under v0 (instead of 0.53) upstream/0.53.0 sisu_0.53.0
Diffstat (limited to 'lib/sisu/v0/dal.rb')
-rw-r--r-- | lib/sisu/v0/dal.rb | 1066 |
1 files changed, 1066 insertions, 0 deletions
diff --git a/lib/sisu/v0/dal.rb b/lib/sisu/v0/dal.rb new file mode 100644 index 00000000..2b711609 --- /dev/null +++ b/lib/sisu/v0/dal.rb @@ -0,0 +1,1066 @@ +=begin + * Name: SiSU information Structuring Universe - Structured information, Serialized Units + * Author: Ralph Amissah + * http://www.jus.uio.no/sisu + * http://www.jus.uio.no/sisu/SiSU/download.html + + * Description: preprocessing, (document abstraction), data abstraction used in subsequent processing + + * Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah + + * License: GPL 2 or later + + Summary of GPL 2 + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + If you have Internet connection, the latest version of the GPL should be + available at these locations: + http://www.fsf.org/licenses/gpl.html + http://www.gnu.org/copyleft/gpl.html + http://www.jus.uio.no/sisu/gpl2.fsf + + SiSU was first released to the public on January 4th 2005 + + SiSU uses: + + * Standard SiSU markup syntax, + * Standard SiSU meta-markup syntax, and the + * Standard SiSU object citation numbering and system + + © Ralph Amissah 1997, current 2007. + All Rights Reserved. 
+ + * Ralph Amissah: ralph@amissah.com + ralph.amissah@gmail.com +=end +module SiSU_DAL + require "#{SiSU_lib}/defaults" + require "#{SiSU_lib}/sysenv" + require "#{SiSU_lib}/param" + require "#{SiSU_lib}/dal_syntax" + require "#{SiSU_lib}/dal_doc_str" + require "#{SiSU_lib}/i18n" + include SiSU_Env + include SiSU_Param + include SiSU_Viz + include Syntax + class Instantiate < SiSU_Param::Parameters::Instructions + def initialize + @@flag_vocab=0 + @@endnote={} + @@endnote_array=@@word_mode=[] + @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 #added + @@line_mode='' + end + end + class Source <Instantiate + @@dal_array=[] + @@fns=nil + def initialize(opt) + @opt=opt + @@fns||@opt.fns + @my_make_fns=SiSU_Env::Create_file.new(@opt.cmd,@opt.fns) + @fnm=@my_make_fns.marshal_meta + SiSU_Env::Create_system_link.new.images + end + def read #creates dal + begin + dal=[] + @@dal_array=[] + @@fns=@opt.fns + create_dal + rescue; SiSU_Errors::Info_error.new($!,$@,@opt.cmd,@opt.fns).error + ensure + Instantiate.new + end + end + def get #reads dal, unless does not exist then creates first + begin + dal=[] + unless @@fns==@opt.fns + @@fns=@opt.fns + @@dal_array=[] + end + dal=if @@dal_array.empty?; read_fnm + else @@dal_array.dup #check + end + rescue; SiSU_Errors::Info_error.new($!,$@,@opt.cmd,@opt.fns).error + ensure + Instantiate.new + end + end + protected + def create_dal + dal_array=[] + tell=SiSU_Screen::Ansi.new(@opt.cmd,'Document Abstraction') + tell.green_title_hi unless @opt.cmd =~/q/ + file_array=IO.readlines(@opt.fns,'') + file_array.each do |l| + if l =~/\r\n/; l.gsub!(/\r\n/,"\n") + end + end + meta=file_array.dup + meta=meta.join.split("\n\n") #check whether can be eliminated, some of these are large objects to have twice + @md=SiSU_Param::Parameters::Instructions.new(meta,@opt).extract + meta=nil + dal=SiSU_DAL::Make.new(@md,file_array).song + SiSU_Screen::Ansi.new(@md.cmd,@md.fns,"~meta/#{@md.fns}.meta").output if @md.cmd =~/v/ + 
tell=SiSU_Screen::Ansi.new(@md.cmd,"dal -> #{@my_make_fns.meta}") if @md.cmd =~/M/ + tell.txt_grey unless @md.cmd =~/q/ + dal.each{|s| dal_array << "#{s.strip}\n\n" unless s.strip.empty?} + dal_array + end + def read_fnm + dal=[] + dal=if FileTest.file?(@fnm); File.open(@fnm){ |f| dal=Marshal.load(f)} + else SiSU_DAL::Source.new(@opt).create_dal + end + end + end + class Output + def initialize(md,data) + @md,@data=md,data + @my_make=SiSU_Env::Create_file.new(@md.cmd,@md.fns) + dir=SiSU_Env::Info_env.new(@md.fns) + @hard="#{dir.path.dal}/#{@md.fns}.meta" + end + def hard_output + if @md.cmd =~/M/ + filename_meta=@my_make.file_meta + @data.each {|s| filename_meta.puts s.strip + "\n\n" unless s.strip.empty?} + else File.unlink(@hard) if FileTest.file?(@hard) + end + end + def marshal + marshal_meta=@my_make.marshal_meta + File.open(marshal_meta,'w'){|f| Marshal.dump(@data.to_a,f)} + end + end + class Make + @@endnote={} + @@endnote_array=@@word_mode=[] + @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 + @@comment='%' + @@dp=nil + def initialize(md,data) + @md,@data=md,data + @@word_mode=[] + @env=SiSU_Env::Info_env.new(@md.fns) + @skin=SiSU_Env::Info_skin.new(@md) + @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern + l=SiSU_Env::Standardise_language.new.file_to_language(@md.fns) + @language=l[:l] + @tr=SiSU_Translate::Source.new(@md,@language) + end + def reset + @@flag_vocab=0 + @@endnote={} + @@endnote_array=@@word_mode=[] + @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 + @@line_mode='' + end + def song + reset + data=@data + @metafile="#{@env.path.dal}/#{@md.fns}.meta" + my_make_source_file=SiSU_Env::Create_file.new(@md.cmd,@md.fns) + data=data.join.split("\n\n") + data=SiSU_document_structure::Code.new(@md,data).code + data_new=[] + data.each do |x| + data_new << if x =~ /\n\n/m; x.split(/\n\n+/) + else x + end + end + data=data_new.flatten + data=substitutions_and_insertions?(data) + 
data=Syntax::Markup.new(@md,data).songsheet + data=character_check(data) + data=images(data) + data=SiSU_document_structure::Tables.new(@md,data).tables + data=numbering_song(data) #tr issue + data=endnotes(data) + data=object_digest(data) + meta=metadata(data) + outputdata=data + meta + if @md.cmd =~/[mM]/ + SiSU_DAL::Output.new(@md,outputdata).hard_output + SiSU_DAL::Output.new(@md,outputdata).marshal + end + reset + outputdata + end + protected + def character_check(data) + require 'iconv' + reset + @tuned_file=[] + endnote_no=1 + data.each do |para| + para.strip! + para.gsub!(/^([12])~\?\s+/,'\1~ ') #conditional header for incorporated document 2004w12 + para.gsub!(/^[{~}]\s*$/,'') + para.gsub!(/^#{@@comment}.*/,'') #remove comment and divider #% + para.gsub!(/<~#>|~#\s*/,'<~#>') + para.gsub!(/-#\s*/,'<-#><~#>') + #para.gsub!(/(#\{{3} arch-tag:|0\{{3}~cvs)\s+/, "0{{~rcs ") #KEEP ... ENABLE WIDER USE OF REVISION CONTROL + para.gsub!(/(~\{ )\s+/,'\1') + para.gsub!(/ \/\//,'<br />') #added 2004w29 + para.gsub!(/<br>/,'<br />') #needed by xml, xhtml etc. + #para.gsub!(/<p>/,'<p />') #consider + para.gsub!(/`/,"'") + para.gsub!(/\342\200\231/,"'") #if para =~/’/ #Avoid #‘ ’ #“ ” + para.gsub!(/\t/,' ') + para.gsub!(/�/,' ') #watch, replace with char code + para.gsub!(/[“”]/,'""') + para.gsub!(/[–—]/,'-') #— – chk + para.gsub!(/·/,'*') + para.gsub!(/\\copy(?:right)?\b/,'©') + para.gsub!(/\\trademark\b|\\tm\b/,'®') + #non_utf8(para) + para=para + "\n" + case para + when /\^~/ # endnotes + #% Note must do this first (earlier loop) and then enter gathered data into ~^\d+ + sub_para=para.dup + @@endnote_array << sub_para.gsub!(/\n/,'').gsub!(/\^~\s+(.+)\s*/, %{~\{#{endnote_no} \\1 \}~}).strip + endnote_no+=1 + para=nil if para =~/\^~ .+/ #removes 'binary' endnote now in endnote array for later insertion + end + @tuned_file << para unless para.nil? 
+ end + @tuned_file=@tuned_file.flatten + end + def images(data) + tuned_file=[] + @rmgk=false + if SiSU_Env::Info_settings.new.program?('rmagick'); @rmgk=SiSU_Env::Load.new('RMagick').prog + else tell=SiSU_Screen::Ansi.new(@md.cmd,'use of RMagick is not enabled in sisurc.yml') + tell.warn if @md.cmd =~/[vVM]/ + end + data.each do |para| + para.strip! + if para =~/\{\s*\S+\.(?:png|jpg|gif)(?:\s*|\s+.+)?\}(?:(?:https?|ftp):\S+|image)/ + if para !~/\{\s*\S+\.(?:png|jpg|gif)\s+\d+x\d+\s+/ + m=/\{\s*(\S+\.(?:png|jpg|gif))/ + if @rmgk + imgs=para.scan(m).flatten + images=imgs.each do |image| + dir=SiSU_Env::Info_env.new(@md.fns) + path_image=[dir.path.image_source_local_tex,dir.path.image_source_remote_tex,dir.path.image_source_tex] + image_path=nil + path_image.each do |image_path| + break if FileTest.exist?("#{image_path}/#{image}") + end + if FileTest.exist?("#{image_path}/#{image}") + img=Magick::ImageList.new("#{image_path}/#{image}") + img_col,img_row=img.columns,img.rows + if img_col > img_row #landscape + if img_col> 640 #480 + img_col=640 #480 + img_row=((1.00*img_col/img.columns)*img.rows).round + end + else #portrait + if img_col> 640 #480 + img_col=640 #480 + img_row=((1.00*img_col/img.columns)*img.rows).round + end + if img_row > 640 + img_row=640 + img_col=((1.00*img_row/img.rows)*img.columns).round + end + end + para.gsub!(/(#{image})/,"#{image} #{img_col}x#{img_row}") + else para.gsub!(/\{\s*(\S+)\.(png|jpg|gif).+?\}((?:https?|ftp):\S+|image)/,'[ \1 (\2 missing) ]') + end + end + else + images=para.scan(m) do |image| + tell=SiSU_Screen::Ansi.new(@md.cmd,'where image dimensions have not been provided RMagick is required',image) + tell.warn #unless @opt.cmd =~/q/ + end + end + end + end + para.gsub!(/\{\s+(\S+\.(?:png|jpg|gif))\s+/i,'{\1 ') if para =~/\{\s+\S+\.(?:png|jpg|gif).+?\}(?:(?:https?|ftp):\S+|image)/ + tuned_file << para unless para.nil? 
+ end + tuned_file + end + def output_filetypes_in_cmd(cmd_shortcut,source=nil) + #make list of file types in shortcut command (as configured), e.g. when sisu -3 is used + cf_defaults=SiSU_Env::Info_processing_flag.new + cmd_list=case cmd_shortcut.to_s + when /0/; cf_defaults.cf_0 + when /1/; cf_defaults.cf_1 + when /2/; cf_defaults.cf_2 + when /3/; cf_defaults.cf_3 + when /4/; cf_defaults.cf_4 + when /5/; cf_defaults.cf_5 + end + file_type_names=[] + file_type_names <<= if cmd_list =~ /y/; 'sisu_manifest.html' + end + file_type_names <<= if cmd_list =~ /h/; ['toc.html', 'doc.html'] + end + file_type_names <<= if cmd_list =~ /p/; ['landscape.pdf', 'portrait.pdf'] + end + file_type_names <<= if cmd_list =~ /o/; 'opendocument.odt' + end + file_type_names <<= if cmd_list =~ /b/; 'scroll.xhtml' + end + file_type_names <<= if cmd_list =~ /x/; 'sax.xml' + end + file_type_names <<= if cmd_list =~ /X/; 'dom.xml' + end + file_type_names <<= if cmd_list =~ /a/; 'plain.txt' + end + file_type_names <<= if cmd_list =~ /g/; 'wiki.txt' + end + file_type_names <<= if cmd_list =~ /w/; 'concordance.html' + end + file_type_names <<= if cmd_list =~ /N/; 'digest.txt' + end + file_type_names <<= if source and cmd_shortcut =~ /s/; source + end + file_type_names <<= if cmd_shortcut =~ /S/; 'sisupod.zip' + end + file_type_names=file_type_names.flatten + end + def substitutions_and_insertions?(data) + tuned_file=[] + if data[0] =~ /^#!\s*(?:\/usr\/bin\/env sisu|\/usr\/bin\/sisu)/ # remove bang from top #! 
(however file is stripped, so will be removed provided no content preceeds it) + data[0].gsub!(/^#!\s*\/usr\/bin\/sisu/,'') + data[0].gsub!(/^#!\s*\/usr\/bin\/env sisu/,'') + end + if data[0] =~ /^(SiSU\s+[\d.]*|sisu-[\d.]+)$/ # SiSU identifier + data[0].gsub!(/^(SiSU\s*[\d.]*)$/,'% \1') + data[0].gsub!(/^(sisu-[\d.]+)$/,'% \1') + end + data.each do |para| + para=if @md.markup_version.to_f >= 0.38 + SiSU_document_structure::Structure.new(@md,para).structure_markup_normalize + else para + end + #para.gsub!(/<url:(\S+?)>/,'\1') #consider, would permit use of text hyperlinks if desired, dal_syntax more appropriate? + para.gsub!(/^((?:[1-9]|:?[A-C])~\S*)\s*$/,'\1~ [Note: heading marker::required title missing]~#') #conditional header for incorporated document 2004w12 + if para =~/^@\S+?:/ + para.gsub!(/^@(\S+?):\s+/,'0~\1 ') + para.gsub!(/^@(\S+?):([+-])\s+/,'0~\1\2 ') + end + if para !~/^%+\s/ and + para =~/^(?:_\*\s+)?\{(?:~\^\s+)?(.+?)\s\[(?:\d(?:[sS]+))\]\}(?:\.\.\/\S+?\/|\S+?\.(?:sst|ssm)\b)(?:\s+~\{.+?\}~)?(?:\s+\*~\S+)*\s*$/ + txt,cmd,source,url_dir,note,manifest=nil,nil,nil,nil,nil,nil + url_and_stub=SiSU_Env::Info_env.new.url + if defined? 
url_and_stub.remote + @output_url="#{url_and_stub.remote}" + if para =~/\{(.+?)\s\[(\d[sS]*)\]\}((\S+?)\.ss[tm])(\s+~\{.+?\}~)?/ + #syntax e.g.: { "Sphinx or Robot", Leena Krohn [3sS]}sphinx_or_robot.leena_krohn.1996.sst + txt,cmd,source,url_dir,note=$1,$2,$3,$4,$5 + elsif para =~/\{(.+?)\s\[(\d[sS]*)\]\}\.\.\/(\S+?)\/(\s+~\{.+?\}~)?/ + #syntax e.g.: { "Sphinx or Robot", Leena Krohn [3sS]}../sphinx_or_robot.leena_krohn.1996/ + txt,cmd,url_dir,note=$1,$2,$3,$4 + end + manifest="{#{txt} }#@output_url/#{url_dir}/toc.html#{note}\n\n" + else + puts "error, does currently support relative paths (reltive paths were removed, as had problems for citation, and was not suited to all output types should possibly reconsider) #{__FILE__} #{__LINE__}" + if para =~/\{(?:~\^\s+)?(.+?)\s\[(\d[sS]*)\]\}\.\.\/(\S+?)\/(\s+~\{.+?\}~)?/ + txt,cmd,url_dir,note=$1,$2,$3,$4 + manifest="{ #{txt} }../#{url_dir}/toc.html#{note}\n\n" + end + end + tuned_file << manifest + output_filetypes_in_cmd(cmd,source).each do |o_f| + describe = case o_f + when /sisu_manifest.html/; '~^ document manifest' + when /toc.html/; ' html, segmented text' + when /doc.html/; ' html, scroll, document in one' + when /landscape.pdf/; ' pdf, landscape' + when /portrait.pdf/; ' pdf, portrait' + when /opendocument.odt/; ' open document' + when /scroll.xhtml/; ' xhtml scroll' + when /sax.xml/; ' xml, sax' + when /dom.xml/; ' xml, dom' + when /plain.txt/; ' plain text utf-8' + when /wiki.txt/; ' wiki text' + when /concordance.html/; ' concordance' + when /digest.txt/; ' dcc, document content certificate (digests)' + when /#{source}/; ' markup source text' + when /sisupod.zip/; ' zipped markup source pod' + else nil + end + if describe + if @output_url + tuned_file << "_1 {#{describe} }#@output_url/#{url_dir}/#{o_f}\n\n" if describe + else + tuned_file << "_1 { #{describe} }../#{url_dir}/#{o_f}\n\n" + end + end + end + elsif para =~/<:insert\d+!?>/ and para !~/^%\s+/ + @skin.select + ins=SiSU_Viz::Inserts.new + case para + 
when /^\s*<:insert1>\s*$/ + para=[] + ins.insert1.split(/\n\n/).each{|x| para << x } + when /^\s*<:insert2>\s*$/ + para=[] + ins.insert2.split(/\n\n/).each{|x| para << x } + when /^\s*<:insert3>\s*$/ + para=[] + ins.insert3.split(/\n\n/).each{|x| para << x << "\n"} + para=ins.insert3 + when /^\s*<:insert4>\s*$/ + para=[] + ins.insert4.split(/\n\n/).each{|x| para << x << "\n"} + para=ins.insert4 + when /^\s*<:insert5>\s*$/ + para=[] + ins.insert5.split(/\n\n/).each{|x| para << x << "\n"} + when /^\s*<:insert6>\s*$/ + para=[] + ins.insert6.split(/\n\n/).each{|x| para << x << "\n"} + when /^\s*<:insert7>\s*$/ + para=[] + ins.insert7.split(/\n\n/).each{|x| para << x << "\n"} + end + para.each{|x| tuned_file << x } + else tuned_file << para + end + tuned_file.flatten! + tuned_file.compact! + end + tuned_file + end + def numbering_song(data) + data=number_plaintext_para(data) + data=name_endnote_seg(data) #tr issue + data=auto_number_heading_ie_title(data) #tr issue + data=ocn(data) unless @md.markup =~/not_to/ + data=minor_numbering(data) #unless @md.markup =~/not_to/ + data=name_para_seg_filename(data) + data=set_heading_seg(data) unless @md.set_heading_seg + data=set_heading_top(data) unless @md.set_heading_top + data=set_header_title(data) unless @md.set_header_title + data + end + def number_plaintext_para(data) + @tuned_file=[] + data.each do |para| + para.gsub!(/(^|[^<][^v][^>])\n/,'\1 ') #messy, but idea is that tables should retain breaks + para.gsub!(/^/,"\n") unless para =~/¡/ + para.gsub!(/^\s+|\s$/,"\n") + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + def name_endnote_seg(data) + @tuned_file=[] + data.each do |para| + para.gsub!(/<:3>\s*<:ee>/, <<-WOK +#{@@endnote['special_align']} <p /><br />\r +#{@@endnote['seg_name_3']} <p /> +#{@@endnote['special_align_close']} + WOK + ) + para.gsub!(/<:2>\s*<:ee>/, <<-WOK +#{@@endnote['special_align']} <p /><br />\r +#{@@endnote['seg_name_2']} <p /> +#{@@endnote['special_align_close']} + WOK + ) + 
para.gsub!(/<:1>\s*<:ee>/, <<-WOK +#{@@endnote['special_align']} <p /><br />\r +#{@@endnote['seg_name_1']} <p /> +#{@@endnote['special_align_close']} + WOK + ) + @tuned_file << para + end + # debug 2003w46 adding revision control info + if @md.flag_auto_endnotes and @md.flag_separate_endnotes_make + @tuned_file << "\n4~endnotes Endnotes <~0;0:0;u0>" #prob numbering, revisit + end + @tuned_file << "\n<ENDNOTES>" + @tuned_file=@tuned_file.flatten + end + def owner_details_seg + data << '4~owner.details Owner Details' + end + def number_sub_heading(para,num,title_no) + case para + when /#{num}~- /; para.gsub!(/#{num}~- /,"#{title_no} ") + when /^#{num}~#\s*/; para.gsub!(/^#{num}~#\s*/,"#{title_no} ") + when /^#{num}~[a-z_\.]+ / + para.gsub!(/^#{num}~([a-z_\.]+)\s+(.+)/i,%{#{num}~\\1 #{title_no} \\2 <:name##{title_no}>}) + else para.gsub!(/^#{num}~ /,"#{num}~#{title_no} #{title_no} ") #main + end + if @md.toc_lev_limit and @md.toc_lev_limit < num + para.gsub!(/^[5-8]~(?:~\S+)?\s*/,'!_ ') + end + para + end + def auto_number_heading_ie_title(data) #also does some segment naming + @tuned_file=[] + if @md.markup =~/num_top/ or @md.num_top # watch, 2003w23 + input="#{@md.markup}"[/num_top\=([1-6])/,1] if @md.markup + input||=@md.num_top if @md.num_top !~/^$/ + end + num_top=input.to_i + t_no1=t_no2=t_no3=t_no4=0 + no1=num_top; no2=(num_top + 1); no3=(num_top + 2); no4=(num_top + 3) + t_not=0 + data.each do |para| #@md.seg_names << [additions to segment names] + if (@md.markup =~/num_top/ or (@md.num_top and @md.num_top !~/^$/)) and para !~/^0~/ + if (para =~/^(?:#{no1}|^#{no2}|^#{no3}#{no4})~#/ and para !~/^4~endnotes?/) + t_not+=1 #; t_no2=0; t_no3=0 + para.gsub!(/^(#{no1})~#\s*/,"\\1~ps#{t_not} ") + para.gsub!(/^(#{no2})~#\s*/,"\\1~ps#{t_not} ") + para.gsub!(/^(#{no3})~#\s*/,"\\1~ps#{t_not} ") + para.gsub!(/^(#{no4})~#\s*/,"\\1~ps#{t_not} ") + end + if para =~/#{no1}~/ + @subnumber=1 + @subnumber=0 if para =~/#{no1}~/ + end + if para =~/^[0-6]~[ \w-]/ and para !~ 
/(?:[0-6]~[\w-]+-|4~endnotes|^[0-6]~([a-z_\.]+)\s+[\d.]+)\s/ and para !~/<~#>|<-#>/ + if para =~/^#{no1}~/ + t_no1+=1; t_no2=0; t_no3=0 + title_no="#{t_no1}" + if not @md.seg_names.nil? and not @md.seg_names.include?(title_no) + para.gsub!(/^#{no1}~\s+(\S+)#/,"#{no1}~#{title_no} \\1 #{title_no} ") #shift placement of auto-number to after first word, e.g. Article # not # Article, added on occasion of ABF (20040329) + para.gsub!(/^#{no1}\{\s+(Article|Clause|Section)\s+#/i,%{#{no1}~#{title_no} \\1 #{title_no}. }) + unless para =~/^#{no1}~\s+[\d.]+\s/ #fix -> if the title starts with a numbering scheme, do not auto-number, review + para.gsub!(/^#{no1}~\s+/,"#{no1}~#{title_no} #{title_no}. ") + end + @md.seg_names << title_no + #else puts "warning segment name #{title_no} already exists" + end + unless para =~/^#{no1}~([a-z_\.]+)\s+[A-Z]\.?\s/ #bug -> tmp fix, excludes A. B. C. lettering, but not roman numerals, is arbitrary, review required + para.gsub!(/^#{no1}~([a-z_\.]+)\s+(.+)/i,%{#{no1}~\\1 #{title_no}. \\2 <:name##{title_no}>}) + end + para.gsub!(/^#{no1}~#\s*/,"#{title_no}. ") + end + if para =~/^#{no2}~/ + t_no2+=1; t_no3=0 + title_no="#{t_no1}.#{t_no2}" + para=number_sub_heading(para,no2,title_no) + end + if para =~/^#{no3}~/ + t_no3+=1 + title_no="#{t_no1}.#{t_no2}.#{t_no3}" + para=number_sub_heading(para,no3,title_no) + end + elsif para =~ /^[0-6]~[\w-]+-/ # endnotes, watch2005 + para.gsub!(/^#{no1}~([a-z_\.]+)- /,"#{no1}~\\1 ") + para.gsub!(/^#{no2}~([a-z_\.]+)- /,"#{no2}~\\1 ") + para.gsub!(/^#{no3}~([a-z_\.]+)- /,"#{no3}~\\1 ") + end + elsif @md.markup =~/num_extract/ #AS DANGEROUS force enable with document, note already does this type of numbering for cisg, locate and coordinate logic, is currently misplaced in code, chengwei inspired 2004w23/4 + unless para =~ /^[0-6]~\S+/ #endnotes watch? 
+ if para =~/^[1-6]~\s+([\d\.]+)/ #risky (must be unique) consider output to 4~~\d instead of 4~\d + name_num=$1 + para.gsub!(/^([1-6]~)\s+/,"\\1#{name_num} ") + end + end + if @md.toc_lev_limit + end + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + def ocn(data) #and auto segment numbering increment + @tuned_file=[] + object_array=SiSU_document_structure::OCN.new(@md,data).ocn + object_array.each do |o| + @tuned_file <<= if o.ocn; "#{o.txt} <~#{o.ocn};#{o.lv};#{o.type}>" + else o.txt + end + end + @tuned_file=@tuned_file.flatten + end + def minor_numbering(data) #and auto segment numbering increment + @tuned_file=[] + number_small,letter_small=0,0 + letter=%w( a b c d e f g h i j k l m n o p q r s t u v w x y z ) + data.each do |para| + if para =~/\w|\S|<|\(/ + if para !~/^%% |^0~|^4~endnotes|^<\/center>|<:ee>|<:e[:_]>|^\^~ |<:e[:_]\d+?>|^<:p[bn]>|^<:\#|<:- |<[:!]!4|^(?:alt|code|group|poem|table)\{|^\}(?:alt|code|group|poem|table)|^\}table$|<table|<\/table>|<td|<\/td>|<th|<\/th>|<tr>|<\/tr>|<hr width|<:4-endnotes>|\[endnotes\]|<:zz>|<:isbn-|<:journal-|<:conference-|<ENDNOTES?>/i #ocn here # added with Tune.code #¡ + if para=~/^[1-8]~/; number_small,letter_small=0,0 #% sub-number system, (baby numbering) reset with any change of major number (more obviously should be placed in number titles, but that is conditionally executed, check and move later) + end + if para =~/^#[ 1]/ + letter_small=0 + number_small=0 if para =~ /^#1/ + number_small+=1 + para.gsub!(/^#[ 1]/,"#{number_small}. ") #change 2004 + end + if para =~/^_# / + para.gsub!(/^_# /,"<:i1> #{letter[letter_small]}. 
") #change 2004 + letter_small+=1 + end + end + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + def name_para_seg_filename(data) + # paragraph name/numbering rules + # manual naming overrides, manual naming may be + # alpha-numeric characters mixed, + # numeric only (a number), if + # all segments have been named, + # the numbers used are over 1000 or + # it is not minded that auto-numbering uses a funny scheme for naming segments (not yet implemented) + # [for now a warning is printed for such documents on use of maintenance or very-verbose flag] + # auto-naming takes the form of giving numbers to segments + # the rules for which are as follows + # if the title/heading text starts with a numeric, then that is used (1 3.1 3rd etc.) + # otherwise the level 4 segment number from the embedded document structure info is used + # if there is none a sequential number is designated, preceded by an underscore + @tuned_file=[] + art_filename_auto=1 + @counter=1 + @unique_auto_name=[] + puts 'manual segment names, numbers used as names, risk warning (segmented html)' if not @md.seg_autoname_safe and @md.cmd =~/[MV]/ + data.each do |para| + para=SiSU_document_structure::Structure.new(@md,para).structure_markup + if para !~/^0~/ + if para =~/^[456]~ / + if para=~/^4/ and not @md.set_heading_seg + @md.set_heading_seg=true + end + if para =~/^[456]~(?:\s\S+)?\s+([\d.,:-]+)/m #heading starts with a recognised numeric or word followed by a recognised numerical construct, use that as name + pattern=$1 + pattern.gsub!(/(?:[:,-]|\W)/,'.') + pattern.gsub!(/\.$/,'') + if not @md.seg_names.nil? 
and not @md.seg_names.include?(pattern) + para.gsub!(/^([456])~\s*/,"\\1~#{pattern} ") + @md.seg_names << pattern + else puts 'warn, there may be a conflicting numbering scheme' if @md.cmd =~/[VM]/ + end + end + if para =~/^4~\s.+?;4:(\d+);/m #extract segment name from embedded document structure info + pattern=$1 + pattern.gsub!(/(?:[:,-]|\W)/,'.') + pattern.gsub!(/\.$/,'') + if not @md.seg_names.nil? and not @md.seg_names.include?(pattern) + para.gsub!(/^(4)~\s*/,"\\1~#{pattern} ") + @md.seg_names << pattern + else + para.gsub!(/^(4)~\s*/,"\\1~~#{pattern} ") + @md.seg_names << "~#{pattern}" + end + end + if para =~/^4~\s+/ #if still not segment name, provide a numerical one + if not @md.seg_names.nil? and not @md.seg_names.include?(art_filename_auto) + para.gsub!(/^4~\s+/,%{4~_#{art_filename_auto} }) + @md.seg_names << art_filename_auto + else puts 'segment name (numbering) error' + end + art_filename_auto+=1 + end + end + end + @tuned_file << if para =~/^([1-6])~/m and (@md.pagenew or @md.pagebreak); m=$1 #watch ref~ + para_tmp=[] + if @md.pagenew.to_s =~/#{m}/; para_tmp << "<:pn>\n" << para + end + if @md.pagebreak.to_s =~/#{m}/; para_tmp << "<:pb>\n" << para + end + para_result=unless para_tmp.length > 0; para + else para_tmp + end + else para + end + end + if @md.seg_names.length > 0 + @md.set_heading_seg=true + end + @tuned_file=@tuned_file.flatten + end + def set_heading_top(data) #% make sure no false positives + unless @md.set_heading_top + puts "\tdocument contains no top level heading, (will have to manufacture one)" if @md.cmd =~/[MV]/ + @tuned_file=[] + data.each do |para| + unless @md.set_heading_top + if para !~/^(?:@\S+:|0~\S+)\s/m and para !~/\A\s*\Z/m + @md.set_heading_top=true + head=if @md.title ; "1~ #{@md.title}" + else '1~ [no title provided]' + end + @tuned_file << head + end + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + end + def set_heading_seg(data) #% make sure no false positives + unless 
@md.set_heading_seg + puts "\tdocument contains no segment level, (will have to manufacture one)" if @md.cmd =~/[MV]/ + @tuned_file=[] + data.each do |para| + unless @md.set_heading_seg + if para !~/^(?:@\S+:|0~\S+|[123]~)/m and para !~/\A\s*\Z/m and para !~/<:p[bn]>/ + @md.set_heading_seg=true + head=if @md.title ; "4~seg [#{@md.title}]" + else '4~seg [segment]' + end + @tuned_file << head + end + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + end + def set_header_title(data) #% make sure no false positives + unless @md.set_header_title + puts "\t no document title provided, (will have to manufacture one)" if @md.cmd =~/[MV]/ + @tuned_file=[] + data.each do |para| + unless @md.set_header_title + if para !~/^%{1,2}\s/m and para !~/\A\s*\Z/m + @tuned_file << "0~title #{@md.heading_seg_first}" + @md.title=@md.heading_seg_first + @md.set_header_title=true + end + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + end + def endnotes(data) + @tuned_file=[] + endnote_no,endnote_ref=1,1 + #% endnote work zone + data.each do |para| + # manually numbered endnotes <!e(\d)!> <!e_(\d)!> --> + if @md.mod.inspect =~/--no-asterisk|--no-annotate/ + para.gsub!(/~\[[*]\s.+?\]~/,'') + end + if @md.mod.inspect =~/--no-dagger|--no-annotate/ + para.gsub!(/~\[[+]\s.+?\]~/,'') + end + case para + # auto-numbered endnotes <!e!> <!e_!> --> + when /~\{\s+.+?\}~|~\[[*+]\s+.+?\]~/ + para.gsub!(/\s*(\}~|\]~)/,' \1') # required 2003w31 + word_mode=para.scan(/\S+/) + word_mode=endnote_call_number(word_mode) + para=word_mode.join(' ') + endnote_ref+=1 + when /~\^(?:\s|$)|<:e>/ + #%Note inserts endnotes previously gathered from /^(<!e[:_]!>|[-~]\{{3})/ (in earlier loop) + word_mode=para.scan(/\S+/) + word_mode=endnote_call_number(word_mode) + para=word_mode.join(' ') + endnote_ref+=1 + end + @tuned_file << para + end + @tuned_file=@tuned_file.flatten + end + def endnote_call_number(data) + data.each do |word| + case word + when /~\{/ + unless word 
=~/~\{[*+]+/ + word.gsub!(/~\{/,"~\{#{@@endnote_counter} ") + @@endnote_counter+=1 + end + when /~\[/ + if word =~/~\[[+]/ + word.gsub!(/~\[[+]/,"~\[\+#{@@endnote_counter_dag} ") + @@endnote_counter_dag+=1 + else + word.gsub!(/~\[[*]?/,"~\[\*#{@@endnote_counter_asterisk} ") + @@endnote_counter_asterisk+=1 + end + when /~\^|<:e>/ + word.gsub!(/~\^|<:e>/,"#{@@endnote_array[@@endnote_counter-1]}") + @@endnote_counter+=1 + end + end + end + def metadata(data) + meta,@dc,@rc,@cvs,dctitle,add=Array.new(6){[]} + dir=SiSU_Env::Info_env.new(@md.fns) + base_html="#{dir.url.root}/#{@md.fnb}" + ocnm=ocnd=ocnv=0 + ocnm+=1 + header0='<:pn>' + header1="\n1~ Document Information <~0;0:0;m#{ocnm}>" + ocnm+=1 + header4="\n4~metadata MetaData <~0;m#{ocnm};m#{ocnm}>" + ocnm+=1; ocnd+=1 + head_no_dc="<~0;m#{ocnm};d#{ocnd}>" + ocnm+=1; ocnd+=1 + head_no_dc_tag="<~0;m#{ocnm};d#{ocnd}>" + data.each do |para| + case para + when /^0~(title|creator|author|translator|translated_by|illustrator|illustrated_by|prepared_by|digitized_by|description|publisher|contributor|date\.created|date\.issued|date\.available|date\.valid|date\.modified|date|type|format|rights|identifier|source|language)/i + m=$1 + ocnm+=1; ocnd+=1 + @dc << case para + when /^0~title/ + "\n#{@tr.dc_title}: <u>#{@md.dc_title}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~(?:creator|author)/ + "\n#{@tr.creator}: <u>#{@md.dc_creator}</u> <~0;m#{ocnm};d#{ocnd}>" + when /0~(?:translator|translated_by)/ + "\n#{@tr.translator}: <u>#{@md.translator}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~(?:illustrator|illustrated_by)/ + "\n#{@tr.illustrator}: <u>#{@md.illustrator}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~prepared_by/ + "\n#{@tr.prepared_by}: <u>#{@md.prepared_by}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~digitized_by/ + "\n#{@tr.digitized_by}: <u>#{@md.digitized_by}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~description/ + "\n#{@tr.description}: <u>#{@md.dc_description}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~subject/ + "\n#{@tr.subject}: 
<u>#{@md.dc_subject}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~abstract/ + "\n#{@tr.abstract}: <u>#{@md.dc_abstract}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~publisher/ + "\n#{@tr.publisher}: <u>#{@md.dc_publisher}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~contributor/ + "\n#{@tr.contributor}: <u>#{@md.dc_contributor}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date.created/ + "\n#{@tr.date_created}: <u>#{@md.dc_date_created}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date.issued/ + "\n#{@tr.date_issued}: <u>#{@md.dc_date_issued}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date.available/ + "\n#{@tr.date_available}: <u>#{@md.dc_date_available}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date.modified/ + "\n#{@tr.date_modified}: <u>#{@md.dc_date_modified}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date.valid/ + "\n#{@tr.date_valid}: <u>#{@md.dc_date_valid}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~date/ + "\n#{@tr.date}: <u>#{@md.dc_date}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~type/ + "\n#{@tr.type}: <u>#{@md.dc_type}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~format/ + "\n#{@tr.format}: <u>#{@md.dc_format}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~rights/ + "\n#{@tr.rights}: <u>#{@md.dc_rights}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~identifier/ + "\n#{@tr.identifier}: <u>#{@md.dc_identifier}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~source/ + "\n#{@tr.source}: <u>#{@md.dc_source}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~language/ + "\n#{@tr.language}: <u>#{@md.dc_language}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~language.original/ + "\n#{@tr.language_original}: <u>#{@md.language_original}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~relation/ + "\n#{@tr.relation}: <u>#{@md.dc_relation}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~coverage/ + "\n#{@tr.coverage}: <u>#{@md.dc_coverage}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~keywords/ + "\n#{@tr.keywords}: <u>#{@md.keywords}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~comments/ + "\n#{@tr.comments}: <u>#{@md.comments}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~cls_loc/ + 
"\n#{@cls_dewey}: <u>#{@md.cls_dewey}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~cls_dewey/ + "\n#{@tr.cls_dewey}: <u>#{@md.cls_dewey}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~cls_gutenberg|0~cls_pg/ + "\n#{@tr.cls_gutenberg}: <u>#{@md.cls_gutenberg}</u> <~0;m#{ocnm};d#{ocnd}>" + #"\n#{@tr.cls_gutenberg}: <u>#{@md.cls_pg}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~cls_isbn/ + "\n#{@tr.cls_isbn}: <u>#{@md.cls_isbn}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~prefix(?:_a)?/ + "\n#{@tr.prefix_a}: <u>#{@md.prefix_a}</u> <~0;m#{ocnm};d#{ocnd}>" + when /^0~prefix_b/ + "\n#{@tr.prefix_b}: <u>#{@md.prefix_b}</u> <~0;m#{ocnm};d#{ocnd}>" + else para.gsub(/^0~(#{m})\s+(.+)/m,"\n#{m.capitalize}: <u>\\2</u> <~0;m#{ocnm};d#{ocnd}>") + end + end + end + ocnm+=1; ocnv+=1 + head_no_rc="<~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + head_no_rc_tag="<~0;m#{ocnm};v#{ocnv}>" + data.each do |para| + case para + when /^0~(?:cvs|rcs)\+\s+/ #note the + sign to turn on use of cvs id + ocnm+=1; ocnv+=1 + @cvs << "#{@tr.sc_number}: <u>#{@md.sc_number}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + @cvs << "#{@tr.sc_date}: <u>#{@md.sc_date}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + @cvs << "CVS/RCS time: <u>#{@md.sc_time}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + when /^0~cvs[+\s]/ #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP + when /^0~cvs\s+/ #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP + end + end + if true #default version information + ocnm+=1; ocnv+=1 + if @md.sc_filename and @md.sc_filename.length > 3 + @rc << "#{@tr.sourcefile}: <u>#{@md.sc_filename}</u> <~0;m#{ocnm};v#{ocnv}>" + else @rc << "#{@tr.sourcefile}: <u>#{@md.fns}</u> <~0;m#{ocnm};v#{ocnv}>" + end + ocnm+=1; ocnv+=1 + if @md.file_encoding and @md.file_encoding.length > 3 #translate + @rc << "Filetype: <u>#{@md.file_encoding}</u> <~0;m#{ocnm};v#{ocnv}>" + end + ocnm+=1; ocnv+=1 + if @md.dgst #change. 
enable by default + @rc << "#{@tr.sourcefile_digest}, #{@md.dgst[0]} <u>#{@md.dgst[1]}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + end + if @md.dgst_skin #change. enable by default + @rc << "Skin_Digest: #{@md.dgst_skin[0]} <u>#{@md.dgst_skin[1]}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + end + @rc << "<b>Generated</b> #{head_no_rc}" if @rc.length > 0 + @rc << "#{@tr.last_generated}: <u>#{Time.now}</u> <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + if @md.sisu_version[:version] + @rc << "#{@tr.sisu_version}: <u>#{@md.sisu_version[:project]}</u> <u>#{@md.sisu_version[:version]}</u> of #{@md.sisu_version[:date_stamp]} (#{@md.sisu_version[:date]}) <~0;m#{ocnm};v#{ocnv}>" + ocnm+=1; ocnv+=1 + end + @rc << "#{@tr.ruby_version}: <u> #{@md.ruby_version}</u> <~0;m#{ocnm};v#{ocnv}>" + end + meta << header0 + meta << header1 + meta << header4 + meta << "Document Manifest @\n #{base_html}/#{@md.fn[:manifest]} <~0;m#{ocnm};m#{ocnm}>" + meta << "<b>Dublin Core</b> (DC) #{head_no_dc}" if @dc.length > 0 + meta << "<i>DC tags included with this document are provided here.</i> #{head_no_dc_tag}" if @dc.length > 0 + @dc.each { |x| meta << x } + meta << "<b>Version Information</b> #{head_no_rc}" if @rc.length > 0 + if @cvs.length > 0 + meta << "<i>Note the version information provided here, is specific to the host site.</i> #{head_no_rc_tag}" + @cvs.each { |x| meta << x } + end + @rc.each { |x| meta << x } + ## ENDNOTE RELATED endnote related + meta << "\n<EOF>" + meta=object_digest(meta) + end + def stamped(para,hash_class) + @tuned=[] + para=strip_clean_extra_spaces(para) + digest_all=hash_class.hexdigest(para) # print "#{hash_class.name}: "; puts digest_all #length==32 or 64 + stripped=strip_clean_of_markup(para) + digest_strip=hash_class.hexdigest(stripped) + case para + when /~\{[\d*+]+\s+.+?\}~|~\[[*+]\d+\s+.+?\]~/ + en_and_para,en_and_para_digest=[],[] + para.gsub!(/\s*(\}~|\]~)/,' \1') #watch + para_plus_en=para.scan(/.*?~\{.+?\}~|.*?~\[.+?\]~/) + para_tail=if para 
=~/(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+([\s\S]+)/ + /(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+(.+?<~\d+;(?:\w|[0-6]:)\d+;\w\d+>)/.match(para)[1] + else '' + end + para_plus_en << para_tail + en_and_para_digest << endnote_digest(para_plus_en) + para_new=en_and_para_digest.join(' ') + @tuned << para_new + '<' + digest_strip + ':' + digest_all + '>' unless para.nil? + else @tuned << para + '<' + digest_strip + ':' + digest_all + '>' unless para.nil? + end + @tuned.join + end + def object_digest(data) + # 1. clean/stripped text without any markup, paragraph, headings etc. without endnotes + # 2. endnotes clean/stripped text digest only (there may be several endnotes within a paragraph) + # 3. whole object, text with markup and any endnotes, (question: with or without the endnote digests??? presumption better without, [however may be easier to check with?]) + # [digests should not include other digests] + # vim==/<[0-9a-f]\{#{@@dl}\}\(:[0-9a-f]\{#{@@dl}\}\)\?>/ + require 'digest/md5' + require 'digest/sha2' + @tuned_file=[] + data.compact! + data.each do |para| + para.strip! + if para=~/<~\d+;(?:\w|[0-6]:)\d+;\w\d+>/ + if @env.digest.type =~/sha256/ + for hash_class in [ Digest::SHA256 ] + @tuned_file << stamped(para,hash_class) + end + else + for hash_class in [ Digest::MD5 ] + @tuned_file << stamped(para,hash_class) + end + end + else @tuned_file << para unless para.nil? 
+ end + end + @tuned_file=@tuned_file.flatten + #use md5 or to create hash of each dal object including ocn, & add into to each dal object + end + def endnote_digest(data) + para_bit=[] + data.each do |en_plus| + para_bit <<= case en_plus + when /~\{|~\[/ + if en_plus =~/~\{.+?\}~|~\[.+?\]~/ + para_txt,en_open,en_txt,en_close=/(.*?)(~\{|~\[)(.+?)(\}~|\]~)/m.match(en_plus)[1..4] + stripped_en=strip_clean_of_markup(en_txt) + if @env.digest.type =~/sha256/ + digest_en_strip=Digest::SHA256.hexdigest(stripped_en) + else + digest_en_strip=Digest::MD5.hexdigest(stripped_en) + end + para_txt + en_open + en_txt + '<' + digest_en_strip + '>' + en_close + else puts "Error Exception - problem encountered with:\n#{en_plus}" #arbitrary exception, tidy up + end + else en_plus + end + end + para_bit.join + end + def strip_clean_extra_spaces(s) # dal output tuned + s=s.dup + s=s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1') + s=s.gsub(/ [ ]+/,' ') + s=s.gsub(/^ [ ]+/,'') + s=s.gsub(/ [ ]+$/,'') + s=s.gsub(/(<\/[bi]>')[ ]+(s )/,'\1\2') + end + def strip_clean_of_markup(s) # used for digest, define rules, make same as in db clean + #consider: <\/?[ib]>|<(?:\/ )?br>|<del>(.+?)<\/del> + s=s.dup + s=s.gsub(/(?:<\/?[ib]>|<~\d+;(?:\w|[0-6]:)\d+;\w\d+>|<#@dp:#@dp>|^[1-6]~\S+|~\{\d+\s.+?\}~)/,'') # markup and endnotes removed + #% same as db clean --> + s=s.gsub(/<del>(.+?)<\/del>/,'DELETED(\1)') # deletions + s=s.gsub(/<sup>(\d+)<\/sup>/,'[\1]') + s=s.gsub(/(?: \\;)+/,' ') + #s=s.gsub(/<!T[h]?¡.+?!>/,"[TABLE]\n") # tables + #s=s.gsub(/<!¡¡\d+(.+?)!>/,'\1') # tables + #s=s.gsub(/¡¡\d+¡/,' ') # tables + #s=s.gsub(/¡/,' ') # tables tidy later + #s=s.gsub(/<.+?>/,'') + s=s.gsub(/\{.+?\.(?:png|jpg|gif).+?\}(?:https?|ftp)\\\:\S+ /,' [image] ') # else image names found in search + s=s.gsub(/\s\s+/,' ') + s=s.strip + end + end +end +__END__ +dal output, rules to simplify parsing +nodes === objects === paragraphs === text blocks separated by \n\n + +dal output: +:verse :group and :code have -end +:table is 
not used |