# coding: utf-8 =begin * Name: SiSU * Description: a framework for document structuring, publishing and search * Author: Ralph Amissah * Copyright: (C) 1997 - 2009 Ralph Amissah All Rights Reserved. * License: GPL 3 or later: SiSU, a framework for document structuring, publishing and search Copyright (C) Ralph Amissah This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . If you have Internet connection, the latest version of the GPL should be available at these locations: * SiSU uses: * Standard SiSU markup syntax, * Standard SiSU meta-markup syntax, and the * Standard SiSU object citation numbering and system * Hompages: * Download: * Ralph Amissah ** Description: system environment, resource control and configuration details =end module SiSU_hash class Object_digest def initialize(md,data,env=nil) @md,@data,@env=md,data,env @env ||=SiSU_Env::Info_env.new(@md.fns) end def object_digest # 1. clean/stripped text without any markup, paragraph, headings etc. without endnotes # 2. endnotes clean/stripped text digest only (there may be several endnotes within a paragraph) # 3. whole object, text with markup and any endnotes, (question: with or without the endnote digests??? presumption better without, [however may be easier to check with?]) # [digests should not include other digests] # vim==/<[0-9a-f]\{#{@@dl}\}\(:[0-9a-f]\{#{@@dl}\}\)\?>/ require 'digest/md5' require 'digest/sha2' data=@data @tuned_file=[] data.compact! data.each do |para| para.strip! if para=~/#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}/ \ and para !~/#{Rx[:meta]}/ #test should not be necessary remove if @env.digest.type =~/sha256/ for hash_class in [ Digest::SHA256 ] @tuned_file << stamped(para,hash_class) end else for hash_class in [ Digest::MD5 ] @tuned_file << stamped(para,hash_class) end end else @tuned_file << para unless para.nil? end end @tuned_file=@tuned_file.flatten #use md5 or to create hash of each dal object including ocn, & add into to each dal object end def endnote_digest(data) para_bit=[] data.each do |en_plus| para_bit <<= case en_plus when /#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/ if en_plus =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/ para_txt,en_open,en_txt,en_close=/(.*?)(#{Mx[:en_a_o]}|#{Mx[:en_b_o]})(.+?)(#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/m.match(en_plus)[1..4] stripped_en=strip_clean_of_markup(en_txt) digest_en_strip=if @env.digest.type =~/sha256/ Digest::SHA256.hexdigest(stripped_en) else Digest::MD5.hexdigest(stripped_en) end para_txt + en_open + en_txt + Mx[:id_o] + digest_en_strip + Mx[:id_c] + en_close else puts "Error Exception - problem encountered with:\n#{en_plus}" #arbitrary exception, tidy up end else en_plus end end para_bit.join end def stamped(para,hash_class) @tuned=[] para=strip_clean_extra_spaces(para) digest_all=hash_class.hexdigest(para) # print "#{hash_class.name}: "; puts digest_all #length==32 or 64 stripped=strip_clean_of_markup(para) digest_strip=hash_class.hexdigest(stripped) unless para =~/#{Mx[:fa_o]}code#{Mx[:fa_c]}/ case para when /#{Mx[:en_a_o]}[\d*+]+\s+.+?#{Mx[:en_a_c]}|#{Mx[:en_b_o]}[*+]\d+\s+.+?#{Mx[:en_b_c]}/m en_and_para,en_and_para_digest=[],[] para.gsub!(/\s*(#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/m,' \1') #watch para_plus_en=para.scan(/.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m) para_tail=if para =~/(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+([\s\S]+)/m /(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+(.*?#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]})/m.match(para)[1] #/(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+(.+?#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]})/m.match(para)[1] else '' end para_plus_en << para_tail en_and_para_digest << endnote_digest(para_plus_en) para_new=en_and_para_digest.join(' ') @tuned << para_new + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil? else @tuned << para + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil? end else @tuned << para + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil? end @tuned.join end def strip_clean_extra_spaces(s) # dal output tuned s=s.dup s=s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1') unless s =~/#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/ s=s.gsub(/ [ ]+/,' ') s=s.gsub(/^ [ ]+/,'') s=s.gsub(/ [ ]+$/,'') s=s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2') s=s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2') end def strip_clean_of_markup(s) # used for digest, define rules, make same as in db clean #consider: <\/?[ib]>|<(?:\/ )?br>|(.+?)<\/del> s=s.dup s=s.gsub(/(?:<\/?[ib]>|#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}|#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}|^#{Mx[:lv_o]}[1-6]:\S+?#{Mx[:lv_c]}|#{Mx[:en_a_o]}\d+\s.+?#{Mx[:en_a_c]})/m,'') # markup and endnotes removed #% same as db clean --> s=s.gsub(/(.+?)<\/del>/,'DELETED(\1)') # deletions s=s.gsub(/(\d+)<\/sup>/,'[\1]') s=s.gsub(/(?:#{Mx[:nbsp]})+/,' ') #s=s.gsub(//,"[TABLE]\n") # tables #s=s.gsub(//,'\1') # tables #s=s.gsub(/¡¡\d+¡/,' ') # tables #s=s.gsub(/¡/,' ') # tables tidy later #s=s.gsub(/<.+?>/,'') s=s.gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:https?|file|ftp)\\\:\S+ /,' [image] ') # else image names found in search s=s.gsub(/\s\s+/,' ') s=s.strip end end end __END__