diff options
Diffstat (limited to 'lib/sisu/v0/shared_xml.rb')
-rw-r--r-- | lib/sisu/v0/shared_xml.rb | 66 |
1 files changed, 50 insertions, 16 deletions
diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb index 3c34e67f..41e8c393 100644 --- a/lib/sisu/v0/shared_xml.rb +++ b/lib/sisu/v0/shared_xml.rb @@ -166,6 +166,8 @@ module SiSU_XML_munge #¢£¥§©ª«®°±²³µ¶¹º»¼½¾×÷ ##para.gsub!(//, '&#;') ##para.gsub!(//, '&;') + para.gsub!(/</u, '<') # '<' # < + para.gsub!(/>/u, '>') # '>' # > para.gsub!(/¢/u, '¢') # '¢' # ¢ para.gsub!(/£/u, '£') # '£' # £ para.gsub!(/¥/u, '¥') # '¥' # ¥ @@ -250,10 +252,25 @@ module SiSU_XML_munge para.gsub!(/ü/u, 'ý') # 'ü' # ý para.gsub!(/þ/u, 'þ') # 'þ' # þ para.gsub!(/ÿ/u, 'ÿ') # 'ÿ' # ÿ + para.gsub!(/‘/u, '‘') # '‘' # ‘ + para.gsub!(/’/u, '’') # '’' # ’ + para.gsub!(/“/u, '“') # “ # “ + para.gsub!(/”/u, '”') # ” # ” + para.gsub!(/–/u, '–') # – # – + para.gsub!(/—/u, '—') # — # — + para.gsub!(/∝/u, '∝') # ∝ # ∝ + para.gsub!(/∞/u, '∞') # ∞ # ∞ + para.gsub!(/™/u, '™') # ™ # ™ + para.gsub!(/✠/u, '✠') # ✗ # ✠ + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify end end def html(para='') if @sys.locale =~/utf-?8/i # instead ucs for utf8 #require 'iconv' ? Iñtërnâtiônàlizætiøn + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify + else para.gsub!(/¢/u, '¢') # ¢ para.gsub!(/£/u, '£') # £ para.gsub!(/¥/u, '¥') # ¥ @@ -338,17 +355,32 @@ module SiSU_XML_munge para.gsub!(/ü/u, 'ü') # ý para.gsub!(/þ/u, 'þ') # þ para.gsub!(/ÿ/u, 'ÿ') # ÿ + para.gsub!(/‘/u, '&#lsquo;') # ‘ # ‘ + para.gsub!(/’/u, '&#rsquo;') # ’ # ’ + para.gsub!(/“/u, '“') # “ # “ + para.gsub!(/”/u, '”') # ” # ” + para.gsub!(/–/u, '–') # – # – + para.gsub!(/—/u, '—') # — # — + para.gsub!(/∝/u, '∝') # ∝ # ∝ + para.gsub!(/∞/u, '∞') # ∞ # ∞ + para.gsub!(/™/u, '™') # ™ # ™ + para.gsub!(/✠/u, '✠') # ✠ + #para.gsub!(/✠/u, '†') # † # † incorrect replacement † + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify end end self end def tidywords(wordlist) wordlist.each do |x| + #imperfect solution will not catch all possible cases x.gsub!(/&/,'&') unless x =~/&\S+;/ + x.gsub!(/&([A-Z])/,'&\1') end end def markup(para='') - wordlist=para.scan(/\S+|\n/) #\n needed for tables, check though added 2005w17 + wordlist=para.scan(/&[#0-9a-z]+;|\S+|\n/) #\n needed for tables, check though added 2005w17 para=tidywords(wordlist).join(' ').strip para.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,'<br />') para.gsub!(/#{Mx[:mk_o]}:name#\S+?#{Mx[:mk_c]}/,'') @@ -377,23 +409,25 @@ module SiSU_XML_munge para.gsub!(/#{Mx[:br_page]}\s*/,'') para.gsub!(/#{Mx[:br_page_new]}\s*/,'') para.gsub!(/#{Mx[:pa_non_object_no_heading]}|#{Mx[:pa_non_object_dummy_heading]}/,''); para.gsub!(/<[-~]#>/,'') - para.gsub!(/(?:^|[^_\\])\{\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?\}(https?:\/\/\S+)/, - %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1" width="\\2" height="\\3" />[\\1] \\4}) - para.gsub!(/(?:^|[^_\\])\{\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?\}(https?:\/\/\S+)/, - %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1"/>\\1}) - para.gsub!(/(^|#{Mx[:gl_c]}|\s)\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?=\s|$))/, + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?#{Mx[:lnk_c]}(https?:\/\/\S+)/, + %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1" width="\\2" height="\\3" />[\\1] \\4}) + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}(https?:\/\/\S+)/, + %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1"/>\\1}) + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?#{Mx[:lnk_c]}image/, + %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1" width="\\2" height="\\3" />[\\1] \\4}) + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}image/, + %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1"/>\\1}) + para.gsub!(/(^|#{Mx[:gl_c]}|\s)#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}(https?:\/\/[^"><]+?)([,.:;"><]?(?=\s|$))/, '\1<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\3">\2</link>\4') #watch, compare html_tune para.gsub!(/(^|#{Mx[:gl_c]}|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?=\s|$))/, %{\\1#{@url_brace.xml_open}<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\\2">\\2</link>#{@url_brace.xml_close}\\3}) para.gsub!(/\b[_\\]((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?:\s|$))/, - '<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\1">\1</link>\2') #escaped urls not linked, deal with later - para.gsub!(/ /,' ') - #para.gsub!(/ /,' ') #clean + '<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\1">\1</link>\2') #escaped urls not linked, deal with later else para.gsub!(/(^|[^}])_</m,'\1<'); para.gsub!(/(^|[^}])_>/m,'\1>') #code-block: angle brackets special characters para.gsub!(/(^|[^}])_</m,'\1<'); para.gsub!(/(^|[^}])_>/m,'\1>') - para.gsub!(/ /,' ') end + para.gsub!(/ |#{Mx[:nbsp]}/m,' ') para end def markup_light(para='') @@ -406,11 +440,11 @@ module SiSU_XML_munge para.gsub!(/<[-~]#>/,'') para.gsub!(/(^|#{Mx[:gl_c]}|\s)&\s+/,'\1& ') #sort para.gsub!(/&([^;]{1,5})/,'&\1') #sort, rough estimate, revisit #WATCH found in node not sax - para.gsub!(/(?:^|[^_\\])\{(\S+?\.(?:png|jpg|gif)) .+?\}(?:(?:https?|file|ftp):\/\/\S+|image)/, + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}(?:(?:https?|file|ftp):\/\/\S+|image)/, "<image.path>#{@dir.url.images_local}\/\\1</image.path>") - para.gsub!(/ /,' ') + para.gsub!(/ |#{Mx[:nbsp]}/,' ') #para.gsub!(/ /,' ') #clean - wordlist=para.scan(/\S+|\n/) #\n needed for tables, check though added 2005w17 + wordlist=para.scan(/&[#0-9a-z]+;|\S+|\n/) #\n needed for tables, check though added 2005w17 para=tidywords(wordlist).join(' ').strip para end @@ -429,11 +463,11 @@ module SiSU_XML_munge para.gsub!(/<[-~]#>/,'') para.gsub!(/(^|#{Mx[:gl_c]}|\s)&\s+/,'\1& ') #sort para.gsub!(/&([^;]{1,5})/,'&\1') #sort, rough estimate, revisit #WATCH found in node not sax - para.gsub!(/(?:^|[^_\\])\{(\S+?\.(?:png|jpg|gif)) .+?\}(?:(?:https?|file|ftp):\/\/\S+|image)/, + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}(?:(?:https?|file|ftp):\/\/\S+|image)/, "<image.path>#{@dir.url.images_local}\/\\1</image.path>") - para.gsub!(/ /,' ') + para.gsub!(/ |#{Mx[:nbsp]}/,' ') #para.gsub!(/ /,' ') #clean - wordlist=para.scan(/\S+|\n/) #\n needed for tables, check though added 2005w17 + wordlist=para.scan(/&[#0-9a-z]+;|\S+|\n/) #\n needed for tables, check though added 2005w17 para=tidywords(wordlist).join(' ').strip para end |