-*- mode: org -*-
#+TITLE:       sisu object munge
#+DESCRIPTION: documents - structuring, various output representations & search
#+FILETAGS:    :sisu:munge:objects:
#+AUTHOR:      Ralph Amissah
#+EMAIL:       [[mailto:ralph.amissah@gmail.com][ralph.amissah@gmail.com]]
#+COPYRIGHT:   Copyright (C) 2015 - 2021 Ralph Amissah
#+LANGUAGE:    en
#+STARTUP:     content hideblocks hidestars noindent entitiespretty
#+OPTIONS:     H:3 num:nil toc:t \n:nil @:t ::t |:t ^:nil _:nil -:t f:t *:t <:t
#+PROPERTY:    header-args  :exports code
#+PROPERTY:    header-args+ :noweb yes
#+PROPERTY:    header-args+ :eval no
#+PROPERTY:    header-args+ :results no
#+PROPERTY:    header-args+ :cache no
#+PROPERTY:    header-args+ :padline no
#+PROPERTY:    header-args+ :mkdirp yes

* object_munge.rb

#+BEGIN_SRC ruby  :tangle "../lib/sisu/object_munge.rb"
# <<sisu_document_header>>
module SiSU_Object_Munge
  # Reduces a string carrying SiSU source markup to plain text.
  #
  # Text-face wrappers matched by the Mx[:srcrgx_*] patterns are dropped while
  # the wrapped text is kept (a purely numeric superscript becomes "[n]"),
  # endnotes are deleted outright, non-breaking spaces become ordinary spaces,
  # break markers become newlines, and escaped glyph codes delimited by
  # Mx[:gl_o] / Mx[:gl_c] are turned back into the literal character.
  #
  # txtobj:: String to clean; it is not modified in place.
  #
  # Returns a new String with leading and trailing whitespace stripped.
  def i_src_o_strip_markup(txtobj)
    txtobj.
      gsub(/#{Mx[:srcrgx_bold_o]}(.+?)#{Mx[:srcrgx_bold_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_italics_o]}(.+?)#{Mx[:srcrgx_italics_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_underscore_o]}(.+?)#{Mx[:srcrgx_underscore_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_cite_o]}(.+?)#{Mx[:srcrgx_cite_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_insert_o]}(.+?)#{Mx[:srcrgx_insert_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_strike_o]}(.+?)#{Mx[:srcrgx_strike_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_superscript_o]}(\d+)#{Mx[:srcrgx_superscript_c]}/m,'[\1]'). # numeric superscript kept visible as [n]
      gsub(/#{Mx[:srcrgx_superscript_o]}(.+?)#{Mx[:srcrgx_superscript_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_subscript_o]}(.+?)#{Mx[:srcrgx_subscript_c]}/m,'\1').
      gsub(/#{Mx[:srcrgx_hilite_o]}(.+?)#{Mx[:srcrgx_hilite_c]}/m,'\1').
      gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
      gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/m,''). # endnote removed
      gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/m,''). # endnote removed
      gsub(/(?:#{Mx[:nbsp]})+/,' ').
      gsub(/(?:#{Mx[:br_nl]})+/,"\n").
      gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
      gsub(/(?:#{Mx[:br_line]})+/,"\n").
      gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
      gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
      gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
      gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
      gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
      gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
      gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
      gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
      gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
      gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
      gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
      gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
      gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
      gsub(/[ ]{2,}/,' '). # collapse runs of spaces (the old /[ ][ ]s+/ swallowed a following "s")
      strip
  end
  # Reduces a string carrying internal (abstract-object) markup to plain text.
  #
  # Text-face wrappers delimited by the Mx[:fa_*] markers are dropped while the
  # wrapped text is kept (a purely numeric superscript becomes "[n]"),
  # endnotes are deleted outright, non-breaking spaces become ordinary spaces,
  # break markers become newlines, and escaped glyph codes delimited by
  # Mx[:gl_o] / Mx[:gl_c] are turned back into the literal character.
  #
  # txtobj:: String to clean; it is not modified in place.
  #
  # Returns a new String with leading and trailing whitespace stripped.
  def i_ao_o_strip_markup(txtobj)
    txtobj.
      gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1').
      gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1').
      gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1').
      gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1').
      gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1').
      gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1').
      gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]'). # numeric superscript kept visible as [n]
      gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1').
      gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1').
      gsub(/#{Mx[:fa_hilite_o]}(.+?)#{Mx[:fa_hilite_c]}/,'\1').
      gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
      gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,''). # endnote removed
      gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,''). # endnote removed
      gsub(/(?:#{Mx[:nbsp]})+/,' ').
      gsub(/(?:#{Mx[:br_nl]})+/,"\n").
      gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
      gsub(/(?:#{Mx[:br_line]})+/,"\n").
      gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
      gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
      gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
      gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
      gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
      gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
      gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
      gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
      gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
      gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
      gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
      gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
      gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
      gsub(/[ ]{2,}/,' '). # collapse runs of spaces (the old /[ ][ ]s+/ swallowed a following "s")
      strip
  end
  # Prepares conversion of internal (abstract-object) markup back to SiSU
  # source markup.
  #
  # Stores txtobj in @txtobj, defines the helper methods textface_marks and
  # object_marks (nested def), and returns self so that a helper can be
  # chained onto the call, e.g. i_ao_o_src_markup_restore(text).textface_marks
  #
  # txtobj:: String holding internal markup.
  def i_ao_o_src_markup_restore(txtobj)
    @txtobj=txtobj
    # Returns @txtobj with Mx[:fa_*] text-face markers rewritten as source
    # markup (*{ }*, /{ }/, _{ }_, "{ }", +{ }+, -{ }-, ^{ }^, ,{ },), a-type
    # endnotes rewritten as ~{ }~, breaks as newlines and escaped glyph codes
    # as literal characters. The stored text itself is left untouched.
    def textface_marks
      @txtobj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'*{\1}*').
        gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'/{\1}/').
        gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'_{\1}_').
        gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'"{\1}"').
        gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'+{\1}+').
        gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'-{\1}-').
        gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'^{[\1]}^').
        gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'^{\1}^').
        gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,',{\1},').
        gsub(/#{Mx[:fa_hilite_o]}(.+?)#{Mx[:fa_hilite_c]}/,'\1').
        gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~').
        gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(.+?)#{Mx[:en_a_c]}/,'~{\1 \2}~'). # note text must be captured for \2 (it used to be a non-capturing group, so the text was lost)
        gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,''). # endnote removed; NOTE(review): b-type notes are dropped, not restored - confirm this is intended
        gsub(/(?:#{Mx[:nbsp]})+/,' ').
        gsub(/(?:#{Mx[:br_nl]})+/,"\n").
        gsub(/(?:#{Mx[:br_paragraph]})+/,"\n").
        gsub(/(?:#{Mx[:br_line]})+/,"\n").
        gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
        gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
        gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
        gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
        gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
        gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
        gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
        gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
        gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
        gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
        gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
        gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
        gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©').
        gsub(/[ ]{2,}/,' '). # collapse runs of spaces (the old /[ ][ ]s+/ swallowed a following "s")
        strip
    end
    # Returns the stored text unchanged.
    def object_marks
      @txtobj
    end
    self
  end
  # Strips markup from either a plain String or a document-structure object.
  #
  # txtobj:: a String, or an object whose class name starts with
  #          "SiSU_AO_DocumentStructure::" and which exposes obj / obj=.
  # markup:: for Strings only, selects the stripper: :ao (default) uses
  #          i_ao_o_strip_markup, :src uses i_src_o_strip_markup.
  #
  # Returns the cleaned String, or the same document-structure object with its
  # obj text cleaned in place. Unsupported input (unknown markup symbol or
  # unrecognised object) is reported with p and returned unchanged.
  def clean_text(txtobj,markup=:ao)
    if txtobj.is_a?(String)
      case markup
      when :ao  then txtobj=i_ao_o_strip_markup(txtobj)
      when :src then txtobj=i_src_o_strip_markup(txtobj)
      else
        # Report where the unknown markup was seen, but keep the text: the
        # diagnostic's return value (a line number) used to be assigned to
        # txtobj and returned in place of the string.
        p __FILE__; p __LINE__
      end
    elsif txtobj.class.inspect=~/^SiSU_AO_DocumentStructure::/
      txtobj.obj=i_ao_o_strip_markup(txtobj.obj)
    else p 'error'
    end
    txtobj
  end
  # Placeholder: the body is empty, so txtobj is ignored and nil is returned.
  def footnotes_inline(txtobj)
  end
  # Placeholder: the body is empty, so txtobj is ignored and nil is returned.
  def footnotes_ref_and_note(txtobj)
  end
  # Returns txtobj as given; no transformation is applied.
  def src_markup(txtobj)
    txtobj
  end
  # Converts internally marked endnotes in a paragraph's text to source markup.
  #% used for extraction of endnotes from paragraphs
  #
  # doc_obj_txt:: String containing endnotes delimited by Mx[:en_a_o] /
  #               Mx[:en_a_c] (and Mx[:en_b_o] / Mx[:en_b_c]).
  # endnotes_::   :separate pulls numbered a-type notes out of the text,
  #               leaving a "~^" marker behind; starred/plus (* +) and b-type
  #               notes are not touched in this mode. Any other value rewrites
  #               the notes inline as ~{ ... }~ (a-type) or ~[ ... ]~ (b-type).
  #
  # Returns [text, notes]. In :separate mode notes is an Array of
  # "^~N note text" strings in which break markers are replaced by the
  # source line-break markup; in every other mode notes is nil.
  def extract_endnotes(doc_obj_txt,endnotes_)
    if endnotes_ ==:separate
      # high cost to deal with <br> appropriately within plaintext, consider
      notes_a=doc_obj_txt.
        scan(/#{Mx[:en_a_o]}([\d]+\s+.+?)#{Mx[:en_a_c]}/).
        flatten.
        map do |note|
          note.gsub(/^([\d]+)\s+/,'^~\1 ').
            gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,
              ' \\\\\\ ')
        end
      doc_obj_txt=doc_obj_txt.
        gsub(/#{Mx[:en_a_o]}([\d]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'~^')   # endnote marker marked up
    else
      notes_a=nil # nothing is extracted when notes stay inline
      doc_obj_txt=doc_obj_txt.
        gsub(/#{Mx[:en_a_o]}[\d]+\s+(.+?)#{Mx[:en_a_c]}/,
          '~{ \1 }~').     # inline numbered endnote (this case was missing, leaving raw markers in the text)
        gsub(/#{Mx[:en_b_o]}[\d]+\s+(.+?)#{Mx[:en_b_c]}/,
          '~[ \1 ]~').     # inline endnote with marker marked up
        gsub(/#{Mx[:en_a_o]}([*+]+)\s+(.+?)#{Mx[:en_a_c]}/,
          '~{\1 \2 }~'). # inline endnote with marker marked up
        gsub(/#{Mx[:en_b_o]}([*+]+)\s+(.+?)#{Mx[:en_b_c]}/,
          '~[\1 \2 ]~') # inline endnote with marker marked up
    end
    [doc_obj_txt,notes_a]
  end
  def objects #def i_ao_o_src_markup_restore(txtobj)
    def code_(dob)
      if dob.is==:code
        dob.obj=dob.obj.gsub(/(^|[^}])_([<>])/m,'\1\2'). # _> _<
          gsub(/(^|[^}])_([<>])/m,'\1\2') # _<_<
      end
      dob
    end
    def block_(dob)
      dob.obj=if dob.of==:block                                   # watch
        dob.obj.gsub(/#{Mx[:gl_o]}●#{Mx[:gl_c]}/,"* ").
          gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n")
      else dob.obj.gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"\n\n")
      end
      dob
    end
    def textface_marks_po4a(dob,endnotes_=:inline)
      notes=''
      dob.obj=dob.obj.
        gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,
          Mx[:src_bold_o] + '\1' + Mx[:src_bold_c]).
        gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,
          Mx[:src_italics_o] + '\1' + Mx[:src_italics_c]).
        gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,
          Mx[:src_underscore_o] + '\1' + Mx[:src_underscore_c]).
        gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,
          Mx[:src_subscript_o] + '\1' + Mx[:src_subscript_c]).
        gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,
          Mx[:src_superscript_o] + '\1' + Mx[:src_superscript_c]).
        gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,
          Mx[:src_insert_o] + '\1' + Mx[:src_insert_c]).
        gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,
          Mx[:src_cite_o] + '\1' + Mx[:src_cite_c]).
        gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,
          Mx[:src_strike_o] + '\1' + Mx[:src_strike_c]).
        gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,
          Mx[:src_monospace_o] + '\1' + Mx[:src_monospace_c])
      unless dob.is==:code
        dob.obj=dob.obj.
          gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}#{Mx[:rel_o]}\S+?#{Mx[:rel_c]}/,'\1').
          gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1').
          gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,
            '\1 [link: <\2>]').
          gsub(/#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}image/,
            '\1 [link: local image]').
          gsub(/#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,'\1')
        dob.obj,notes=extract_endnotes(dob.obj,endnotes_)
        dob.obj=dob.obj.
          gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<').
          gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>').
          gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&').
          gsub(/#{Mx[:gl_o]}#033#{Mx[:gl_c]}/,'!').
          gsub(/#{Mx[:gl_o]}#035#{Mx[:gl_c]}/,'#').
          gsub(/#{Mx[:gl_o]}#042#{Mx[:gl_c]}/,'*').
          gsub(/#{Mx[:gl_o]}#045#{Mx[:gl_c]}/,'-').
          gsub(/#{Mx[:gl_o]}#047#{Mx[:gl_c]}/,'/').
          gsub(/#{Mx[:gl_o]}#095#{Mx[:gl_c]}/,'_').
          gsub(/#{Mx[:gl_o]}#123#{Mx[:gl_c]}/,'{').
          gsub(/#{Mx[:gl_o]}#125#{Mx[:gl_c]}/,'}').
          gsub(/#{Mx[:gl_o]}#126#{Mx[:gl_c]}/,'~').
          gsub(/#{Mx[:gl_o]}#169#{Mx[:gl_c]}/,'©')
      end
      dob=block_(dob)
      dob=code_(dob)
      dob.obj=dob.obj.gsub(/#{Mx[:br_page]}\s*|#{Mx[:br_page_new]}/,''). # remove page breaks
        gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1').
        gsub(/#{Mx[:mk_o]}:name#(\S+?)#{Mx[:mk_c]}/,'').                 # remove name links
        gsub(/&nbsp;|#{Mx[:nbsp]}/,' ').                                 # decide on
        gsub(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,
          '    [ \1 ]'). #"[ #{dir.url.images_local}\/\\1 ]")
        gsub(/(?:^|[^_\\])#{Mx[:lnk_o]}(\S+?\.(?:png|jpg|gif)) .+?#{Mx[:lnk_c]}image/,
          '    [ \1 ]'). #"[ #{dir.url.images_local}\/\\1 ]")
        gsub(/(?:^|[^_\\])\{\s*\S+?\.(?:png|jpg|gif)\s+.+?"(.*?)"\s*\}\S+/,
          '[image: "\1"]')
      [dob,notes]
    end
    def object_marks
      @txtobj
    end
    self
  end
end
__END__
#+END_SRC

* document header

#+NAME: sisu_document_header
#+BEGIN_SRC text
encoding: utf-8
- Name: SiSU

  - Description: documents, structuring, processing, publishing, search
    object_munge

  - Author: Ralph Amissah
    <ralph.amissah@gmail.com>

  - Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
    2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2019,
    2020, 2021, Ralph Amissah,
    All Rights Reserved.

  - License: GPL 3 or later:

    SiSU, a framework for document structuring, publishing and search

    Copyright (C) Ralph Amissah

    This program is free software: you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
    more details.

    You should have received a copy of the GNU General Public License along with
    this program. If not, see <http://www.gnu.org/licenses/>.

    If you have an Internet connection, the latest version of the GPL should be
    available at these locations:
    <http://www.fsf.org/licensing/licenses/gpl.html>
    <http://www.gnu.org/licenses/gpl.html>

    <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>

  - SiSU uses:
    - Standard SiSU markup syntax,
    - Standard SiSU meta-markup syntax, and the
    - Standard SiSU object citation numbering system

  - Homepages:
    <http://www.sisudoc.org>

  - Git
    <https://git.sisudoc.org/projects/>
    <https://git.sisudoc.org/projects/?p=software/sisu.git;a=summary>
    <https://git.sisudoc.org/projects/?p=markup/sisu-markup-samples.git;a=summary>
#+END_SRC