#! /usr/local/bin/gawk -f
# ider

#     $Id: ider,v 1.7 1997/03/05 01:39:27 priestdo Exp $

# script to add id attributes to an sgml document 
# by Greg E Priest-Dorman 

# Assumes that the start tag used to signal the beginning of labeling
# is the last tag on the line containing it.

# Assumes all tags have end tags except empty tags and those are 
# specified in the variable empty.
 
# start tags CAN span lines, but the "cost" is that no attribute
# should end in "id" except "id".  This is true for the cesDoc.dtd.  

# Usage:
#
# ider [prefix="STRING"] [empty=" TAG1 TAG2 TAG3 ... "]
#      [label=" TAG1 TAG2 TAG3 ... "] [bodytest="TAG"]
#      [fixfile="name"] FILENAME 

# User should check and change the variables "prefix" "empty" "label"
# and "bodytest" as desired.  CesDoc users need only change "prefix".
# variables should be changed on the command line as in:
#
#   ider prefix="Osl" Osl.ces > Osl.ids.ces
#
# the above command will produce Osl.ids.ces and possibly Osl.ces.fix
#
# or (a more complex example) if you have part 1 chapter 2
# in a file by itself, starting with the DIV of chapter 2, you could 
# do:
#
# ider prefix="Osl.1.2" label=" p poem list div " bodytest="div" > Osl.1.2.id
# 
# this will produce Osl.1.2.id and possibly Osl.1.2.fix
#

# Prints preexisting IDs on any tag that gets a new ID to a file
# fixfile to facilitate correction.  (It DOES overwrite any old
# fixfile of the same name) The sgml may be invalid until these IDs
# are corrected (unless there are no references to the old IDs).  
# The only side effect I can see of this process is that if an id 
# attribute appears on a line by itself in the original then you 
# will have a blank line in the file.

# FIXED in v 1.7
# ider now correctly counts empty elements 
# ider now generates a cleaner ".fix" file

# BUGS
# This script does not ignore tags inside comments and it should.
# 


BEGIN { 

        # prefix to appear at the beginning of each id

    prefix = "Oen"

        # empty is a list of all empty tags, space separated, with a
        # leading and trailing space.  The default list is for the
        # cesDoc.dtd

    empty = " ptr gap catref "

        # label is a list of all tags to label, space separated, with a
        # leading and trailing space.  To label all the tags after bodytest, 
        # set label equal to ""

    label = " quote p q poem list div "

        # for use on documents that do not have a BODY tag change
        # the value of bodytest to the desired tag name or set
        # inbody = 1, default is set for use on full cesDoc documents.
        # All tags beginning with the line AFTER bodytest is matched
        # will be checked for labeling.

    bodytest = "body"

        # fixfile is the name of the file to write the IDs that
        # have changed.  If not specified it defaults to the name of 
        # the input file with ".fix" appended to it.
        # This file is in the form of a sed script that will convert 
        # old IDs referenced in the text (or other texts) to the new IDs
        # some care needs to be exercised in using this script.  The lines 
        # may need to be inverted and/or you might want to use a different
        # prefix which can then be changed as the last line of the fixfile. 
        #
        # FILENAME is still empty while BEGIN runs, so the default is
        # derived from the first command line operand that is neither a
        # var=value assignment nor "-" (standard input).  When no such
        # operand exists the old fallback (FILENAME ".fix") is kept.
        # A fixfile="name" assignment on the command line is processed
        # after BEGIN and therefore still overrides this default.

    fixfile = FILENAME ".fix"
    for (argi = 1; argi < ARGC; argi++)
      {
        if (ARGV[argi] == "" || ARGV[argi] == "-")
            continue
        if (ARGV[argi] ~ /^[A-Za-z_][A-Za-z0-9_]*=/)
            continue
        fixfile = ARGV[argi] ".fix"
        break
      }

#
# user should not change any of the variables below this point
#
    FS = "<"  
    OFS = "<"

        # gawk's case-insensitivity switch is spelled IGNORECASE; with it
        # set, the regexp matches in this script ignore case.

    IGNORECASE = 1
        #  
      }	# END OF BEGIN

# writefix(fixstring, tloc)
#
# Looks for the first id attribute ("id=" followed by everything up to
# the next blank or ">") in fixstring.  When one is found, a sed
# substitution command mapping the old value to the new id tloc is
# printed to fixfile, and the attribute is cut out of the text.
#
# Returns fixstring with that attribute removed, or fixstring unchanged
# when it holds no id attribute.  The argument is passed by value, so
# the caller must assign the result back to the field it came from.

function writefix(fixstring,tloc)
{
    # nothing to record: hand the text back untouched
    if (!match(fixstring, "id=[^ >]*"))
        return fixstring

    # skip the two characters "id" so the sed pattern starts at the "="
    print "s/" substr(fixstring, RSTART + 2, RLENGTH - 2) "/=\"" tloc "\"/g" >fixfile

    # splice the matched attribute out of the text
    return substr(fixstring, 1, RSTART - 1) substr(fixstring, RSTART + RLENGTH)
}

# This first bit with tagsplit cleans up any ids on tags that span
# lines.  It is really ugly, but it does the job.  I would like to know
# of an easier way to check for all the cases...

# Runs only while a labelled start tag that spans lines is still open
# (tagsplit was set on an earlier line).  $1 is the text before the
# first "<" on this line, i.e. the continuation of that tag.
tagsplit != 0 {
    if (index($1, ">") == 0)
      {
        # the tag is still open: drop any old id found on this line
        $1 = writefix($1,tloc)
      }
    else
      {
        # the closing ">" has arrived: leave multiline mode
        tagsplit = 0

        # only strip an id that sits in front of a ">"
        if (match( $1, "id=[^>]*>" ))
            $1 = writefix($1,tloc)
      }
}

# 
# here we go...
#

{

# Main rule, run for every input line.
#
# Until the bodytest tag has been seen the line is only checked for it.
# After that, every "<"-separated field from the second on is treated
# as the start of a tag:
#   - an end tag zeroes the counter at the current depth and steps back;
#   - a start tag bumps the counter at the current depth and steps in.
# tree[d] is the running count of start tags seen at depth d; the new
# id of a labelled tag is prefix followed by "." tree[0] "." tree[1] ...
# up to the tag's own depth.  The (possibly modified) line is always
# printed.

# Are we in the body yet?

  if (inbody == 0)
    { 
      inbody = match( $0, "<" bodytest )
    }
  else if ( NF > 1)  # there is at least one tag on this line
    {                
      for ( field = 2; field <= NF; ++field )  # check each tag 
        { 

          # "^/" rather than "^\/": inside a string constant gawk warns
          # that the escape sequence \/ is treated as a plain "/"
          if ( match($field,"^/") ) # end tag
            {                      
              tree[depth] = 0        # zero out the counter at this depth
              depth--                # move pointer back   
            }

          # [A-Za-z] rather than [A-z]: the latter range also covers
          # the characters [ \ ] ^ _ and backtick
          else if ( match($field,"^[A-Za-z]") ) # start tag
            {
              tree[depth++]++                # increment depth and counter

# get the name of the current tag

              match($field,"^[^ >]*")
              tag =  substr($field, RSTART, RLENGTH) 
              padtag = " " tag " "            # pad with blanks to prevent 
                                              # mismatch

# check if this tag should be labeled 

              if ( match( label, padtag )||(label == "" ))
                {   
                  tloc = prefix          
                  for ( i = 0; i <= (depth -1); i++)
                    {           
                      tloc = tloc "." tree[i]         
                    }

# what if there is already an ID on the element
# This gets very messy...
#

# This first part looks at the case of a start tag that spans lines: if
# the id attribute is on this line, we are in the clear, otherwise set
# the tagsplit switch so we will catch it on the next line(s)

                  if (field == NF &&  !(match( $field, ">"))) 
                    {
                      if (match( $field, " id=[^ ]*"))   
                        {
                          $field = writefix($field,tloc)
                        } 
                      else
                        {
                          tagsplit = 1
                        }
                    }

# Now deal with the normal case, where the start tag is all on this line

                  else if (match( $field, " id=[^>]*>"))
                    {
                      $field = writefix($field,tloc)
                    }

# Write the new id as the first attribute.  The tag name is known to
# sit at the very start of the field, so splice the attribute in by
# position; this keeps the tag name from being read as a regexp and
# keeps "&" or "\" in the prefix from being interpreted by sub().

                  $field = tag " id=\"" tloc "\"" substr($field, length(tag) + 1)

                }

# check if this tag was an empty element: it has no end tag, so step
# back out right away

              if ( match( empty, padtag )) { depth-- }

            } # end of IF START TAG  
        } # end of CHECK EACH TAG
    } # end of THERE IS AT LEAST ONE TAG ON THIS LINE
  print
} # END

# Closing notice.  Every line is piped through "cat 1>&2" so that it
# lands on standard error instead of in the converted document.
END  {
    notice[1] = "Done"
    notice[2] = "Thank you for flying with Multext-East."
    notice[3] = "Remember to change BYTECOUNT and look at " fixfile
    notice[4] = "If you have any comments about this script"
    notice[5] = "please send them to priestdo@cs.vassar.edu"

    for (msgno = 1; msgno <= 5; msgno++)
        print notice[msgno] | "cat 1>&2"
  }