#! /usr/local/bin/gawk -f
# ider
# $Id: ider,v 1.7 1997/03/05 01:39:27 priestdo Exp $
# script to add id attributes to an sgml document
# by Greg E Priest-Dorman
#
# Assumes that the start tag used to signal the beginning of labeling
# is the last tag on the line containing it.
#
# Assumes all tags have end tags except empty tags and those are
# specified in the variable empty.
#
# start tags CAN span lines, but the "cost" is that no attribute
# should end in "id" except "id".  This is true for the cesDoc.dtd.
#
# Usage:
#
#   ider [prefix="STRING"] [empty=" TAG1 TAG2 TAG3 ... "]
#        [label=" TAG1 TAG2 TAG3 ... "] [bodytest="TAG"]
#        [fixfile="name"] FILENAME
#
# User should check and change the variables "prefix" "empty" "label"
# and "bodytest" as desired.  CesDoc users need only change "prefix".
# Variables should be changed on the command line as in:
#
#   ider prefix="Osl" Osl.ces > Osl.ids.ces
#
# the above command will produce Osl.ids.ces and possibly Osl.ces.fix
#
# or (a more complex example) if you have part 1 chapter 2
# in a file by itself, starting with the DIV of chapter 2, you could
# do:
#
#   ider prefix="Osl.1.2" label=" p poem list div " bodytest="div" > Osl.1.2.id
#
# this will produce Osl.1.2.id and possibly Osl.1.2.fix
#
# Prints preexisting IDs on any tag that gets a new ID to a file
# fixfile to facilitate correction.  (It DOES overwrite any old
# fixfile of the same name.)  The sgml may be invalid until these IDs
# are corrected (unless there are no references to the old IDs).
#
# The only side effect I can see of this process is that if an id
# attribute appears on a line by itself in the original then you
# will have a blank line in the file.
#
# FIXED in v 1.7
#   ider now correctly counts empty elements
#   ider now generates a cleaner ".fix" file
#
# BUGS
#   This script does not ignore tags inside comments and it should.
#
# Configuration defaults.  Command-line assignments of the form
# name="value" that appear before FILENAME are applied after BEGIN has
# run, so they override the values set here.
BEGIN {
    # prefix to appear at the beginning of each id
    prefix = "Oen"

    # empty is a list of all empty tags, space separated, with a
    # leading and trailing space.  The default list is for the
    # cesDoc.dtd
    empty = " ptr gap catref "

    # label is a list of all tags to label, space separated, with a
    # leading and trailing space.  To label all the tags after bodytest,
    # set label equal to ""
    label = " quote p q poem list div "

    # for use on documents that do not have a BODY tag change
    # the value of bodytest to the desired tag name or set
    # inbody = 1, default is set for use on full cesDoc documents.
    # All tags beginning with the line AFTER bodytest is matched
    # will be checked for labeling.
    bodytest = "body"

    # fixfile is the name of the file to write the IDs that
    # have changed.  If not specified it defaults to the name of
    # the input file with ".fix" appended to it.
    # This file is in the form of a sed script that will convert
    # old IDs referenced in the text (or other texts) to the new IDs.
    # Some care needs to be exercised in using this script.  The lines
    # may need to be inverted and/or you might want to use a different
    # prefix which can then be changed as the last line of the fixfile.
    #
    # The default is deliberately NOT assigned here: FILENAME is still
    # empty while BEGIN runs, so building the name here would always
    # yield ".fix".  The rule following this block fills it in once the
    # first record has been read.

    #
    # user should not change any of the variables below this point
    #
    FS = "<"
    OFS = "<"
    IGNORECASE = 1      # gawk: make regex matching case-insensitive
}
# END OF BEGIN

# Resolve the default fixfile name as soon as input is being read (FILENAME
# is valid from the first record on).  A fixfile="name" given on the command
# line before FILENAME has already been assigned by then and is left alone.
fixfile == "" { fixfile = FILENAME ".fix" }

# writefix(fixstring, tloc)
#
# Function to write id's to be fixed to a file in sed format.
#
#   fixstring  text of a (partial) start tag that may carry an id attribute
#   tloc       the new id value generated for that tag
#
# If fixstring contains "id=<value>", a line of the form
#     s/=<old value>/="<tloc>"/g
# is written to fixfile and the old attribute is removed from the string.
# Returns the string with the old id attribute removed, or the string
# unchanged when it has none.
#
# NOTE: awk passes scalars by value, so the caller's field is not changed
# by this function; the caller must assign the return value back to it.
function writefix(fixstring, tloc)
{
    if (match(fixstring, "id=[^ >]*")) {
        # RSTART + 2 / RLENGTH - 2 skip the leading "id", keeping "=<value>"
        print "s/" substr(fixstring, RSTART + 2, RLENGTH - 2) "/=\"" tloc "\"/g" > fixfile
        sub("id=[^ >]*", "", fixstring)
    }
    return fixstring
}

# This first bit with tagsplit cleans up any ids on tags that span
# lines.  It is really ugly, but it does the job.  I would like to know
# of an easier way to check for all the cases...
# Multi-line start tag clean-up.
#
# tagsplit is set by the main rule below when a labelled start tag is not
# closed on its own line and no id attribute was found on that line.  On
# the following line(s) the text before the first "<" ($1) is the rest of
# that tag, so any old id attribute found there is logged and removed via
# writefix().  The switch is reset once the closing ">" is seen.
tagsplit != 0 {
    if (match($1, ">")) {
        # found the end of the tag: reset the multiline switch
        tagsplit = 0
        if (match($1, "id=[^>]*>"))
            $1 = writefix($1, tloc)
    } else {
        # still inside the tag; writefix() leaves $1 alone if it has no id
        $1 = writefix($1, tloc)
    }
}

#
# here we go...
#
# Main rule.  With FS = "<", every field after the first starts right
# after a "<", i.e. at a tag name, a "/" (end tag) or something else
# (e.g. "!" declarations), which is ignored.
#
# State kept across records:
#   inbody    non-zero once a line matching "<" bodytest has been seen
#   depth     current nesting depth of open elements
#   tree[d]   running count of elements seen at depth d under the
#             currently open parent
#   tloc      id generated most recently (prefix plus ".count" per level)
{
    # Are we in the body yet?
    if (inbody == 0) {
        # labelling starts with the line AFTER the one that matches
        inbody = match($0, "<" bodytest)
    } else if (NF > 1) {
        # there is at least one tag on this line: check each one
        for (field = 2; field <= NF; ++field) {
            if (match($field, "^/")) {
                # end tag
                tree[depth] = 0     # zero out the counter at this depth
                depth--             # move pointer back
            } else if (match($field, "^[A-Za-z]")) {
                # start tag (letters only; "[A-z]" would also have
                # accepted the punctuation between "Z" and "a")
                tree[depth++]++     # increment depth and counter

                # get the name of the current tag
                match($field, "^[^ >]*")
                tag = substr($field, RSTART, RLENGTH)
                padtag = " " tag " "    # pad with blanks to prevent mismatch

                # check if this tag should be labelled
                if (match(label, padtag) || (label == "")) {
                    tloc = prefix
                    for (i = 0; i <= (depth - 1); i++)
                        tloc = tloc "." tree[i]

                    # What if there is already an ID on the element?
                    if (field == NF && !(match($field, ">"))) {
                        # Start tag spans lines.  If the id attribute is
                        # on this line we are in the clear, otherwise set
                        # the tagsplit switch so it is caught on the next
                        # line(s).
                        if (match($field, " id=[^ ]*"))
                            $field = writefix($field, tloc)
                        else
                            tagsplit = 1
                    } else if (match($field, " id=[^>]*>")) {
                        # normal case: the start tag is all on this line
                        $field = writefix($field, tloc)
                    }

                    # Write the new id as the first attribute.  The tag
                    # name is the leading part of the field, so splice
                    # literally instead of using sub(): this keeps regex
                    # metacharacters in the tag name and "&" in the
                    # prefix from being interpreted.
                    $field = tag " id=\"" tloc "\"" substr($field, length(tag) + 1)
                }

                # an empty element has no end tag: step back out at once
                if (match(empty, padtag))
                    depth--
            }
        }
    }
    print
}

# END
# Closing messages go to standard error so they do not end up in the
# converted document written on standard output.
END {
    print "Done" > "/dev/stderr"
    print "Thank you for flying with Multext-East." > "/dev/stderr"
    print "Remember to change BYTECOUNT and look at " fixfile > "/dev/stderr"
    print "If you have any comments about this script" > "/dev/stderr"
    print "please send them to priestdo@cs.vassar.edu" > "/dev/stderr"
}