-- (C) Copyright International Business Machines Corporation 23 January 
-- 1990.  All Rights Reserved. 
--  
-- See the file USERAGREEMENT distributed with this software for full 
-- terms and conditions of use. 
-- File: tokenize.p
-- Author: Andy Lowry
-- SCCS Info: @(#)tokenize.p	1.3 3/2/92

-- This the tokenizer.  It is parameterized by four character classes:
-- word chars, white chars, quote chars, and bracket chars.  These
-- parameters are provided at initialization time.  Thereafter,
-- strings can be broken into tokens based on the given classes.  See
-- the definitions modules for a description of the four types of
-- tokens.

tokenize: using (tokenize, tokInternal)

process (Q: tokenizeInitQ)
  
declare
  args: tokenizeInit;
  tokQ: tokenizeQ;		-- our service port
  wordClass: charClass;		-- chars that can be in word tokens
  whiteClass: charClass;	-- chars that are skipped
  quoteClass: charClass;	-- chars that can surround quoted strings
  brackets: charMap;		-- bracket pairs for bracketed strings
  closeBrackets: charClass;	-- closing bracket characters
  allChars: charClass;		-- includes all classes... to check
				-- for a char in multiple classes
begin
  receive args from Q;
  
  -- create the character classes
  new wordClass;
  for c in args.wordChars[] inspect
    block begin
      insert copy of c into wordClass;
    on (duplicateKey)
      exit badClasses;
    end block;
  end for;

  new whiteClass;
  for c in args.whiteChars[] inspect
    block begin
      insert copy of c into whiteClass;
    on (duplicateKey)
      exit badClasses;
    end block;
  end for;
  
  new quoteClass;
  for c in args.quoteChars[] inspect
    block begin
      insert copy of c into quoteClass;
    on (duplicateKey)
      exit badClasses;
    end block;
  end for;
  
  if (size of args.bracketChars) mod 2 <> 0 then
    exit badClasses;		-- need an even number of bracket chars
  end if;
  new brackets;
  new closeBrackets;
  new allChars;
  block declare
    bracketPair: charPair;	-- a pair of bracket chars
    pos: integer;		-- current position in bracketChars
  begin
    pos <- 0;
    while pos < size of args.bracketChars repeat
      new bracketPair;
      bracketPair.first <- args.bracketChars[pos];
      bracketPair.second <- args.bracketChars[pos + 1];
      insert copy of bracketPair.first into allChars;
      insert copy of bracketPair.second into closeBrackets;
      insert bracketPair into brackets;
      pos <- pos + 2;
    end while;
  on (duplicateKey)
    exit badClasses;		-- either open or close bracket
				-- already appeared as such
  end block;
  
  -- make sure all the character classes are disjoint... open bracket
  -- chars are already in the allChars table
  block begin
    merge copy of wordClass into allChars;
    merge copy of whiteClass into allChars;
    merge copy of quoteClass into allChars;
    merge copy of closeBrackets into allChars;
  on (duplicateKey)
    exit badClasses;
  end block;
  discard allChars;		-- don't really need this for anything

  -- create a service port and give back the capability
  new tokQ;
  connect args.tokenize to tokQ;
  return args;
  discard Q;
  
  -- Now serve the public
  while 'true' repeat
    block declare
      tokArgs: tokenize;
      tokenString: charString;
      token: token;
      closeChar: char;
      quoteChar: char;
      state: tokenState;
      advance: boolean;
      stack: tokenizeStack;
      stackEntry: tokenizeStackEntry;
    begin
      receive tokArgs from tokQ;
      
      new tokArgs.tokens;	-- no tokens parsed yet
      new tokenString;		-- no chars yet in current token
      closeChar <- ' ';		-- ignored when stack is empty, but
				-- must be init for typestate correctness
      quoteChar <- ' ';		-- ditto
      new stack;		-- no pending bracketed strings
      state <- 'start';
      -- loop over all the characters, using a state machine to parse
      -- tokens as we go
      for c in tokArgs.string[] inspect
	advance <- 'false';
	while not advance repeat
	  select state
	  where ('start')
	    -- here when we're looking for the beginning of a new token
	    select
	    where (exists of whiteClass[c])
	      -- whitespace... skip it and don't change state
	      advance <- 'true';
	    where (exists of wordClass[c])
	      -- word character.. start accumulating a word token
	      state <- 'inWord';
	      insert copy of c into tokenString;
	      advance <- 'true';
	    where (exists of quoteClass[c])
	      -- quote character... begin accumulating a quoted string
	      quoteChar := c;
	      state <- 'inQuote';
	      advance <- 'true';
	    where (exists of brackets[c])
	      -- open bracket character... begin accumulating a
	      -- bracketed string token
	      new stackEntry;
	      stackEntry.brackets <- brackets[c];
	      closeChar := stackEntry.brackets.second;
	      -- embedded bracketed strings include their brackets
	      if size of stack <> 0 then
		insert copy of c into tokenString;
	      end if;
	      insert stackEntry into stack;
	      advance <- 'true';
	    where (size of stack <> 0 and c = closeChar)
	      -- this is the closing bracket for a bracketed string...
	      -- terminate it and install it if this is the bottom of
	      -- the stack
	      remove stackEntry from stack[size of stack - 1];
	      if size of stack = 0 then
		block declare
		  bTok: bracketedStringToken;
		begin
		  new bTok;
		  bTok.openBracket <- stackEntry.brackets.first;
		  bTok.closeBracket <- stackEntry.brackets.second;
		  bTok.string <- tokenString;
		  new tokenString;
		  unite token.bracketed from btok;
		  insert token into tokArgs.tokens;
		end block;
	      else
		insert copy of c into tokenString;
	      end if;
	      advance <- 'true';
	    where (exists of closeBrackets[c] and 
		    (size of stack <> 0 or c <> closeChar))
	      -- closing bracket where it's not allowed
	      exit illFormed;
	    otherwise
	      -- this is a delimiter character... wrap it up as a token
	      -- and move on
	      unite token.delimiter from copy of c;
	      insert token into tokArgs.tokens;
	      advance <- 'true';
	    end select;
	    
	    
	  where ('inWord')
	    -- here when we're accumulating a word token.  Any character
	    -- in the word class continues this token, any other
	    -- terminates it without advancing the character position in
	    -- the string being parsed
	    if exists of wordClass[c] then
	      insert copy of c into tokenString;
	      advance <- 'true';
	    else
	      unite token.word from tokenString;
	      insert token into tokArgs.tokens;
	      new tokenString;
	      state <- 'start';
	      advance <- 'false';
	    end if;
	    
	  where ('inQuote')
	    -- here when we're accumulating a quoted string
	    if c = quoteChar then
	      -- this may be the closing quote, or it may be the first
	      -- in a doubled quote configuration
	      state <- 'sawQuote';
	    else
	      -- non-quote char... consume it
	      insert copy of c into tokenString;
	    end if;
	    advance <- 'true';
	    
	  where ('sawQuote')
	    -- here when we've just seen a quote char that either closes
	    -- the current quoted string or introduces a doubled quote
	    -- that should generate a single quote in the string text
	    if c = quoteChar then
	      -- doubled quote... insert one and go back to accumulating
	      -- more quoted text
	      insert copy of c into tokenString;
	      state <- 'inQuote';
	    else
	      -- Something else... previous quote terminated the string.
	      -- We install the token and advance to next char in
	      -- start state
	      block declare
		qTok: quotedStringToken;
	      begin
		new qTok;
		qTok.quoteChar := c;
		qTok.string <- tokenString;
		unite token.quoted from qTok;
		insert token into tokArgs.tokens;
		new tokenString;
		state <- 'start';
	      end block;
	    end if;
	    advance <- 'true';
	    
	  otherwise
	    -- unknown state
	    exit cantHappen;
	  end select;
	end while;
      end for;
      
      -- finished with tokenization of string... tie off a word token if
      -- we're in the middle of one.  Ditto for a quoted string.
      -- Otherwise we must be in the start state.  In any case, the
      -- bracket stack must be empty 
      select state
      where ('inWord')
	unite token.word from tokenString;
	insert token into tokArgs.tokens;
      where ('sawQuote')
	block declare
	  qTok: quotedStringToken;
	begin
	  new qTok;
	  qTok.quoteChar := quoteChar;
	  qTok.string <- tokenString;
	  unite token.quoted from qTok;
	  insert token into tokArgs.tokens;
	end block;
      where ('start')
      otherwise
	exit illFormed;
      end select;
      if size of stack <> 0 then
	exit illFormed;
      end if;
      
      -- we're in a good state... return the tokens
      return tokArgs;
      
    on exit(illFormed)
      -- character string could not be tokenized properly
      return tokArgs exception illFormed;

    end block;

  end while;			-- bottom of service loop
  
on exit (badClasses)
  -- character classes given in initialization message were erroneous
  return args exception badClasses;
  
on exit (cantHappen)
  print charString#"CantHappen happened in tokenizer";
  
on (disconnected)
  -- nothing more to tokenize
end process
