(* lexer.mll *)
{
(* Header: OCaml code copied verbatim into the generated lexer. *)
open Lexing
open Mls_parser
(* Kinds of lexical errors this lexer can report. *)
type lexical_error =
  | Illegal_character
  | Unterminated_comment
  | Bad_char_constant
  | Unterminated_string

(* Raised as [Lexical_error (err, start, stop)] where [start] and [stop]
   are absolute character offsets in the input. *)
exception Lexical_error of lexical_error * int * int

(* Current nesting depth of (* ... *) comments; maintained by the
   [comment] rule below. *)
let comment_depth = ref 0
(* Keywords of the language mapped to their parser tokens.  Looked up by
   the identifier rule in [token]; identifiers not in the table become
   NAME tokens. *)
let keyword_table = ((Hashtbl.create 149) : (string, token) Hashtbl.t)

let () =
  List.iter (fun (str, tok) -> Hashtbl.add keyword_table str tok) [
    "node", NODE;
    "returns", RETURNS;
    "var", VAR;
    "let", LET;
    "tel", TEL;
    "fby", FBY;
    "when", WHEN;
    "merge", MERGE;
    "type", TYPE;
    "true", BOOL(true);
    "false", BOOL(false);
    "pre", PRE;
    "or", OR;
    "not", NOT;
    "open", OPEN;
    "reset", RESET;
    "if", IF;
    "then", THEN;
    "else", ELSE;
    "with", WITH;
    "map", MAP;
    "fold", FOLD;
    "mapfold", MAPFOLD;
    "default", DEFAULT;
    (* Alphabetic operators carry their fixity level as INFIXn tokens. *)
    "quo", INFIX3("quo");
    "mod", INFIX3("mod");
    "land", INFIX3("land");
    "lor", INFIX2("lor");
    "lxor", INFIX2("lxor");
    "lsl", INFIX4("lsl");
    "lsr", INFIX4("lsr");
    "asr", INFIX4("asr")
  ]
(* To buffer string literals *)

(* Accumulates the characters of the string/pragma literal currently
   being scanned.  [String.create]/[String.set] mutated a [string],
   which is illegal under safe-strings (OCaml >= 4.06); a [Buffer]
   provides the same grow-on-demand behaviour from the stdlib. *)
let string_buff = Buffer.create 256

(* Empty the literal buffer before scanning a new literal. *)
let reset_string_buffer () =
  Buffer.clear string_buff

(*
let incr_linenum lexbuf =
  let pos = lexbuf.Lexing.lex_curr_p in
  lexbuf.Lexing.lex_curr_p <- { pos with
    Lexing.pos_lnum = pos.Lexing.pos_lnum + 1;
    Lexing.pos_bol = pos.Lexing.pos_cnum;
  }
*)

(* Append one character to the literal being built. *)
let store_string_char c =
  Buffer.add_char string_buff c

(* Return the accumulated literal and reset the buffer for the next
   one. *)
let get_stored_string () =
  let s = Buffer.contents string_buff in
  Buffer.clear string_buff;
  s
(* Map the character following a backslash escape to the character it
   denotes; any unrecognized escape denotes itself. *)
let char_for_backslash = function
  | 'n' -> '\010'  (* newline *)
  | 'r' -> '\013'  (* carriage return *)
  | 'b' -> '\008'  (* backspace *)
  | 't' -> '\009'  (* tab *)
  | c -> c
(* Decode the three decimal digits found at offsets [i], [i+1], [i+2] of
   the current lexeme into the character they encode.  The value is
   truncated to one byte ([land 0xFF]), matching '\ddd' escapes. *)
let char_for_decimal_code lexbuf i =
  let digit k = Char.code (Lexing.lexeme_char lexbuf k) - Char.code '0' in
  let c = 100 * digit i + 10 * digit (i + 1) + digit (i + 2) in
  Char.chr (c land 0xFF)
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
(* Main entry point: returns the next token of the input. *)
rule token = parse
  (* Skip whitespace: space, LF, CR, TAB, FF. *)
  | [' ' '\010' '\013' '\009' '\012'] + { token lexbuf }
  | "."  {DOT}
  | ".." {DOTDOT}
  | "("  {LPAREN}
  | ")"  {RPAREN}
  | "*"  { STAR }
  | "{"  {LBRACE}
  | "}"  {RBRACE}
  | "["  {LBRACKET}
  | "]"  {RBRACKET}
  | ":"  {COLON}
  | ";"  {SEMICOL}
  | "="  {EQUAL}
  | "==" {EQUALEQUAL}
  | "&"  {AMPERSAND}
  | "&&" {AMPERAMPER}
  | "||" {BARBAR}
  | ","  {COMMA}
  | "->" {ARROW}
  | "|"  {BAR}
  | "-"  {SUBTRACTIVE "-"}
  | "-." {SUBTRACTIVE "-."}
  | "^"  {POWER}
  | "@"  {AROBASE}
  | "<<" {DOUBLE_LESS}
  | ">>" {DOUBLE_GREATER}
  (* Capitalized identifiers are constructors (tried before the general
     identifier rule; ocamllex prefers the earlier rule on equal-length
     matches). *)
  | (['A'-'Z']('_' ? ['A'-'Z' 'a'-'z' '\'' '0'-'9']) * as id)
      {CONSTRUCTOR id}
  (* Identifiers: keywords resolve through [keyword_table], the rest
     become NAME. *)
  | (['A'-'Z' 'a'-'z']('_' ? ['A'-'Z' 'a'-'z' '\'' '0'-'9']) * as id)
      { let s = Lexing.lexeme lexbuf in
        try Hashtbl.find keyword_table s
        with Not_found -> NAME id }
  (* Integer literals: decimal, hex, octal, binary.
     NOTE(review): the leading '-'? makes "a-1" lex as NAME, INT(-1)
     rather than NAME, SUBTRACTIVE, INT — presumably handled by the
     parser; confirm before changing. *)
  | '-'? ['0'-'9']+
  | '-'? '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
  | '-'? '0' ['o' 'O'] ['0'-'7']+
  | '-'? '0' ['b' 'B'] ['0'-'1']+
      { INT (int_of_string(Lexing.lexeme lexbuf)) }
  (* Float literals (plain integers match the INT rule above on ties). *)
  | '-'? ['0'-'9']+ ('.' ['0'-'9']*)? (['e' 'E'] ['+' '-']? ['0'-'9']+)?
      { FLOAT (float_of_string(Lexing.lexeme lexbuf)) }
  (* Pragma comments "(*@ name ... @*)": body collected into the string
     buffer by the [pragma] rule. *)
  | "(*@ " (['A'-'Z' 'a'-'z']('_' ? ['A'-'Z' 'a'-'z' '\'' '0'-'9']) * as id)
      { reset_string_buffer();
        let pragma_start = lexbuf.lex_start_pos + lexbuf.lex_abs_pos in
        begin try
          pragma lexbuf
        with Lexical_error(Unterminated_comment, _, pragma_end) ->
          (* Re-raise with the position of the opening "(*@". *)
          raise(Lexical_error(Unterminated_comment, pragma_start, pragma_end))
        end;
        lexbuf.lex_start_pos <- pragma_start - lexbuf.lex_abs_pos;
        PRAGMA(id, get_stored_string()) }
  (* Ordinary comments: skipped, with nesting handled by [comment]. *)
  | "(*"
      { let comment_start = lexbuf.lex_start_pos + lexbuf.lex_abs_pos in
        comment_depth := 1;
        begin try
          comment lexbuf
        with Lexical_error(Unterminated_comment, _, comment_end) ->
          raise(Lexical_error(Unterminated_comment,
                              comment_start, comment_end))
        end;
        token lexbuf }
  (* Symbolic operators, classified by their first character(s) into
     prefix and infix precedence levels. *)
  | ['!' '?' '~']
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':'
     '<' '=' '>' '?' '@' '^' '|' '~'] *
      { PREFIX(Lexing.lexeme lexbuf) }
  | ['=' '<' '>' '&' '|' '$']
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>'
     '?' '@' '^' '|' '~'] *
      { INFIX0(Lexing.lexeme lexbuf) }
  | ['@' '^']
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>'
     '?' '@' '^' '|' '~'] *
      { INFIX1(Lexing.lexeme lexbuf) }
  | ['+' '-']
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>'
     '?' '@' '^' '|' '~'] *
      { INFIX2(Lexing.lexeme lexbuf) }
  (* "**..." binds tighter than other '*'-operators: tried first. *)
  | "**"
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>'
     '?' '@' '^' '|' '~'] *
      { INFIX4(Lexing.lexeme lexbuf) }
  | ['*' '/' '%']
    ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>'
     '?' '@' '^' '|' '~'] *
      { INFIX3(Lexing.lexeme lexbuf) }
  | eof {EOF}
  (* Anything else is a lexical error at the current lexeme. *)
  | _ {raise (Lexical_error (Illegal_character,
                             Lexing.lexeme_start lexbuf,
                             Lexing.lexeme_end lexbuf))}
(* Scan the body of a "(*@ ... @*)" pragma, storing its characters into
   the string buffer.  Nested ordinary comments are skipped (not
   stored). *)
and pragma = parse
  | "(*"
      { let comment_start = lexbuf.lex_start_pos + lexbuf.lex_abs_pos in
        comment_depth := 1;
        begin try
          comment lexbuf
        with Lexical_error(Unterminated_comment, _, comment_end) ->
          raise(Lexical_error(Unterminated_comment,
                              comment_start, comment_end))
        end;
        pragma lexbuf }
  (* "@*)" terminates the pragma. *)
  | "@*)"
      { }
  | eof
      { raise(Lexical_error(Unterminated_comment, 0,
                            Lexing.lexeme_start lexbuf)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        pragma lexbuf }
(* Skip a possibly-nested comment; [comment_depth] holds the current
   nesting level.  String and character literals inside comments are
   scanned so that quote characters in them cannot close the comment
   prematurely. *)
and comment = parse
  | "(*"
      { comment_depth := succ !comment_depth; comment lexbuf }
  | "*)"
      { comment_depth := pred !comment_depth;
        if !comment_depth > 0 then comment lexbuf }
  (* A string literal inside a comment: scanned by [string] (its
     contents are discarded by the caller). *)
  | "\""
      { reset_string_buffer();
        let string_start = lexbuf.lex_start_pos + lexbuf.lex_abs_pos in
        begin try
          string lexbuf
        with Lexical_error(Unterminated_string, _, string_end) ->
          raise(Lexical_error(Unterminated_string, string_start, string_end))
        end;
        comment lexbuf }
  (* Character literals, so that e.g. '*' or ')' in one is harmless. *)
  | "''"
      { comment lexbuf }
  | "'" [^ '\\' '\''] "'"
      { comment lexbuf }
  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { comment lexbuf }
  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { comment lexbuf }
  | eof
      { raise(Lexical_error(Unterminated_comment, 0,
                            Lexing.lexeme_start lexbuf)) }
  | _
      { comment lexbuf }
(* Scan a string literal body (opening quote already consumed), storing
   decoded characters into the string buffer; returns at the closing
   quote. *)
and string = parse
  | '"'
      { () }
  (* Backslash-newline (with following indentation) is a line
     continuation: skipped entirely. *)
  | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
      { string lexbuf }
  | '\\' ['\\' '"' 'n' 't' 'b' 'r']
      { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char(char_for_decimal_code lexbuf 1);
        string lexbuf }
  | eof
      { raise (Lexical_error
                 (Unterminated_string, 0, Lexing.lexeme_start lexbuf)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        string lexbuf }
|
|
|
|
|
|
|
|
(* eof *)
|