查看正文内容
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;namespace Rosetta {
public enum TokenType {
End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String, None}/// <summary>/// Storage class for tokens/// </summary>public class Token {
public TokenType Type {
get; set; }public int Line {
get; set; }public int Position {
get; set; }public string Value {
get; set; }public override string ToString() {
if (Type == TokenType.Integer || Type == TokenType.Identifier) {
return String.Format("{0,-5} {1,-5} {2,-14} {3}", Line, Position, Type.ToString(), Value);} else if (Type == TokenType.String) {
return String.Format("{0,-5} {1,-5} {2,-14} \"{3}\"", Line, Position, Type.ToString(), Value.Replace("\n", "\\n"));}return String.Format("{0,-5} {1,-5} {2,-14}", Line, Position, Type.ToString());}}/// <summary>/// C# Example of Lexical scanner for Rosetta Compiler/// </summary>public class LexicalScanner {
// character classes private const string _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"; private const string _numbers = "0123456789";private const string _identifier = _letters + _numbers + "_";private const string _whitespace = " \t\n\r";// mappings from string keywords to token type private Dictionary<string, TokenType> _keywordTokenTypeMap = new Dictionary<string, TokenType>() {
{
"if", TokenType.Keyword_if },{
"else", TokenType.Keyword_else },{
"while", TokenType.Keyword_while },{
"print", TokenType.Keyword_print },{
"putc", TokenType.Keyword_putc }};// mappings from simple operators to token typeprivate Dictionary<string, TokenType> _operatorTokenTypeMap = new Dictionary<string, TokenType>() {
{
"+", TokenType.Op_add },{
"-", TokenType.Op_subtract },{
"*", TokenType.Op_multiply },{
"/", TokenType.Op_divide },{
"%", TokenType.Op_mod },{
"=", TokenType.Op_assign },{
"<", TokenType.Op_less },{
">", TokenType.Op_greater },{
"!", TokenType.Op_not },};private List<string> _keywords;private string _operators = "+-*/%=<>!%";private string _code;private List<Token> tokens = new List<Token>();private int _line = 1;private int _position = 1;public string CurrentCharacter {
get {
try {
return _code.Substring(0, 1);} catch (ArgumentOutOfRangeException) {
return "";}}}/// <summary>/// Lexical scanner initialiser/// </summary>/// <param name="code">Code to be tokenised</param>public LexicalScanner (string code) {
_code = code;_keywords = _keywordTokenTypeMap.Keys.ToList();}/// <summary>/// Advance the cursor forward given number of characters/// </summary>/// <param name="characters">Number of characters to advance</param>private void advance(int characters=1) {
try {
// reset position when there is a newlineif (CurrentCharacter == "\n") {
_position = 0;_line++;}_code = _code.Substring(characters, _code.Length - characters);_position += characters;} catch (ArgumentOutOfRangeException) {
_code = "";}}/// <summary>/// Outputs error message to the console and exits /// </summary>/// <param name="message">Error message to display to user</param>/// <param name="line">Line error occurred on</param>/// <param name="position">Line column that the error occurred at</param>public void error(string message, int line, int position) {
// output error to the console and exitConsole.WriteLine(String.Format("{0} @ {1}:{2}", message, line, position));Environment.Exit(1);}/// <summary>/// Pattern matching using first & follow matching/// </summary>/// <param name="recogniseClass">String of characters that identifies the token type/// or the exact match the be made if exact:true</param>/// <param name="matchClass">String of characters to match against remaining target characters</param>/// <param name="tokenType">Type of token the match represents.</param>/// <param name="notNextClass">Optional class of characters that cannot follow the match</param>/// <param name="maxLen">Optional maximum length of token value</param>/// <param name="exact">Denotes whether recogniseClass represents an exact match or class match. /// Default: false</param>/// <param name="discard">Denotes whether the token is kept or discarded. Default: false</param>/// <param name="offset">Optiona line position offset to account for discarded tokens</param>/// <returns>Boolean indicating if a match was made </returns>public bool match(string recogniseClass, string matchClass, TokenType tokenType, string notNextClass=null, int maxLen=Int32.MaxValue, bool exact=false, bool discard=false, int offset=0) {
// if we've hit the end of the file, there's no more matching to be doneif (CurrentCharacter == "")return false;// store _current_ line and position so that our vectors point at the start// of each tokenint line = _line;int position = _position;// special case exact tokens to avoid needing to worry about backtrackingif (exact) {
if (_code.StartsWith(recogniseClass)) {
if (!discard) tokens.Add(new Token() {
Type = tokenType, Value = recogniseClass, Line = line, Position = position - offset}); advance(recogniseClass.Length);return true;}return false;}// first match - denotes the token type usuallyif (!recogniseClass.Contains(CurrentCharacter))return false;string tokenValue = CurrentCharacter;advance();// follow match while we haven't exceeded maxLen and there are still characters// in the code streamwhile ((matchClass ?? "").Contains(CurrentCharacter) && tokenValue.Length <= maxLen && CurrentCharacter != "") {
tokenValue += CurrentCharacter;advance();}// ensure that any incompatible characters are not next to the token// eg 42fred is invalid, and neither recognized as a number nor an identifier.// _letters would be the notNextClassif (notNextClass != null && notNextClass.Contains(CurrentCharacter))error("Unrecognised character: " + CurrentCharacter, _line, _position);// only add tokens to the stack that aren't marked as discard - dont want// things like open and close quotes/commentsif (!discard) {
Token token = new Token() {
Type = tokenType, Value = tokenValue, Line = line, Position = position - offset };tokens.Add(token);}return true;}/// <summary>/// Tokenise the input code /// </summary>/// <returns>List of Tokens</returns>public List<Token> scan() {
while (CurrentCharacter != "") {
// match whitespacematch(_whitespace, _whitespace, TokenType.None, discard: true);// match integersmatch(_numbers, _numbers, TokenType.Integer, notNextClass:_letters);// match identifiers and keywordsif (match(_letters, _identifier, TokenType.Identifier)) {
Token match = tokens.Last();if (_keywords.Contains(match.Value))match.Type = _keywordTokenTypeMap[match.Value];}// match string similarly to comments without allowing newlines// this token doesn't get discarded thoughif (match("\"", null, TokenType.String, discard:true)) {
string value = "";int position = _position;while (!match("\"", null, TokenType.String, discard:true)) {
// not allowed newlines in stringsif (CurrentCharacter == "\n")error("End-of-line while scanning string literal. Closing string character not found before end-of-line", _line, _position);// end of file reached before finding end of stringif (CurrentCharacter == "")error("End-of-file while scanning string literal. Closing string character not found", _line, _position);value += CurrentCharacter;// deal with escape sequences - we only accept newline (\n)if (value.Length >= 2) {
string lastCharacters = value.Substring(value.Length - 2, 2);if (lastCharacters[0] == '\\') {
if (lastCharacters[1] != 'n') {
error("Unknown escape sequence. ", _line, position);}value = value.Substring(0, value.Length - 2).ToString() + "\n";}}advance();}tokens.Add(new Token() {
Type = TokenType.String, Value = value, Line = _line, Position = position - 1});}// match string literalsif (match("'", null, TokenType.Integer, discard:true)) {
int value;int position = _position;value = CurrentCharacter.ToCharArray()[0];advance();// deal with empty literals ''if (value == '\'')error("Empty character literal", _line, _position);// deal with escaped characters, only need to worry about \n and \\// throw werror on any otherif (value == '\\') {
if (CurrentCharacter == "n") {
value = '\n';} else if (CurrentCharacter == "\\") {
value = '\\';} else {
error("Unknown escape sequence. ", _line, _position - 1);}advance();}// if we haven't hit a closing ' here, there are two many characters// in the literalif (!match("'", null, TokenType.Integer, discard: true))error("Multi-character constant", _line, _position);tokens.Add(new Rosetta.Token() {
Type = TokenType.Integer, Value = value.ToString(), Line = _line, Position = position - 1 });}// match comments by checking for starting token, then advancing // until closing token is matchedif (match("/*", null, TokenType.None, exact: true, discard: true)) {
while (!match("*/", null, TokenType.None, exact: true, discard: true)) {
// reached the end of the file without closing comment!if (CurrentCharacter == "")error("End-of-file in comment. Closing comment characters not found.", _line, _position);advance();}continue;}// match complex operatorsmatch("<=", null, TokenType.Op_lessequal, exact: true);match(">=", null, TokenType.Op_greaterequal, exact: true);match("==", null, TokenType.Op_equal, exact: true);match("!=", null, TokenType.Op_notequal, exact: true);match("&&", null, TokenType.Op_and, exact: true);match("||", null, TokenType.Op_or, exact: true);// match simple operatorsif (match(_operators, null, TokenType.None, maxLen:1)) {
Token match = tokens.Last();match.Type = _operatorTokenTypeMap[match.Value];}// brackets, braces and separatorsmatch("(", null, TokenType.LeftParen, exact: true);match(")", null, TokenType.RightParen, exact: true);match("{", null, TokenType.LeftBrace, exact: true);match("}", null, TokenType.RightBrace, exact: true);match(";", null, TokenType.Semicolon, exact: true);match(",", null, TokenType.Comma, exact: true);}// end of file tokentokens.Add(new Rosetta.Token() {
Type = TokenType.End_of_input, Line = _line, Position = _position });return tokens;}static void Main (string[] args) {
StreamReader inputFile;// if we passed in a filename, read code from that, else// read code from stdinif (args.Length > 0) {
string path = args[0];try {
inputFile = new StreamReader(path);} catch (IOException) {
inputFile = new StreamReader(Console.OpenStandardInput(8192));}} else {
inputFile = new StreamReader(Console.OpenStandardInput(8192));}string code = inputFile.ReadToEnd();// strip windows line endings outcode = code.Replace("\r", "");LexicalScanner scanner = new LexicalScanner(code);List<Token> tokens = scanner.scan();foreach(Token token in tokens) {
Console.WriteLine(token.ToString());} }}
}
输出(测试用例三)
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input