i3/generate-command-parser.pl

#!/usr/bin/env perl
# vim:ts=4:sw=4:expandtab
#
# i3 - an improved dynamic tiling window manager
# © 2009-2012 Michael Stapelberg and contributors (see also: LICENSE)
#
# generate-command-parser.pl: script to generate parts of the command parser
# from its specification file parser-specs/commands.spec.
#
# Requires only perl >= 5.10, no modules.

use strict;
use warnings;
use Data::Dumper;
use v5.10;

# reads in a whole file
sub slurp {
    open my $fh, '<', shift;
    local $/;
    <$fh>;
}

# Stores the different states.
my %states;

# XXX: don’t hardcode input and output
my $input = '../parser-specs/commands.spec';
my @raw_lines = split("\n", slurp($input));
my @lines;

# XXX: In the future, we might switch to a different way of parsing this. The
# parser is in many ways not good — one obvious one is that it is hand-crafted
# without a good reason, also it preprocesses lines and forgets about line
# numbers. Luckily, this is just an implementation detail and the specification
# for the i3 command parser is in-tree (not user input).
# -- michael, 2012-01-12

# First step of preprocessing:
# Join token definitions which are spread over multiple lines.
for my $line (@raw_lines) {
    next if $line =~ /^\s*#/ || $line =~ /^\s*$/;

    if ($line =~ /^\s+->/) {
        # This is a continued token definition, append this line to the
        # previous one.
        $lines[$#lines] = $lines[$#lines] . $line;
    } else {
        push @lines, $line;
        next;
    }
}

# First step: We build up the data structure containing all states and their
# token rules.

my $current_state;

for my $line (@lines) {
    if (my ($state) = ($line =~ /^state ([A-Z_]+):$/)) {
        #say "got a new state: $state";
        $current_state = $state;
    } else {
        # Must be a token definition:
        # [identifier = ] <tokens> -> <action>
        #say "token definition: $line";

        my ($identifier, $tokens, $action) =
            ($line =~ /
                ^\s*                  # skip leading whitespace
                ([a-z_]+ \s* = \s*|)  # optional identifier
                (.*?) -> \s*          # token 
                (.*)                  # optional action
             /x);

        # Cleanup the identifier (if any).
        $identifier =~ s/^\s*(\S+)\s*=\s*$/$1/g;

        # Cleanup the tokens (remove whitespace).
        $tokens =~ s/\s*//g;

        # The default action is to stay in the current state.
        $action = $current_state if length($action) == 0;

        #say "identifier = *$identifier*, token = *$tokens*, action = *$action*";
        for my $token (split(',', $tokens)) {
            my $store_token = {
                token => $token,
                identifier => $identifier,
                next_state => $action,
            };
            if (exists $states{$current_state}) {
                push $states{$current_state}, $store_token;
            } else {
                $states{$current_state} = [ $store_token ];
            }
        }
    }
}

# Second step: Generate the enum values for all states.

# It is important to keep the order the same, so we store the keys once.
my @keys = keys %states;

open(my $enumfh, '>', 'GENERATED_enums.h');

# XXX: we might want to have a way to do this without a trailing comma, but gcc
# seems to eat it.
say $enumfh 'typedef enum {';
my $cnt = 0;
for my $state (@keys, '__CALL') {
    say $enumfh "    $state = $cnt,";
    $cnt++;
}
say $enumfh '} cmdp_state;';
close($enumfh);

# Third step: Generate the call function.
open(my $callfh, '>', 'GENERATED_call.h');
say $callfh 'static char *GENERATED_call(const int call_identifier) {';
say $callfh '    char *output = NULL;';
say $callfh '    switch (call_identifier) {';
my $call_id = 0;
for my $state (@keys) {
    my $tokens = $states{$state};
    for my $token (@$tokens) {
        next unless $token->{next_state} =~ /^call /;
        my ($cmd) = ($token->{next_state} =~ /^call (.*)/);
        my ($next_state) = ($cmd =~ /; ([A-Z_]+)$/);
        $cmd =~ s/; ([A-Z_]+)$//;
        # Go back to the INITIAL state unless told otherwise.
        $next_state ||= 'INITIAL';
        my $fmt = $cmd;
        # Replace the references to identified literals (like $workspace) with
        # calls to get_string().
        $cmd =~ s/\$([a-z_]+)/get_string("$1")/g;
        # Used only for debugging/testing.
        $fmt =~ s/\$([a-z_]+)/%s/g;
        $fmt =~ s/"([a-z0-9_]+)"/%s/g;

        say $callfh "         case $call_id:";
        say $callfh '#ifndef TEST_PARSER';
        my $real_cmd = $cmd;
        if ($real_cmd =~ /\(\)/) {
            $real_cmd =~ s/\(/(&current_match/;
        } else {
            $real_cmd =~ s/\(/(&current_match, /;
        }
        say $callfh "             output = $real_cmd;";
        say $callfh '#else';
        # debug
        $cmd =~ s/[^(]+\(//;
        $cmd =~ s/\)$//;
        $cmd = ", $cmd" if length($cmd) > 0;
        say $callfh qq|           printf("$fmt\\n"$cmd);|;
        say $callfh '#endif';
        say $callfh "             state = $next_state;";
        say $callfh "             break;";
        $token->{next_state} = "call $call_id";
        $call_id++;
    }
}
say $callfh '        default:';
say $callfh '            printf("BUG in the parser. state = %d\n", call_identifier);';
say $callfh '    }';
say $callfh '    return output;';
say $callfh '}';
close($callfh);

# Fourth step: Generate the token datastructures.

open(my $tokfh, '>', 'GENERATED_tokens.h');

for my $state (@keys) {
    my $tokens = $states{$state};
    say $tokfh 'cmdp_token tokens_' . $state . '[' . scalar @$tokens . '] = {';
    for my $token (@$tokens) {
        my $call_identifier = 0;
        my $token_name = $token->{token};
        if ($token_name =~ /^'/) {
            # To make the C code simpler, we leave out the trailing single
            # quote of the literal. We can do strdup(literal + 1); then :).
            $token_name =~ s/'$//;
        }
        my $next_state = $token->{next_state};
        if ($next_state =~ /^call /) {
            ($call_identifier) = ($next_state =~ /^call ([0-9]+)$/);
            $next_state = '__CALL';
        }
        my $identifier = $token->{identifier};
        say $tokfh qq|    { "$token_name", "$identifier", $next_state, { $call_identifier } }, |;
    }
    say $tokfh '};';
}

say $tokfh 'cmdp_token_ptr tokens[' . scalar @keys . '] = {';
for my $state (@keys) {
    my $tokens = $states{$state};
    say $tokfh '    { tokens_' . $state . ', ' . scalar @$tokens . ' },';
}
say $tokfh '};';

close($tokfh);
-												Implement a new parser for commands. (+test)

On the rationale of using a custom parser instead of a lex/yacc one, see this
quote from src/commands_parser.c:
     We use a hand-written parser instead of lex/yacc because our commands are
     easy for humans, not for computers. Thus, it’s quite hard to specify a
     context-free grammar for the commands. A PEG grammar would be easier, but
     there’s downsides to every PEG parser generator I have come accross so far.

     This parser is basically a state machine which looks for literals or strings
     and can push either on a stack. After identifying a literal or string, it
     will either transition to the current state, to a different state, or call a
     function (like cmd_move()).

     Special care has been taken that error messages are useful and the code is
     well testable (when compiled with -DTEST_PARSER it will output to stdout
     instead of actually calling any function).

During the migration phase (I plan to completely switch to this parser before
4.2 will be released), the new parser will parse every command you send to
i3 and save the resulting call stack. Then, the old parser will parse your
input and actually execute the commands. Afterwards, both call stacks will be
compared and any differences will be logged.

The new parser works with 100% of the test suite and produces identical call
stacks.

											
										
										
											2012-01-14 14:53:29 -05:00
+								#!/usr/bin/env perl
 								# vim:ts=4:sw=4:expandtab
 								#
 								# i3 - an improved dynamic tiling window manager
 								# © 2009-2012 Michael Stapelberg and contributors (see also: LICENSE)
 								#
 								# generate-command-parser.pl: script to generate parts of the command parser
 								# from its specification file parser-specs/commands.spec.
 								#
 								# Requires only perl >= 5.10, no modules.
 								use strict;
 								use warnings;
 								use Data::Dumper;
 								use v5.10;
 								# reads in a whole file
 								sub slurp {
 								    open my $fh, '<', shift;
 								    local $/;
 								    <$fh>;
 								}
 								# Stores the different states.
 								my %states;
 								# XXX: don’t hardcode input and output
 								my $input = '../parser-specs/commands.spec';
 								my @raw_lines = split("\n", slurp($input));
 								my @lines;
 								# XXX: In the future, we might switch to a different way of parsing this. The
 								# parser is in many ways not good — one obvious one is that it is hand-crafted
 								# without a good reason, also it preprocesses lines and forgets about line
 								# numbers. Luckily, this is just an implementation detail and the specification
 								# for the i3 command parser is in-tree (not user input).
 								# -- michael, 2012-01-12
 								# First step of preprocessing:
 								# Join token definitions which are spread over multiple lines.
 								for my $line (@raw_lines) {
 								    next if $line =~ /^\s*#/ || $line =~ /^\s*$/;
 								    if ($line =~ /^\s+->/) {
 								        # This is a continued token definition, append this line to the
 								        # previous one.
 								        $lines[$#lines] = $lines[$#lines] . $line;
 								    } else {
 								        push @lines, $line;
 								        next;
 								    }
 								}
 								# First step: We build up the data structure containing all states and their
 								# token rules.
 								my $current_state;
 								for my $line (@lines) {
 								    if (my ($state) = ($line =~ /^state ([A-Z_]+):$/)) {
 								        #say "got a new state: $state";
 								        $current_state = $state;
 								    } else {
 								        # Must be a token definition:
 								        # [identifier = ] <tokens> -> <action>
 								        #say "token definition: $line";
 								        my ($identifier, $tokens, $action) =
 								            ($line =~ /
 								                ^\s*                  # skip leading whitespace
 								                ([a-z_]+ \s* = \s*|)  # optional identifier
 								                (.*?) -> \s*          # token
 								                (.*)                  # optional action
 								             /x);
 								        # Cleanup the identifier (if any).
 								        $identifier =~ s/^\s*(\S+)\s*=\s*$/$1/g;
 								        # Cleanup the tokens (remove whitespace).
 								        $tokens =~ s/\s*//g;
 								        # The default action is to stay in the current state.
 								        $action = $current_state if length($action) == 0;
 								        #say "identifier = *$identifier*, token = *$tokens*, action = *$action*";
 								        for my $token (split(',', $tokens)) {
 								            my $store_token = {
 								                token => $token,
 								                identifier => $identifier,
 								                next_state => $action,
 								            };
 								            if (exists $states{$current_state}) {
 								                push $states{$current_state}, $store_token;
 								            } else {
 								                $states{$current_state} = [ $store_token ];
 								            }
 								        }
 								    }
 								}
 								# Second step: Generate the enum values for all states.
 								# It is important to keep the order the same, so we store the keys once.
 								my @keys = keys %states;
 								open(my $enumfh, '>', 'GENERATED_enums.h');
 								# XXX: we might want to have a way to do this without a trailing comma, but gcc
 								# seems to eat it.
 								say $enumfh 'typedef enum {';
 								my $cnt = 0;
 								for my $state (@keys, '__CALL') {
 								    say $enumfh "    $state = $cnt,";
 								    $cnt++;
 								}
 								say $enumfh '} cmdp_state;';
 								close($enumfh);
 								# Third step: Generate the call function.
 								open(my $callfh, '>', 'GENERATED_call.h');
 								say $callfh 'static char *GENERATED_call(const int call_identifier) {';
 								say $callfh '    char *output = NULL;';
 								say $callfh '    switch (call_identifier) {';
 								my $call_id = 0;
 								for my $state (@keys) {
 								    my $tokens = $states{$state};
 								    for my $token (@$tokens) {
 								        next unless $token->{next_state} =~ /^call /;
 								        my ($cmd) = ($token->{next_state} =~ /^call (.*)/);
 								        my ($next_state) = ($cmd =~ /; ([A-Z_]+)$/);
 								        $cmd =~ s/; ([A-Z_]+)$//;
 								        # Go back to the INITIAL state unless told otherwise.
 								        $next_state ||= 'INITIAL';
 								        my $fmt = $cmd;
 								        # Replace the references to identified literals (like $workspace) with
 								        # calls to get_string().
 								        $cmd =~ s/\$([a-z_]+)/get_string("$1")/g;
 								        # Used only for debugging/testing.
 								        $fmt =~ s/\$([a-z_]+)/%s/g;
 								        $fmt =~ s/"([a-z0-9_]+)"/%s/g;
 								        say $callfh "         case $call_id:";
 								        say $callfh '#ifndef TEST_PARSER';
 								        my $real_cmd = $cmd;
 								        if ($real_cmd =~ /\(\)/) {
 								            $real_cmd =~ s/\(/(&current_match/;
 								        } else {
 								            $real_cmd =~ s/\(/(&current_match, /;
 								        }
 								        say $callfh "             output = $real_cmd;";
 								        say $callfh '#else';
 								        # debug
 								        $cmd =~ s/[^(]+\(//;
 								        $cmd =~ s/\)$//;
 								        $cmd = ", $cmd" if length($cmd) > 0;
 								        say $callfh qq|           printf("$fmt\\n"$cmd);|;
 								        say $callfh '#endif';
 								        say $callfh "             state = $next_state;";
 								        say $callfh "             break;";
 								        $token->{next_state} = "call $call_id";
 								        $call_id++;
 								    }
 								}
 								say $callfh '        default:';
 								say $callfh '            printf("BUG in the parser. state = %d\n", call_identifier);';
 								say $callfh '    }';
 								say $callfh '    return output;';
 								say $callfh '}';
 								close($callfh);
 								# Fourth step: Generate the token datastructures.
 								open(my $tokfh, '>', 'GENERATED_tokens.h');
 								for my $state (@keys) {
 								    my $tokens = $states{$state};
 								    say $tokfh 'cmdp_token tokens_' . $state . '[' . scalar @$tokens . '] = {';
 								    for my $token (@$tokens) {
 								        my $call_identifier = 0;
 								        my $token_name = $token->{token};
 								        if ($token_name =~ /^'/) {
 								            # To make the C code simpler, we leave out the trailing single
 								            # quote of the literal. We can do strdup(literal + 1); then :).
 								            $token_name =~ s/'$//;
 								        }
 								        my $next_state = $token->{next_state};
 								        if ($next_state =~ /^call /) {
 								            ($call_identifier) = ($next_state =~ /^call ([0-9]+)$/);
 								            $next_state = '__CALL';
 								        }
 								        my $identifier = $token->{identifier};
 								        say $tokfh qq|    { "$token_name", "$identifier", $next_state, { $call_identifier } }, |;
 								    }
 								    say $tokfh '};';
 								}
 								say $tokfh 'cmdp_token_ptr tokens[' . scalar @keys . '] = {';
 								for my $state (@keys) {
 								    my $tokens = $states{$state};
 								    say $tokfh '    { tokens_' . $state . ', ' . scalar @$tokens . ' },';
 								}
 								say $tokfh '};';
 								close($tokfh);