Implement a new parser for commands. (+test)
On the rationale of using a custom parser instead of a lex/yacc one, see this
quote from src/commands_parser.c:
We use a hand-written parser instead of lex/yacc because our commands are
easy for humans, not for computers. Thus, it’s quite hard to specify a
context-free grammar for the commands. A PEG grammar would be easier, but
there are downsides to every PEG parser generator I have come across so far.
This parser is basically a state machine which looks for literals or strings
and can push either on a stack. After identifying a literal or string, it
will either transition to the current state, to a different state, or call a
function (like cmd_move()).
Special care has been taken that error messages are useful and the code is
well testable (when compiled with -DTEST_PARSER it will output to stdout
instead of actually calling any function).
During the migration phase (I plan to completely switch to this parser before
4.2 will be released), the new parser will parse every command you send to
i3 and save the resulting call stack. Then, the old parser will parse your
input and actually execute the commands. Afterwards, both call stacks will be
compared and any differences will be logged.
The new parser works with 100% of the test suite and produces identical call
stacks.
2012-01-14 14:53:29 -05:00
|
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
# vim:ts=4:sw=4:expandtab
|
|
|
|
|
#
|
|
|
|
|
# i3 - an improved dynamic tiling window manager
|
|
|
|
|
# © 2009-2012 Michael Stapelberg and contributors (see also: LICENSE)
|
|
|
|
|
#
|
|
|
|
|
# generate-command-parser.pl: script to generate parts of the command parser
|
|
|
|
|
# from its specification file parser-specs/commands.spec.
|
|
|
|
|
#
|
|
|
|
|
# Requires only perl >= 5.10, no modules.
|
|
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
|
use warnings;
|
|
|
|
|
use Data::Dumper;
|
|
|
|
|
use v5.10;
|
|
|
|
|
|
|
|
|
|
# reads in a whole file
|
|
|
|
|
# Reads a whole file and returns its contents as a single string.
# Dies with a useful message (including $!) if the file cannot be opened.
sub slurp {
    my ($filename) = @_;
    open(my $fh, '<', $filename)
        or die "Cannot open $filename: $!";
    # Undefine the input record separator locally so <$fh> reads everything.
    local $/;
    <$fh>;
}
|
|
|
|
|
|
|
|
|
|
# Stores the different states: state name -> arrayref of token rules.
my %states;

# XXX: don’t hardcode input and output
my $input = '../parser-specs/commands.spec';
my @raw_lines = split("\n", slurp($input));
my @lines;

# XXX: In the future, we might switch to a different way of parsing this. The
# parser is in many ways not good — one obvious one is that it is hand-crafted
# without a good reason, also it preprocesses lines and forgets about line
# numbers. Luckily, this is just an implementation detail and the specification
# for the i3 command parser is in-tree (not user input).
# -- michael, 2012-01-12

# First step of preprocessing:
# Join token definitions which are spread over multiple lines.
for my $line (@raw_lines) {
    # Skip comment lines and blank lines in the spec.
    next if $line =~ /^\s*#/ || $line =~ /^\s*$/;

    if ($line =~ /^\s+->/) {
        # This is a continued token definition, append this line to the
        # previous one.
        $lines[-1] .= $line;
    } else {
        push @lines, $line;
    }
}
|
|
|
|
|
|
|
|
|
|
# First step: We build up the data structure containing all states and their
# token rules.

my $current_state;

for my $line (@lines) {
    if (my ($state) = ($line =~ /^state ([A-Z_]+):$/)) {
        # A new state header: all following token rules belong to this state.
        #say "got a new state: $state";
        $current_state = $state;
    } else {
        # Must be a token definition:
        # [identifier = ] <tokens> -> <action>
        #say "token definition: $line";

        # A token rule before any state header would silently produce
        # garbage, so fail loudly instead.
        die "Token definition before any state in $input: $line\n"
            unless defined $current_state;

        my ($identifier, $tokens, $action) =
            ($line =~ /
                ^\s*                  # skip leading whitespace
                ([a-z_]+ \s* = \s*|)  # optional identifier
                (.*?) -> \s*          # token
                (.*)                  # optional action
            /x);

        # Cleanup the identifier (if any): strip the surrounding "name ="
        # syntax, keeping just the name.
        $identifier =~ s/^\s*(\S+)\s*=\s*$/$1/g;

        # Cleanup the tokens (remove whitespace).
        $tokens =~ s/\s*//g;

        # The default action is to stay in the current state.
        $action = $current_state if length($action) == 0;

        #say "identifier = *$identifier*, token = *$tokens*, action = *$action*";

        # A rule can list several comma-separated tokens which all share the
        # same identifier and action.
        for my $token (split(',', $tokens)) {
            # push autovivifies the arrayref, so the first token of a state
            # needs no special case.
            push @{$states{$current_state}}, {
                token      => $token,
                identifier => $identifier,
                next_state => $action,
            };
        }
    }
}
|
|
|
|
|
|
|
|
|
|
# Second step: Generate the enum values for all states.

# It is important to keep the order the same, so we store the keys once.
my @keys = keys %states;

open(my $enumfh, '>', 'GENERATED_enums.h')
    or die "Cannot open GENERATED_enums.h for writing: $!";

# XXX: we might want to have a way to do this without a trailing comma, but gcc
# seems to eat it.
say $enumfh 'typedef enum {';
my $cnt = 0;
# __CALL is a synthetic state used for tokens which trigger a function call.
for my $state (@keys, '__CALL') {
    say $enumfh "    $state = $cnt,";
    $cnt++;
}
say $enumfh '} cmdp_state;';
# Buffered write errors only surface at close, so check it on write handles.
close($enumfh)
    or die "Cannot write GENERATED_enums.h: $!";
|
|
|
|
|
|
|
|
|
|
# Third step: Generate the call function, which dispatches on a numeric call
# identifier and invokes the corresponding cmd_*() function (or, when
# compiled with -DTEST_PARSER, prints the call instead of executing it).
open(my $callfh, '>', 'GENERATED_call.h')
    or die "Cannot open GENERATED_call.h for writing: $!";
say $callfh 'static void GENERATED_call(const int call_identifier, struct CommandResult *result) {';
say $callfh '    switch (call_identifier) {';
my $call_id = 0;
for my $state (@keys) {
    my $tokens = $states{$state};
    for my $token (@$tokens) {
        # Only tokens whose action is a function call get a case here.
        next unless $token->{next_state} =~ /^call /;
        my ($cmd) = ($token->{next_state} =~ /^call (.*)/);
        # The call may specify a follow-up state after a semicolon,
        # e.g. "call cmd_foo(); MODE".
        my ($next_state) = ($cmd =~ /; ([A-Z_]+)$/);
        $cmd =~ s/; ([A-Z_]+)$//;
        # Go back to the INITIAL state unless told otherwise.
        $next_state ||= 'INITIAL';
        my $fmt = $cmd;
        # Replace the references to identified literals (like $workspace) with
        # calls to get_string().
        $cmd =~ s/\$([a-z_]+)/get_string("$1")/g;
        # Used only for debugging/testing: build a printf format string from
        # the call, with every argument replaced by %s.
        $fmt =~ s/\$([a-z_]+)/%s/g;
        $fmt =~ s/"([a-z0-9_]+)"/%s/g;

        say $callfh "        case $call_id:";
        say $callfh '#ifndef TEST_PARSER';
        my $real_cmd = $cmd;
        # Every cmd_*() function receives the current match and the result
        # struct as its leading arguments. NOTE(review): the previous text
        # read "(¤t_match" — a mis-decoded "&current_match" (the
        # "&curren" prefix was swallowed as an HTML entity); restored here.
        if ($real_cmd =~ /\(\)/) {
            # No further arguments: no trailing comma needed.
            $real_cmd =~ s/\(/(&current_match, result/;
        } else {
            $real_cmd =~ s/\(/(&current_match, result, /;
        }
        say $callfh "            $real_cmd;";
        say $callfh '#else';
        # debug: strip the function name and parentheses, leaving only the
        # printf arguments (if any).
        $cmd =~ s/[^(]+\(//;
        $cmd =~ s/\)$//;
        $cmd = ", $cmd" if length($cmd) > 0;
        say $callfh qq|            printf("$fmt\\n"$cmd);|;
        say $callfh '#endif';
        say $callfh "            state = $next_state;";
        say $callfh "            break;";
        # Rewrite the action so the fourth step can map it to __CALL plus
        # this numeric identifier.
        $token->{next_state} = "call $call_id";
        $call_id++;
    }
}
say $callfh '        default:';
say $callfh '            printf("BUG in the parser. state = %d\n", call_identifier);';
say $callfh '    }';
say $callfh '}';
close($callfh)
    or die "Cannot write GENERATED_call.h: $!";
|
|
|
|
|
|
|
|
|
|
# Fourth step: Generate the token datastructures: one cmdp_token array per
# state, plus the global table mapping each state to its array and length.

open(my $tokfh, '>', 'GENERATED_tokens.h')
    or die "Cannot open GENERATED_tokens.h for writing: $!";

for my $state (@keys) {
    my $tokens = $states{$state};
    say $tokfh 'cmdp_token tokens_' . $state . '[' . scalar @$tokens . '] = {';
    for my $token (@$tokens) {
        # Non-call tokens carry a call identifier of 0.
        my $call_identifier = 0;
        my $token_name = $token->{token};
        if ($token_name =~ /^'/) {
            # To make the C code simpler, we leave out the trailing single
            # quote of the literal. We can do strdup(literal + 1); then :).
            $token_name =~ s/'$//;
        }
        my $next_state = $token->{next_state};
        if ($next_state =~ /^call /) {
            # Call actions were rewritten to "call <id>" in the third step;
            # store the id and use the synthetic __CALL state.
            ($call_identifier) = ($next_state =~ /^call ([0-9]+)$/);
            $next_state = '__CALL';
        }
        my $identifier = $token->{identifier};
        say $tokfh qq|    { "$token_name", "$identifier", $next_state, { $call_identifier } }, |;
    }
    say $tokfh '};';
}

say $tokfh 'cmdp_token_ptr tokens[' . scalar @keys . '] = {';
for my $state (@keys) {
    my $tokens = $states{$state};
    say $tokfh '    { tokens_' . $state . ', ' . scalar @$tokens . ' },';
}
say $tokfh '};';

# Buffered write errors only surface at close, so check it on write handles.
close($tokfh)
    or die "Cannot write GENERATED_tokens.h: $!";
|