From 75d41d11372b7bf3a7bef4afcfe86441e41f6ec3 Mon Sep 17 00:00:00 2001 From: micbou Date: Fri, 5 Aug 2016 20:08:53 +0200 Subject: [PATCH] Improve extraction of syntax keywords Use the following strategy to extract identifiers from syntax highlighting: - ignore match and region: they mostly contain arguments, syntax groups and regular expressions; - ignore "nextgroup=" if first word and subsequent arguments "skipempty", "skipwhite", and "skipnl"; - ignore "contained" argument if first word; - add remaining words to the list of identifiers. Fix a bug where the word "match" was extracted while not being a keyword of the syntax language. --- python/ycm/syntax_parse.py | 95 +++++++++++++++------------ python/ycm/tests/syntax_parse_test.py | 37 +++++++---- 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/python/ycm/syntax_parse.py b/python/ycm/syntax_parse.py index e6ea1d52..25b5071c 100644 --- a/python/ycm/syntax_parse.py +++ b/python/ycm/syntax_parse.py @@ -38,43 +38,28 @@ SYNTAX_GROUP_REGEX = re.compile( $""", re.VERBOSE ) -KEYWORD_REGEX = re.compile( r'^[\w,]+$' ) +KEYWORD_REGEX = re.compile( r'^(\w+),?$' ) SYNTAX_ARGUMENT_REGEX = re.compile( r"^\w+=.*$" ) -SYNTAX_ARGUMENTS = set([ - 'cchar', - 'conceal', - 'contained', - 'containedin', - 'nextgroup', - 'skipempty', - 'skipnl', +SYNTAX_REGION_ARGUMENT_REGEX = re.compile( + r"^(?:matchgroup|start)=.*$") + +# See ":h syn-nextgroup". +SYNTAX_NEXTGROUP_ARGUMENTS = set([ 'skipwhite', - 'transparent', - 'concealends', - 'contains', - 'display', - 'extend', - 'fold', - 'oneline', - 'keepend', - 'excludenl', + 'skipnl', + 'skipempty' ]) -# We want to parse lines starting with these args -ALLOWED_SYNTAX_ARGUMENTS = set([ - 'contained', -]) - -# These are the parent groups from which we want to extract keywords +# These are the parent groups from which we want to extract keywords. ROOT_GROUPS = set([ 'Statement', 'Boolean', 'Include', 'Type', - 'Identifier', + 'Identifier' ]) @@ -149,7 +134,7 @@ def _CreateInitialGroupMap(): type_group = SyntaxGroup( 'Type' ) identifier_group = SyntaxGroup( 'Identifier' ) - # See `:h group-name` for details on how the initial group hierarchy is built + # See ":h group-name" for details on how the initial group hierarchy is built. group_name_to_group = { 'Statement': statement_group, 'Type': type_group, @@ -202,23 +187,49 @@ def _GetAllDescendentats( root_group ): return descendants +def _ExtractKeywordsFromLine( line ): + if line.startswith( 'links to ' ): + return [] + + # Ignore "syntax match" lines (see ":h syn-match"). + if line.startswith( 'match ' ): + return [] + + words = line.split() + if not words: + return [] + + # Ignore "syntax region" lines (see ":h syn-region"). They always start + # with matchgroup= or start= in the syntax list. + if SYNTAX_REGION_ARGUMENT_REGEX.match( words[ 0 ] ): + return [] + + # Ignore "nextgroup=" argument in first position and the arguments + # "skipwhite", "skipnl", and "skipempty" that immediately come after. + nextgroup_at_start = False + if words[ 0 ].startswith( 'nextgroup=' ): + nextgroup_at_start = True + words = words[ 1: ] + + # Ignore "contained" argument in first position. + if words[ 0 ] == 'contained': + words = words[ 1: ] + + keywords = [] + for word in words: + if nextgroup_at_start and word in SYNTAX_NEXTGROUP_ARGUMENTS: + continue + + nextgroup_at_start = False + + keyword_matched = KEYWORD_REGEX.match( word ) + if keyword_matched: + keywords.append( keyword_matched.group( 1 ) ) + return keywords + + def _ExtractKeywordsFromGroup( group ): keywords = [] for line in group.lines: - if line.startswith( 'links to ' ): - continue - - words = line.split() - if not words or ( words[ 0 ] in SYNTAX_ARGUMENTS and - words[ 0 ] not in ALLOWED_SYNTAX_ARGUMENTS ): - continue - - for word in words: - if ( word not in SYNTAX_ARGUMENTS and - not SYNTAX_ARGUMENT_REGEX.match( word ) and - KEYWORD_REGEX.match( word ) ): - - if word.endswith( ',' ): - word = word[ :-1 ] - keywords.append( word ) + keywords.extend( _ExtractKeywordsFromLine( line ) ) return keywords diff --git a/python/ycm/tests/syntax_parse_test.py b/python/ycm/tests/syntax_parse_test.py index c44ca0fa..e13e6bfe 100644 --- a/python/ycm/tests/syntax_parse_test.py +++ b/python/ycm/tests/syntax_parse_test.py @@ -43,10 +43,10 @@ def KeywordsFromSyntaxListOutput_PythonSyntax_test(): 'bytearray', 'IndexError', 'all', 'help', 'vars', 'SyntaxError', 'global', 'elif', 'unicode', 'sorted', 'memoryview', 'isinstance', 'except', 'nonlocal', 'NameError', 'finally', 'BytesWarning', 'dict', 'IOError', - 'pass', 'oct', 'match', 'bin', 'SystemExit', 'return', 'StandardError', - 'format', 'TabError', 'break', 'next', 'not', 'UnicodeDecodeError', - 'False', 'RuntimeWarning', 'list', 'iter', 'try', 'reload', 'Warning', - 'round', 'dir', 'cmp', 'set', 'bytes', 'UnicodeTranslateError', 'intern', + 'pass', 'oct', 'bin', 'SystemExit', 'return', 'StandardError', 'format', + 'TabError', 'break', 'next', 'not', 'UnicodeDecodeError', 'False', + 'RuntimeWarning', 'list', 'iter', 'try', 'reload', 'Warning', 'round', + 'dir', 'cmp', 'set', 'bytes', 'UnicodeTranslateError', 'intern', 'issubclass', 'yield', 'Ellipsis', 'hash', 'locals', 'BufferError', 'slice', 'for', 'FloatingPointError', 'sum', 'VMSError', 'getattr', 'abs', 'print', 'import', 'True', 'FutureWarning', 'ImportWarning', 'None', @@ -77,8 +77,8 @@ def KeywordsFromSyntaxListOutput_PythonSyntax_test(): def KeywordsFromSyntaxListOutput_CppSyntax_test(): expected_keywords = ( 'int_fast32_t', 'FILE', 'size_t', 'bitor', 'typedef', 'const', 'struct', - 'uint8_t', 'fpos_t', 'thread_local', 'unsigned', 'uint_least16_t', 'match', - 'do', 'intptr_t', 'uint_least64_t', 'return', 'auto', 'void', '_Complex', + 'uint8_t', 'fpos_t', 'thread_local', 'unsigned', 'uint_least16_t', 'do', + 'intptr_t', 'uint_least64_t', 'return', 'auto', 'void', '_Complex', 'break', '_Alignof', 'not', 'using', '_Static_assert', '_Thread_local', 'public', 'uint_fast16_t', 'this', 'continue', 'char32_t', 'int16_t', 'intmax_t', 'static', 'clock_t', 'sizeof', 'int_fast64_t', 'mbstate_t', @@ -108,7 +108,7 @@ def KeywordsFromSyntaxListOutput_JavaSyntax_test(): expected_keywords = ( 'code', 'text', 'cols', 'datetime', 'disabled', 'shape', 'codetype', 'alt', 'compact', 'style', 'valuetype', 'short', 'finally', 'continue', 'extends', - 'valign', 'match', 'bordercolor', 'do', 'return', 'rel', 'rules', 'void', + 'valign', 'bordercolor', 'do', 'return', 'rel', 'rules', 'void', 'nohref', 'abbr', 'background', 'scrolling', 'instanceof', 'name', 'summary', 'try', 'default', 'noshade', 'coords', 'dir', 'frame', 'usemap', 'ismap', 'static', 'hspace', 'vlink', 'for', 'selected', 'rev', 'vspace', @@ -273,25 +273,25 @@ def ExtractKeywordsFromGroup_KeywordStarts_test(): assert_that( syntax_parse._ExtractKeywordsFromGroup( syntax_parse.SyntaxGroup( '', [ 'foo bar', - 'transparent boo baa', + 'contained boo baa', 'zoo goo', ] ) ), - contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) ) + contains_inanyorder( 'foo', 'bar', 'boo', 'baa', 'zoo', 'goo' ) ) def ExtractKeywordsFromGroup_KeywordMiddle_test(): assert_that( syntax_parse._ExtractKeywordsFromGroup( syntax_parse.SyntaxGroup( '', [ - 'foo oneline bar', + 'foo contained bar', 'zoo goo' ] ) ), - contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) ) + contains_inanyorder( 'foo', 'contained', 'bar', 'zoo', 'goo' ) ) def ExtractKeywordsFromGroup_KeywordAssign_test(): assert_that( syntax_parse._ExtractKeywordsFromGroup( syntax_parse.SyntaxGroup( '', [ - 'foo end=zoo((^^//)) bar', + 'nextgroup=zoo skipwhite foo bar', 'zoo goo', ] ) ), contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) ) @@ -300,10 +300,19 @@ def ExtractKeywordsFromGroup_KeywordAssign_test(): def ExtractKeywordsFromGroup_KeywordAssignAndMiddle_test(): assert_that( syntax_parse._ExtractKeywordsFromGroup( syntax_parse.SyntaxGroup( '', [ - 'foo end=zoo((^^//)) transparent bar', + 'nextgroup=zoo foo skipnl bar', 'zoo goo', ] ) ), - contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) ) + contains_inanyorder( 'foo', 'skipnl', 'bar', 'zoo', 'goo' ) ) + + +def ExtractKeywordsFromGroup_KeywordWithoutNextgroup_test(): + assert_that( syntax_parse._ExtractKeywordsFromGroup( + syntax_parse.SyntaxGroup( '', [ + 'skipempty foo bar', + 'zoo goo', + ] ) ), + contains_inanyorder( 'skipempty', 'foo', 'bar', 'zoo', 'goo' ) ) def ExtractKeywordsFromGroup_ContainedSyntaxArgAllowed_test():