Improve extraction of syntax keywords
Use the following strategy to extract identifiers from syntax highlighting:

- ignore "match" and "region" definitions: they mostly contain arguments, syntax groups, and regular expressions;
- ignore "nextgroup=" when it is the first word, as well as the "skipempty", "skipwhite", and "skipnl" arguments that immediately follow it;
- ignore the "contained" argument when it is the first word;
- add the remaining words to the list of identifiers.

This also fixes a bug where the word "match" was extracted even though it is not a keyword of the syntax language.
parent f14485acb9
commit 75d41d1137
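As a quick, hedged illustration of the strategy (the group and keyword names below are made up, not taken from any real syntax file), a line of ":syntax list" output is handled like this:

# Made-up fragment of ":syntax list" output, for illustration only.
line = 'nextgroup=fooFunction skipwhite class def return'

# 'nextgroup=fooFunction'   -> ignored: leading "nextgroup=" argument
# 'skipwhite'               -> ignored: follows the leading "nextgroup="
# 'class', 'def', 'return'  -> kept as identifiers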
@@ -38,43 +38,28 @@ SYNTAX_GROUP_REGEX = re.compile(
      $""",
   re.VERBOSE )

-KEYWORD_REGEX = re.compile( r'^[\w,]+$' )
+KEYWORD_REGEX = re.compile( r'^(\w+),?$' )

 SYNTAX_ARGUMENT_REGEX = re.compile(
   r"^\w+=.*$" )

-SYNTAX_ARGUMENTS = set([
-  'cchar',
-  'conceal',
-  'contained',
-  'containedin',
-  'nextgroup',
-  'skipempty',
-  'skipnl',
-  'skipwhite',
-  'transparent',
-  'concealends',
-  'contains',
-  'display',
-  'extend',
-  'fold',
-  'oneline',
-  'keepend',
-  'excludenl',
-])
+SYNTAX_REGION_ARGUMENT_REGEX = re.compile(
+  r"^(?:matchgroup|start)=.*$")
+
+# See ":h syn-nextgroup".
+SYNTAX_NEXTGROUP_ARGUMENTS = set([
+  'skipwhite',
+  'skipnl',
+  'skipempty'
+])

-# We want to parse lines starting with these args
-ALLOWED_SYNTAX_ARGUMENTS = set([
-  'contained',
-])
-
-# These are the parent groups from which we want to extract keywords
+# These are the parent groups from which we want to extract keywords.
 ROOT_GROUPS = set([
   'Statement',
   'Boolean',
   'Include',
   'Type',
-  'Identifier',
+  'Identifier'
 ])


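The behaviour change is easiest to see on the keyword regex itself. A minimal sketch, reusing the regexes exactly as defined above (the sample strings are made up):

import re

KEYWORD_REGEX = re.compile( r'^(\w+),?$' )
SYNTAX_REGION_ARGUMENT_REGEX = re.compile( r"^(?:matchgroup|start)=.*$" )

# A keyword is now a single word, optionally followed by one trailing comma;
# the comma is no longer part of the captured keyword.
assert KEYWORD_REGEX.match( 'elif,' ).group( 1 ) == 'elif'
assert KEYWORD_REGEX.match( 'import' ).group( 1 ) == 'import'
# The old pattern r'^[\w,]+$' also accepted comma-joined strings like this one.
assert KEYWORD_REGEX.match( 'foo,bar' ) is None

# Region definitions in ":syntax list" output start with matchgroup= or start=.
assert SYNTAX_REGION_ARGUMENT_REGEX.match( 'start=/"/' ) is not None
assert SYNTAX_REGION_ARGUMENT_REGEX.match( 'nextgroup=fooGroup' ) is None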
@@ -149,7 +134,7 @@ def _CreateInitialGroupMap():
   type_group = SyntaxGroup( 'Type' )
   identifier_group = SyntaxGroup( 'Identifier' )

-  # See `:h group-name` for details on how the initial group hierarchy is built
+  # See ":h group-name" for details on how the initial group hierarchy is built.
   group_name_to_group = {
     'Statement': statement_group,
     'Type': type_group,
@@ -202,23 +187,49 @@ def _GetAllDescendentats( root_group ):
   return descendants


+def _ExtractKeywordsFromLine( line ):
+  if line.startswith( 'links to ' ):
+    return []
+
+  # Ignore "syntax match" lines (see ":h syn-match").
+  if line.startswith( 'match ' ):
+    return []
+
+  words = line.split()
+  if not words:
+    return []
+
+  # Ignore "syntax region" lines (see ":h syn-region"). They always start
+  # with matchgroup= or start= in the syntax list.
+  if SYNTAX_REGION_ARGUMENT_REGEX.match( words[ 0 ] ):
+    return []
+
+  # Ignore "nextgroup=" argument in first position and the arguments
+  # "skipwhite", "skipnl", and "skipempty" that immediately come after.
+  nextgroup_at_start = False
+  if words[ 0 ].startswith( 'nextgroup=' ):
+    nextgroup_at_start = True
+    words = words[ 1: ]
+
+  # Ignore "contained" argument in first position.
+  if words[ 0 ] == 'contained':
+    words = words[ 1: ]
+
+  keywords = []
+  for word in words:
+    if nextgroup_at_start and word in SYNTAX_NEXTGROUP_ARGUMENTS:
+      continue
+
+    nextgroup_at_start = False
+
+    keyword_matched = KEYWORD_REGEX.match( word )
+    if keyword_matched:
+      keywords.append( keyword_matched.group( 1 ) )
+  return keywords
+
+
 def _ExtractKeywordsFromGroup( group ):
   keywords = []
   for line in group.lines:
-    if line.startswith( 'links to ' ):
-      continue
-
-    words = line.split()
-    if not words or ( words[ 0 ] in SYNTAX_ARGUMENTS and
-                      words[ 0 ] not in ALLOWED_SYNTAX_ARGUMENTS ):
-      continue
-
-    for word in words:
-      if ( word not in SYNTAX_ARGUMENTS and
-           not SYNTAX_ARGUMENT_REGEX.match( word ) and
-           KEYWORD_REGEX.match( word ) ):
-
-        if word.endswith( ',' ):
-          word = word[ :-1 ]
-        keywords.append( word )
+    keywords.extend( _ExtractKeywordsFromLine( line ) )
   return keywords
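Before the test changes below, a hedged usage sketch of the new helper; the import statement and the sample lines are assumptions for illustration only (the tests exercise it indirectly through _ExtractKeywordsFromGroup):

import syntax_parse  # assumed import; use whatever path the project exposes

syntax_parse._ExtractKeywordsFromLine( 'match /\\<\\d\\+\\>/' )
# -> []  ("syntax match" lines are skipped entirely)

syntax_parse._ExtractKeywordsFromLine( 'nextgroup=fooGroup skipwhite class def' )
# -> [ 'class', 'def' ]  (leading "nextgroup=" and its "skipwhite" are dropped)

syntax_parse._ExtractKeywordsFromLine( 'contained if elif, else' )
# -> [ 'if', 'elif', 'else' ]  (leading "contained" ignored, trailing comma stripped)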
@@ -43,10 +43,10 @@ def KeywordsFromSyntaxListOutput_PythonSyntax_test():
     'bytearray', 'IndexError', 'all', 'help', 'vars', 'SyntaxError', 'global',
     'elif', 'unicode', 'sorted', 'memoryview', 'isinstance', 'except',
     'nonlocal', 'NameError', 'finally', 'BytesWarning', 'dict', 'IOError',
-    'pass', 'oct', 'match', 'bin', 'SystemExit', 'return', 'StandardError',
-    'format', 'TabError', 'break', 'next', 'not', 'UnicodeDecodeError',
-    'False', 'RuntimeWarning', 'list', 'iter', 'try', 'reload', 'Warning',
-    'round', 'dir', 'cmp', 'set', 'bytes', 'UnicodeTranslateError', 'intern',
+    'pass', 'oct', 'bin', 'SystemExit', 'return', 'StandardError', 'format',
+    'TabError', 'break', 'next', 'not', 'UnicodeDecodeError', 'False',
+    'RuntimeWarning', 'list', 'iter', 'try', 'reload', 'Warning', 'round',
+    'dir', 'cmp', 'set', 'bytes', 'UnicodeTranslateError', 'intern',
     'issubclass', 'yield', 'Ellipsis', 'hash', 'locals', 'BufferError',
     'slice', 'for', 'FloatingPointError', 'sum', 'VMSError', 'getattr', 'abs',
     'print', 'import', 'True', 'FutureWarning', 'ImportWarning', 'None',
@@ -77,8 +77,8 @@ def KeywordsFromSyntaxListOutput_PythonSyntax_test():
 def KeywordsFromSyntaxListOutput_CppSyntax_test():
   expected_keywords = (
     'int_fast32_t', 'FILE', 'size_t', 'bitor', 'typedef', 'const', 'struct',
-    'uint8_t', 'fpos_t', 'thread_local', 'unsigned', 'uint_least16_t', 'match',
-    'do', 'intptr_t', 'uint_least64_t', 'return', 'auto', 'void', '_Complex',
+    'uint8_t', 'fpos_t', 'thread_local', 'unsigned', 'uint_least16_t', 'do',
+    'intptr_t', 'uint_least64_t', 'return', 'auto', 'void', '_Complex',
     'break', '_Alignof', 'not', 'using', '_Static_assert', '_Thread_local',
     'public', 'uint_fast16_t', 'this', 'continue', 'char32_t', 'int16_t',
     'intmax_t', 'static', 'clock_t', 'sizeof', 'int_fast64_t', 'mbstate_t',
@@ -108,7 +108,7 @@ def KeywordsFromSyntaxListOutput_JavaSyntax_test():
   expected_keywords = (
     'code', 'text', 'cols', 'datetime', 'disabled', 'shape', 'codetype', 'alt',
     'compact', 'style', 'valuetype', 'short', 'finally', 'continue', 'extends',
-    'valign', 'match', 'bordercolor', 'do', 'return', 'rel', 'rules', 'void',
+    'valign', 'bordercolor', 'do', 'return', 'rel', 'rules', 'void',
     'nohref', 'abbr', 'background', 'scrolling', 'instanceof', 'name',
     'summary', 'try', 'default', 'noshade', 'coords', 'dir', 'frame', 'usemap',
     'ismap', 'static', 'hspace', 'vlink', 'for', 'selected', 'rev', 'vspace',
@@ -273,25 +273,25 @@ def ExtractKeywordsFromGroup_KeywordStarts_test():
   assert_that( syntax_parse._ExtractKeywordsFromGroup(
     syntax_parse.SyntaxGroup( '', [
       'foo bar',
-      'transparent boo baa',
+      'contained boo baa',
       'zoo goo',
     ] ) ),
-    contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) )
+    contains_inanyorder( 'foo', 'bar', 'boo', 'baa', 'zoo', 'goo' ) )


 def ExtractKeywordsFromGroup_KeywordMiddle_test():
   assert_that( syntax_parse._ExtractKeywordsFromGroup(
     syntax_parse.SyntaxGroup( '', [
-      'foo oneline bar',
+      'foo contained bar',
       'zoo goo'
     ] ) ),
-    contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) )
+    contains_inanyorder( 'foo', 'contained', 'bar', 'zoo', 'goo' ) )


 def ExtractKeywordsFromGroup_KeywordAssign_test():
   assert_that( syntax_parse._ExtractKeywordsFromGroup(
     syntax_parse.SyntaxGroup( '', [
-      'foo end=zoo((^^//)) bar',
+      'nextgroup=zoo skipwhite foo bar',
       'zoo goo',
     ] ) ),
     contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) )
@@ -300,10 +300,19 @@ def ExtractKeywordsFromGroup_KeywordAssign_test():
 def ExtractKeywordsFromGroup_KeywordAssignAndMiddle_test():
   assert_that( syntax_parse._ExtractKeywordsFromGroup(
     syntax_parse.SyntaxGroup( '', [
-      'foo end=zoo((^^//)) transparent bar',
+      'nextgroup=zoo foo skipnl bar',
       'zoo goo',
     ] ) ),
-    contains_inanyorder( 'foo', 'bar', 'zoo', 'goo' ) )
+    contains_inanyorder( 'foo', 'skipnl', 'bar', 'zoo', 'goo' ) )
+
+
+def ExtractKeywordsFromGroup_KeywordWithoutNextgroup_test():
+  assert_that( syntax_parse._ExtractKeywordsFromGroup(
+    syntax_parse.SyntaxGroup( '', [
+      'skipempty foo bar',
+      'zoo goo',
+    ] ) ),
+    contains_inanyorder( 'skipempty', 'foo', 'bar', 'zoo', 'goo' ) )


 def ExtractKeywordsFromGroup_ContainedSyntaxArgAllowed_test():