YouCompleteMe/cpp/ycm/IdentifierUtils.cpp

// Copyright (C) 2011, 2012  Strahinja Val Markovic  <val@markovic.io>
//
// This file is part of YouCompleteMe.
//
// YouCompleteMe is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// YouCompleteMe is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with YouCompleteMe.  If not, see <http://www.gnu.org/licenses/>.

#include "IdentifierUtils.h"
#include "Utils.h"
#include "standard.h"

#include <boost/unordered_map.hpp>
#include <boost/assign/list_of.hpp>
#include <boost/regex.hpp>
#include <boost/algorithm/string/regex.hpp>

namespace YouCompleteMe {

namespace fs = boost::filesystem;

namespace {

// Pattern describing the text that RemoveIdentifierFreeText() erases before
// identifiers are extracted: line comments, C-style block comments and quoted
// literals. It is a single alternation assembled from adjacent string literals
// so that every alternative can carry its own comment.
// NOTE(review): the line-comment alternatives depend on '$' matching at
// embedded line ends, and the block-comment alternative on '.' matching
// newlines; both appear to be boost::regex defaults -- confirm.
const char *COMMENT_AND_STRING_REGEX =
  "//.*?$" // Anything following '//', up to the end of the line
  "|"
  "#.*?$"  // Anything following '#', up to the end of the line
  "|"
  "/\\*.*?\\*/"  // C-style comments, '/* ... */' (shortest possible match)
  "|"
  // Anything inside single quotes, '...'. The escaped backslash (\\) and the
  // escaped quote (\') are tried before the catch-all '.', so an escaped quote
  // is not mistaken for the closing quote.
  "'(?:\\\\\\\\|\\\\'|.)*?'"
  "|"
  // Anything inside double quotes, "...". Same approach as above: the escaped
  // backslash (\\) and the escaped double quote (\") are tried first.
  "\"(?:\\\\\\\\|\\\\\"|.)*?\"";

// Pattern for a single identifier: a letter or underscore followed by any
// number of word characters. Used by ExtractIdentifiersFromText().
const char *IDENTIFIER_REGEX = "[_a-zA-Z]\\w*";

// Pattern for one line of a tags file. For details on the supported tag
// format, see:
// http://ctags.sourceforge.net/FORMAT
// TL;DR: The only supported format is the one Exuberant Ctags emits.
// Capture groups: 1 = identifier, 2 = file path, 3 = language. A line without
// a "language:" field does not match at all.
const char *TAG_REGEX =
  "^([^\\t\\n\\r]+)"  // The first field is the identifier
  "\\t"  // A TAB char is the field separator
  // The second field is the path to the file that has the identifier; either
  // absolute or relative to the tags file.
  "([^\\t\\n\\r]+)"
  "\\t.*?"  // Non-greedy everything up to the language field
  "language:([^\\t\\n\\r]+)"  // We want to capture the language of the file
  ".*?$";  // The rest of the line is ignored


// Maps a language name, as captured from the "language:" field of a tag line,
// to the filetype string used as the outer key of the map returned by
// ExtractIdentifiersFromTagsFile(). Tags whose language has no entry here are
// skipped by that function.
//
// List of languages Exuberant Ctags supports:
//   ctags --list-languages
// To map a language name to a filetype, see this file:
//   :e $VIMRUNTIME/filetype.vim
const boost::unordered_map< std::string, std::string > LANG_TO_FILETYPE =
  boost::assign::map_list_of
  ( std::string( "Ant"        ), std::string( "ant"        ) )
  ( std::string( "Asm"        ), std::string( "asm"        ) )
  ( std::string( "Awk"        ), std::string( "awk"        ) )
  ( std::string( "Basic"      ), std::string( "basic"      ) )
  ( std::string( "C++"        ), std::string( "cpp"        ) )
  ( std::string( "C#"         ), std::string( "cs"         ) )
  ( std::string( "C"          ), std::string( "c"          ) )
  ( std::string( "COBOL"      ), std::string( "cobol"      ) )
  ( std::string( "DosBatch"   ), std::string( "dosbatch"   ) )
  ( std::string( "Eiffel"     ), std::string( "eiffel"     ) )
  ( std::string( "Erlang"     ), std::string( "erlang"     ) )
  ( std::string( "Fortran"    ), std::string( "fortran"    ) )
  ( std::string( "HTML"       ), std::string( "html"       ) )
  ( std::string( "Java"       ), std::string( "java"       ) )
  ( std::string( "JavaScript" ), std::string( "javascript" ) )
  ( std::string( "Lisp"       ), std::string( "lisp"       ) )
  ( std::string( "Lua"        ), std::string( "lua"        ) )
  ( std::string( "Make"       ), std::string( "make"       ) )
  ( std::string( "MatLab"     ), std::string( "matlab"     ) )
  ( std::string( "OCaml"      ), std::string( "ocaml"      ) )
  ( std::string( "Pascal"     ), std::string( "pascal"     ) )
  ( std::string( "Perl"       ), std::string( "perl"       ) )
  ( std::string( "PHP"        ), std::string( "php"        ) )
  ( std::string( "Python"     ), std::string( "python"     ) )
  ( std::string( "REXX"       ), std::string( "rexx"       ) )
  ( std::string( "Ruby"       ), std::string( "ruby"       ) )
  ( std::string( "Scheme"     ), std::string( "scheme"     ) )
  ( std::string( "Sh"         ), std::string( "sh"         ) )
  ( std::string( "SLang"      ), std::string( "slang"      ) )
  ( std::string( "SML"        ), std::string( "sml"        ) )
  ( std::string( "SQL"        ), std::string( "sql"        ) )
  ( std::string( "Tcl"        ), std::string( "tcl"        ) )
  ( std::string( "Tex"        ), std::string( "tex"        ) )
  ( std::string( "Vera"       ), std::string( "vera"       ) )
  ( std::string( "Verilog"    ), std::string( "verilog"    ) )
  ( std::string( "VHDL"       ), std::string( "vhdl"       ) )
  ( std::string( "Vim"        ), std::string( "vim"        ) )
  ( std::string( "YACC"       ), std::string( "yacc"       ) );

// Sentinel passed as the default value when looking a language up in
// LANG_TO_FILETYPE; getting it back means the language has no known filetype.
const std::string NOT_FOUND = "YCMFOOBAR_NOT_FOUND";

}  // unnamed namespace


std::string RemoveIdentifierFreeText( std::string text ) {
  boost::erase_all_regex( text, boost::regex( COMMENT_AND_STRING_REGEX ) );
  return text;
}


std::vector< std::string > ExtractIdentifiersFromText(
  const std::string &text ) {
  std::string::const_iterator start = text.begin();
  std::string::const_iterator end   = text.end();

  boost::smatch matches;
  const boost::regex expression( IDENTIFIER_REGEX );

  std::vector< std::string > identifiers;

  while ( boost::regex_search( start, end, matches, expression ) ) {
    identifiers.push_back( matches[ 0 ] );
    start = matches[ 0 ].second;
  }

  return identifiers;
}


// Reads the tags file at |path_to_tag_file| and groups the identifiers it
// lists, first by filetype and then by the path of the file that contains
// them. Relative paths are resolved against the directory of the tags file.
// Lines that do not match TAG_REGEX, and tags whose language has no entry in
// LANG_TO_FILETYPE, are skipped. If the file cannot be read, an empty map is
// returned.
FiletypeIdentifierMap ExtractIdentifiersFromTagsFile(
  const fs::path &path_to_tag_file ) {
  FiletypeIdentifierMap identifiers_by_filetype;
  std::string contents;

  try {
    contents = ReadUtf8File( path_to_tag_file );
  } catch ( ... ) {
    // Best effort: an unreadable tags file simply contributes no identifiers.
    return identifiers_by_filetype;
  }

  const boost::regex tag_line( TAG_REGEX );
  // '.' must not cross line ends, so that one match never spans two tag lines.
  const boost::match_flag_type flags = boost::match_not_dot_newline;
  const std::string::const_iterator contents_end = contents.end();
  boost::smatch tag;

  // The cursor is advanced past each matched line, including the ones that
  // are skipped because their language is unknown.
  for ( std::string::const_iterator cursor = contents.begin();
        boost::regex_search( cursor, contents_end, tag, tag_line, flags );
        cursor = tag[ 0 ].second ) {
    const std::string language( tag[ 3 ].str() );
    const std::string filetype = FindWithDefault( LANG_TO_FILETYPE,
                                                  language,
                                                  NOT_FOUND );

    if ( filetype == NOT_FOUND )
      continue;

    const std::string identifier( tag[ 1 ].str() );
    const fs::path tagged_file( tag[ 2 ].str() );
    const fs::path resolved_file = tagged_file.is_relative() ?
                                   path_to_tag_file.parent_path() / tagged_file :
                                   tagged_file;

    identifiers_by_filetype[ filetype ][ resolved_file.string() ]
      .push_back( identifier );
  }

  return identifiers_by_filetype;
}

} // namespace YouCompleteMe
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`// Copyright (C) 2011, 2012 Strahinja Val Markovic <val@markovic.io>`
			`//`
			`// This file is part of YouCompleteMe.`
			`//`
			`// YouCompleteMe is free software: you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation, either version 3 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// YouCompleteMe is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with YouCompleteMe. If not, see <http://www.gnu.org/licenses/>.`

			`#include "IdentifierUtils.h"`
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`#include "Utils.h"`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`#include "standard.h"`

The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`#include <boost/unordered_map.hpp>`
			`#include <boost/assign/list_of.hpp>`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`#include <boost/regex.hpp>`
			`#include <boost/algorithm/string/regex.hpp>`

Formatting more code with astyle 2013-01-19 23:10:52 -05:00			`namespace YouCompleteMe {`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`namespace fs = boost::filesystem;`

			`namespace {`

Formatting more code with astyle 2013-01-19 23:10:52 -05:00			`const char *COMMENT_AND_STRING_REGEX =`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`"//.*?$" // Anything following '//'`
			`"|"`
			`"#.*?$" // Anything following '#'`
			`"|"`
			`"/\\*.*?\\*/" // C-style comments, '/* ... */'`
			`"|"`
Updating comments for the string-stripping regex 2013-03-23 13:56:56 -04:00			`// Anything inside single quotes, '...', but mind the escaped quote and the`
			`// escaped slash (\\)`
Fix handling of escaped \ in char or string. 2013-03-14 00:41:20 -04:00			`"'(?:\\\\\\\\|\\\\'|.)*?'"`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`"|"`
Updating comments for the string-stripping regex 2013-03-23 13:56:56 -04:00			`// Anything inside double quotes, "...", but mind the escaped double quote and`
			`// the escaped slash (\\)`
Fix handling of escaped \ in char or string. 2013-03-14 00:41:20 -04:00			`"\"(?:\\\\\\\\|\\\\\"|.)*?\"";`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00
Formatting more code with astyle 2013-01-19 23:10:52 -05:00			`const char *IDENTIFIER_REGEX = "[_a-zA-Z]\\w*";`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`// For details on the tag format supported, see here for details:`
			`// http://ctags.sourceforge.net/FORMAT`
			`// TL;DR: The only supported format is the one Exuberant Ctags emits.`
			`const char *TAG_REGEX =`
			`"^([^\\t\\n\\r]+)" // The first field is the identifier`
			`"\\t" // A TAB char is the field separator`
			`// The second field is the path to the file that has the identifier; either`
			`// absolute or relative to the tags file.`
			`"([^\\t\\n\\r]+)"`
			`"\\t.*?" // Non-greedy everything`
			`"language:([^\\t\\n\\r]+)" // We want to capture the language of the file`
			`".*?$";`


			`// List of languages Exuberant Ctags supports:`
			`// ctags --list-languages`
			`// To map a language name to a filetype, see this file:`
			`// :e $VIMRUNTIME/filetype.vim`
			`const boost::unordered_map< std::string, std::string > LANG_TO_FILETYPE =`
			`boost::assign::map_list_of`
code style fixes 2013-05-30 01:23:19 -04:00			`( std::string( "Ant" ), std::string( "ant" ) )`
			`( std::string( "Asm" ), std::string( "asm" ) )`
			`( std::string( "Awk" ), std::string( "awk" ) )`
			`( std::string( "Basic" ), std::string( "basic" ) )`
			`( std::string( "C++" ), std::string( "cpp" ) )`
			`( std::string( "C#" ), std::string( "cs" ) )`
			`( std::string( "C" ), std::string( "c" ) )`
			`( std::string( "COBOL" ), std::string( "cobol" ) )`
			`( std::string( "DosBatch" ), std::string( "dosbatch" ) )`
			`( std::string( "Eiffel" ), std::string( "eiffel" ) )`
			`( std::string( "Erlang" ), std::string( "erlang" ) )`
			`( std::string( "Fortran" ), std::string( "fortran" ) )`
			`( std::string( "HTML" ), std::string( "html" ) )`
			`( std::string( "Java" ), std::string( "java" ) )`
			`( std::string( "JavaScript" ), std::string( "javascript" ) )`
			`( std::string( "Lisp" ), std::string( "lisp" ) )`
			`( std::string( "Lua" ), std::string( "lua" ) )`
			`( std::string( "Make" ), std::string( "make" ) )`
			`( std::string( "MatLab" ), std::string( "matlab" ) )`
			`( std::string( "OCaml" ), std::string( "ocaml" ) )`
			`( std::string( "Pascal" ), std::string( "pascal" ) )`
			`( std::string( "Perl" ), std::string( "perl" ) )`
			`( std::string( "PHP" ), std::string( "php" ) )`
			`( std::string( "Python" ), std::string( "python" ) )`
			`( std::string( "REXX" ), std::string( "rexx" ) )`
			`( std::string( "Ruby" ), std::string( "ruby" ) )`
			`( std::string( "Scheme" ), std::string( "scheme" ) )`
			`( std::string( "Sh" ), std::string( "sh" ) )`
			`( std::string( "SLang" ), std::string( "slang" ) )`
			`( std::string( "SML" ), std::string( "sml" ) )`
			`( std::string( "SQL" ), std::string( "sql" ) )`
			`( std::string( "Tcl" ), std::string( "tcl" ) )`
			`( std::string( "Tex" ), std::string( "tex" ) )`
			`( std::string( "Vera" ), std::string( "vera" ) )`
			`( std::string( "Verilog" ), std::string( "verilog" ) )`
			`( std::string( "VHDL" ), std::string( "vhdl" ) )`
			`( std::string( "Vim" ), std::string( "vim" ) )`
			`( std::string( "YACC" ), std::string( "yacc" ) );`
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00
			`const std::string NOT_FOUND = "YCMFOOBAR_NOT_FOUND";`

			`} // unnamed namespace`

Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00
Formatting more code with astyle 2013-01-19 23:10:52 -05:00			`std::string RemoveIdentifierFreeText( std::string text ) {`
Buffer identifiers are now extracted async 2012-07-24 23:09:09 -04:00			`boost::erase_all_regex( text, boost::regex( COMMENT_AND_STRING_REGEX ) );`
			`return text;`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`}`


			`std::vector< std::string > ExtractIdentifiersFromText(`
Formatting more code with astyle 2013-01-19 23:10:52 -05:00			`const std::string &text ) {`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`std::string::const_iterator start = text.begin();`
			`std::string::const_iterator end = text.end();`

The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`boost::smatch matches;`
			`const boost::regex expression( IDENTIFIER_REGEX );`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00
			`std::vector< std::string > identifiers;`
Formatting more code with astyle 2013-01-19 23:10:52 -05:00
			`while ( boost::regex_search( start, end, matches, expression ) ) {`
Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`identifiers.push_back( matches[ 0 ] );`
			`start = matches[ 0 ].second;`
			`}`

			`return identifiers;`
			`}`

The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00
			`FiletypeIdentifierMap ExtractIdentifiersFromTagsFile(`
code style fixes 2013-05-30 01:23:19 -04:00			`const fs::path &path_to_tag_file ) {`
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`FiletypeIdentifierMap filetype_identifier_map;`
			`std::string tags_file_contents;`

			`try {`
			`tags_file_contents = ReadUtf8File( path_to_tag_file );`
code style fixes 2013-05-30 01:23:19 -04:00			`} catch ( ... ) {`
The identifier completer now reads tags files See the docs for details. Fixes #135. 2013-05-26 16:28:00 -04:00			`return filetype_identifier_map;`
			`}`

			`std::string::const_iterator start = tags_file_contents.begin();`
			`std::string::const_iterator end = tags_file_contents.end();`

			`boost::smatch matches;`
			`const boost::regex expression( TAG_REGEX );`
			`const boost::match_flag_type options = boost::match_not_dot_newline;`

			`while ( boost::regex_search( start, end, matches, expression, options ) ) {`
			`start = matches[ 0 ].second;`

			`std::string language( matches[ 3 ] );`
			`std::string filetype = FindWithDefault( LANG_TO_FILETYPE,`
			`language,`
			`NOT_FOUND );`

			`if ( filetype == NOT_FOUND )`
			`continue;`

			`std::string identifier( matches[ 1 ] );`
			`fs::path path( matches[ 2 ] );`

			`if ( path.is_relative() )`
			`path = path_to_tag_file.parent_path() / path;`

			`filetype_identifier_map[ filetype ][ path.string() ].push_back( identifier );`
			`}`

			`return filetype_identifier_map;`
			`}`

Now extracting identifiers in C++ code 2012-07-23 23:17:59 -04:00			`} // namespace YouCompleteMe`