[prev in list] [next in list] [prev in thread] [next in thread] 

List:       cfe-commits
Subject:    Re: [cfe-commits] [PATCH 2/2] [clang.py] Implement Token API
From:       Gregory Szorc <gregory.szorc () gmail ! com>
Date:       2012-06-30 2:16:39
Message-ID: CAKQoGan7VkM9Z5o-HG-n+k+T1bWQsEDd=zC1FyrWijjEYrL-XQ () mail ! gmail ! com
[Download RAW message or body]

Updated patch attached.

* Added Cursor.get_tokens()
* Refactored TokenGroup out of TranslationUnit to support above
* Refactored with TranslationUnit.get_* API changes from last patch.

On Fri, Jun 29, 2012 at 12:18 AM, Gregory Szorc <gregory.szorc@gmail.com> wrote:
> This exposes all of libclang's token functions in the Python bindings.
>
> I'm putting the TokenKind enumerations in a new module,
> clang.enumerations. I plan to eventually move all existing
> enumerations there so they are all consolidated. And, maybe one day,
> we can even generate that file automatically by parsing the libclang
> header files. I know Anders has code around somewhere that does
> this...
>
> ---
>  bindings/python/clang/cindex.py                    | 177 +++++++++++++++++++--
>  bindings/python/clang/enumerations.py              |  32 ++++
>  bindings/python/tests/cindex/test_token_kind.py    |  43 +++++
>  bindings/python/tests/cindex/test_tokens.py        |  52 ++++++
>  .../python/tests/cindex/test_translation_unit.py   |  24 ++-
>  5 files changed, 312 insertions(+), 16 deletions(-)
>  create mode 100644 bindings/python/clang/enumerations.py
>  create mode 100644 bindings/python/tests/cindex/test_token_kind.py
>  create mode 100644 bindings/python/tests/cindex/test_tokens.py

["0002-clang.py-Implement-Token-API.patch" (application/octet-stream)]

From 4fe573444ebdcc876a4973d7dc02558b289c69e8 Mon Sep 17 00:00:00 2001
From: Gregory Szorc <gregory.szorc@gmail.com>
Date: Thu, 28 Jun 2012 23:23:39 -0700
Subject: [PATCH 2/2] [clang.py] Implement Token API

---
 bindings/python/clang/cindex.py                    | 191 +++++++++++++++++++--
 bindings/python/clang/enumerations.py              |  32 ++++
 bindings/python/tests/cindex/test_cursor.py        |  10 ++
 bindings/python/tests/cindex/test_token_kind.py    |  43 +++++
 bindings/python/tests/cindex/test_tokens.py        |  52 ++++++
 .../python/tests/cindex/test_translation_unit.py   |  24 ++-
 6 files changed, 336 insertions(+), 16 deletions(-)
 create mode 100644 bindings/python/clang/enumerations.py
 create mode 100644 bindings/python/tests/cindex/test_token_kind.py
 create mode 100644 bindings/python/tests/cindex/test_tokens.py

diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py
index 41835d8..82e46a8 100644
--- a/bindings/python/clang/cindex.py
+++ b/bindings/python/clang/cindex.py
@@ -60,16 +60,18 @@ call is efficient.
 # o cleanup ctypes wrapping, would be nice to separate the ctypes details more
 #   clearly, and hide from the external interface (i.e., help(cindex)).
 #
 # o implement additional SourceLocation, SourceRange, and File methods.
 
 from ctypes import *
 import collections
 
+import clang.enumerations
+
 def get_cindex_library():
     # FIXME: It's probably not the case that the library is actually found in
     # this location. We need a better system of identifying and loading the
     # CIndex library. It could be on path or elsewhere, or versioned, etc.
     import platform
     name = platform.system()
     if name == 'Darwin':
         return cdll.LoadLibrary('libclang.dylib')
@@ -362,16 +364,102 @@ class FixIt(object):
 
     def __init__(self, range, value):
         self.range = range
         self.value = value
 
     def __repr__(self):
         return "<FixIt range %r, value %r>" % (self.range, self.value)
 
+class TokenGroup(object):
+    """Helper class to facilitate token management.
+
+    Tokens are allocated from libclang in chunks. They must be disposed of as a
+    collective group.
+
+    One purpose of this class is for instances to represent groups of allocated
+    tokens. Each token in a group contains a reference back to an instance of
+    this class. When all tokens from a group are garbage collected, it allows
+    this class to be garbage collected. When this class is garbage collected,
+    it calls the libclang destructor which invalidates all tokens in the group.
+
+    You should not instantiate this class outside of this module.
+    """
+    def __init__(self, tu, memory, count):
+        self._tu = tu
+        self._memory = memory
+        self._count = count
+
+    def __del__(self):
+        lib.clang_disposeTokens(self._tu, self._memory, self._count)
+
+    @staticmethod
+    def get_tokens(tu, extent):
+        """Helper method to return all tokens in an extent.
+
+        This functionality is needed in multiple places in this module. We
+        define it here because it seems like a logical place.
+        """
+        tokens_memory = POINTER(Token)()
+        tokens_count = c_uint()
+
+        lib.clang_tokenize(tu, extent, byref(tokens_memory),
+                byref(tokens_count))
+
+        count = int(tokens_count.value)
+        tokens_array = cast(tokens_memory, POINTER(Token * count)).contents
+
+        token_group = TokenGroup(tu, tokens_memory, tokens_count)
+
+        for i in xrange(0, count):
+            token = Token()
+            token.int_data = tokens_array[i].int_data
+            token.ptr_data = tokens_array[i].ptr_data
+            token._tu = tu
+            token._group = token_group
+
+            yield token
+
+class TokenKind(object):
+    """Describes a specific type of a Token."""
+
+    _value_map = {} # int -> TokenKind
+
+    def __init__(self, value, name):
+        """Create a new TokenKind instance from a numeric value and a name."""
+        self.value = value
+        self.name = name
+
+    def __repr__(self):
+        return 'TokenKind.%s' % (self.name,)
+
+    @staticmethod
+    def from_value(value):
+        """Obtain a registered TokenKind instance from its value."""
+        result = TokenKind._value_map.get(value, None)
+
+        if result is None:
+            raise ValueError('Unknown TokenKind: %d' % value)
+
+        return result
+
+    @staticmethod
+    def register(value, name):
+        """Register a new TokenKind enumeration.
+
+        This should only be called at module load time by code within this
+        package.
+        """
+        if value in TokenKind._value_map:
+            raise ValueError('TokenKind already registered: %d' % value)
+
+        kind = TokenKind(value, name)
+        TokenKind._value_map[value] = kind
+        setattr(TokenKind, name, kind)
+
 ### Cursor Kinds ###
 
 class CursorKind(object):
     """
     A CursorKind describes the kind of entity that a cursor points to.
     """
 
     # The unique kind objects, indexed by id.
@@ -1176,16 +1264,24 @@ class Cursor(Structure):
             child._tu = self._tu
             children.append(child)
             return 1 # continue
         children = []
         lib.clang_visitChildren(self, callbacks['cursor_visit'](visitor),
                 children)
         return iter(children)
 
+    def get_tokens(self):
+        """Obtain the Token instances that compose this Cursor.
+
+        This is a generator for Token instances. It returns all tokens which
+        occupy the extent this cursor occupies.
+        """
+        return TokenGroup.get_tokens(self._tu, self.extent)
+
     @staticmethod
     def from_result(res, fn, args):
         assert isinstance(res, Cursor)
         # FIXME: There should just be an isNull method.
         if res == lib.clang_getNullCursor():
             return None
 
         # Store a reference to the TU in the Python object so it won't get GC'd
@@ -2041,16 +2137,29 @@ class TranslationUnit(ClangObject):
                 unsaved_files_array[i].contents = value
                 unsaved_files_array[i].length = len(value)
         ptr = lib.clang_codeCompleteAt(self, path, line, column,
                 unsaved_files_array, len(unsaved_files), options)
         if ptr:
             return CodeCompletionResults(ptr)
         return None
 
+    def get_tokens(self, locations=None, extent=None):
+        """Obtain tokens in this translation unit.
+
+        This is a generator for Token instances. The caller specifies a range
+        of source code to obtain tokens for. The range can be specified as a
+        2-tuple of SourceLocation or as a SourceRange. If both are defined,
+        behavior is undefined.
+        """
+        if locations is not None:
+            extent = SourceRange(start=locations[0], end=locations[1])
+
+        return TokenGroup.get_tokens(self, extent)
+
 class File(ClangObject):
     """
     The File class represents a particular source file that is part of a
     translation unit.
     """
 
     @staticmethod
     def from_name(translation_unit, file_name):
@@ -2096,32 +2205,78 @@ class FileInclusion(object):
         self.location = loc
         self.depth = depth
 
     @property
     def is_input_file(self):
         """True if the included file is the input file."""
         return self.depth == 0
 
+class Token(Structure):
+    """Represents a single token from the preprocessor.
+
+    Tokens are effectively segments of source code. Source code is first parsed
+    into tokens before being converted into the AST and Cursors.
+
+    Tokens are obtained from parsed TranslationUnit instances. You currently
+    can't create tokens manually.
+    """
+    _fields_ = [
+        ('int_data', c_uint * 4),
+        ('ptr_data', c_void_p)
+    ]
+
+    @property
+    def spelling(self):
+        """The spelling of this token.
+
+        This is the textual representation of the token in source.
+        """
+        return lib.clang_getTokenSpelling(self._tu, self)
+
+    @property
+    def kind(self):
+        """Obtain the TokenKind of the current token."""
+        return TokenKind.from_value(lib.clang_getTokenKind(self))
+
+    @property
+    def location(self):
+        """The SourceLocation this Token occurs at."""
+        return lib.clang_getTokenLocation(self._tu, self)
+
+    @property
+    def extent(self):
+        """The SourceRange this Token occupies."""
+        return lib.clang_getTokenExtent(self._tu, self)
+
+    @property
+    def cursor(self):
+        """The Cursor this Token corresponds to."""
+        cursor = Cursor()
+
+        lib.clang_annotateTokens(self._tu, byref(self), 1, byref(cursor))
+
+        return cursor
+
 # Now comes the plumbing to hook up the C library.
 
 # Register callback types in common container.
 callbacks['translation_unit_includes'] = CFUNCTYPE(None, c_object_p,
         POINTER(SourceLocation), c_uint, py_object)
 callbacks['cursor_visit'] = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
 
 def register_functions(lib):
     """Register function prototypes with a libclang library instance.
 
     This must be called as part of library instantiation so Python knows how
     to call out to the shared library.
     """
     # Functions are registered in strictly alphabetical order.
-    #lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token),
-    #                                    c_uint, POINTER(Cursor)]
+    lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token),
+                                        c_uint, POINTER(Cursor)]
 
     lib.clang_codeCompleteAt.argtypes = [TranslationUnit, c_char_p, c_int,
             c_int, c_void_p, c_int, c_int]
     lib.clang_codeCompleteAt.restype = POINTER(CCRStructure)
 
     lib.clang_codeCompleteGetDiagnostic.argtypes = [CodeCompletionResults,
             c_int]
     lib.clang_codeCompleteGetDiagnostic.restype = Diagnostic
@@ -2149,17 +2304,17 @@ def register_functions(lib):
     #lib.clang_disposeCXTUResourceUsage.argtypes = [CXTUResourceUsage]
 
     lib.clang_disposeDiagnostic.argtypes = [Diagnostic]
 
     lib.clang_disposeIndex.argtypes = [Index]
 
     lib.clang_disposeString.argtypes = [_CXString]
 
-    #lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint]
+    lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint]
 
     lib.clang_disposeTranslationUnit.argtypes = [TranslationUnit]
 
     lib.clang_equalCursors.argtypes = [Cursor, Cursor]
     lib.clang_equalCursors.restype = bool
 
     lib.clang_equalLocations.argtypes = [SourceLocation, SourceLocation]
     lib.clang_equalLocations.restype = bool
@@ -2384,29 +2539,28 @@ def register_functions(lib):
 
     lib.clang_getSpecializedCursorTemplate.argtypes = [Cursor]
     lib.clang_getSpecializedCursorTemplate.restype = Cursor
     lib.clang_getSpecializedCursorTemplate.errcheck = Cursor.from_cursor_result
 
     lib.clang_getTemplateCursorKind.argtypes = [Cursor]
     lib.clang_getTemplateCursorKind.restype = c_uint
 
-    #lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
-    #lib.clang_getTokenExtent.restype = SourceRange
+    lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
+    lib.clang_getTokenExtent.restype = SourceRange
 
-    #lib.clang_getTokenKind.argtypes = [Token]
-    #lib.clang_getTokenKind.restype = c_uint
-    #lib.clang_getTokenKind.errcheck = TokenKind.from_result
+    lib.clang_getTokenKind.argtypes = [Token]
+    lib.clang_getTokenKind.restype = c_uint
 
-    #lib.clang_getTokenLocation.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenLocation.restype = SourceLocation
+    lib.clang_getTokenLocation.argtype = [TranslationUnit, Token]
+    lib.clang_getTokenLocation.restype = SourceLocation
 
-    #lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenSpelling.restype = _CXString
-    #lib.clang_getTokenSpelling.errcheck = _CXString.from_result
+    lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token]
+    lib.clang_getTokenSpelling.restype = _CXString
+    lib.clang_getTokenSpelling.errcheck = _CXString.from_result
 
     lib.clang_getTranslationUnitCursor.argtypes = [TranslationUnit]
     lib.clang_getTranslationUnitCursor.restype = Cursor
     lib.clang_getTranslationUnitCursor.errcheck = Cursor.from_result
 
     lib.clang_getTranslationUnitSpelling.argtypes = [TranslationUnit]
     lib.clang_getTranslationUnitSpelling.restype = _CXString
     lib.clang_getTranslationUnitSpelling.errcheck = _CXString.from_result
@@ -2487,33 +2641,40 @@ def register_functions(lib):
     lib.clang_reparseTranslationUnit.argtypes = [TranslationUnit, c_int,
             c_void_p, c_int]
     lib.clang_reparseTranslationUnit.restype = c_int
 
     lib.clang_saveTranslationUnit.argtypes = [TranslationUnit, c_char_p,
             c_uint]
     lib.clang_saveTranslationUnit.restype = c_int
 
-    #lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
-    #        POINTER(POINTER(Token)), POINTER(c_uint)]
+    lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
+            POINTER(POINTER(Token)), POINTER(c_uint)]
 
     lib.clang_visitChildren.argtypes = [Cursor, callbacks['cursor_visit'],
             py_object]
     lib.clang_visitChildren.restype = c_uint
 
 register_functions(lib)
 
+def register_enumerations():
+    for name, value in clang.enumerations.TokenKinds:
+        TokenKind.register(value, name)
+
+register_enumerations()
 
 __all__ = [
     'CodeCompletionResults',
     'CursorKind',
     'Cursor',
     'Diagnostic',
     'File',
     'FixIt',
     'Index',
     'SourceLocation',
     'SourceRange',
+    'TokenKind',
+    'Token',
     'TranslationUnitLoadError',
     'TranslationUnit',
     'TypeKind',
     'Type',
 ]
diff --git a/bindings/python/clang/enumerations.py \
b/bindings/python/clang/enumerations.py new file mode 100644
index 0000000..12e82ed
--- /dev/null
+++ b/bindings/python/clang/enumerations.py
@@ -0,0 +1,32 @@
+#===- enumerations.py - Python Enumerations ------------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+"""
+Clang Enumerations
+==================
+
+This module provides static definitions of enumerations that exist in libclang.
+
+Enumerations are typically defined as a list of tuples. The exported values are
+typically munged into other types or classes at module load time.
+
+All enumerations are centrally defined in this file so they are all grouped
+together and easier to audit. And, maybe even one day this file will be
+automatically generated by scanning the libclang headers!
+"""
+
+TokenKinds = [
+    ('PUNCTUATION', 0),
+    ('KEYWORD', 1),
+    ('IDENTIFIER', 2),
+    ('LITERAL', 3),
+    ('COMMENT', 4),
+]
+
+__all__ = ['TokenKinds']
diff --git a/bindings/python/tests/cindex/test_cursor.py \
b/bindings/python/tests/cindex/test_cursor.py index 979838b..51695e2 100644
--- a/bindings/python/tests/cindex/test_cursor.py
+++ b/bindings/python/tests/cindex/test_cursor.py
@@ -226,8 +226,18 @@ def test_annotation_attribute():
 
 def test_result_type():
     tu = get_tu('int foo();')
     foo = get_cursor(tu, 'foo')
 
     assert foo is not None
     t = foo.result_type
     assert t.kind == TypeKind.INT
+
+def test_get_tokens():
+    """Ensure we can map cursors back to tokens."""
+    tu = get_tu('int foo(int i);')
+    foo = get_cursor(tu, 'foo')
+
+    tokens = list(foo.get_tokens())
+    assert len(tokens) == 7
+    assert tokens[0].spelling == 'int'
+    assert tokens[1].spelling == 'foo'
diff --git a/bindings/python/tests/cindex/test_token_kind.py \
b/bindings/python/tests/cindex/test_token_kind.py new file mode 100644
index 0000000..62ec63e
--- /dev/null
+++ b/bindings/python/tests/cindex/test_token_kind.py
@@ -0,0 +1,43 @@
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+from nose.tools import raises
+
+def test_constructor():
+    """Ensure TokenKind constructor works as expected."""
+
+    t = TokenKind(5, 'foo')
+
+    eq_(t.value, 5)
+    eq_(t.name, 'foo')
+
+@raises(ValueError)
+def test_bad_register():
+    """Ensure a duplicate value is rejected for registration."""
+
+    TokenKind.register(2, 'foo')
+
+@raises(ValueError)
+def test_unknown_value():
+    """Ensure trying to fetch an unknown value raises."""
+
+    TokenKind.from_value(-1)
+
+def test_registration():
+    """Ensure that items registered appear as class attributes."""
+    ok_(hasattr(TokenKind, 'LITERAL'))
+    literal = TokenKind.LITERAL
+
+    ok_(isinstance(literal, TokenKind))
+
+def test_from_value():
+    """Ensure registered values can be obtained from from_value()."""
+    t = TokenKind.from_value(3)
+    ok_(isinstance(t, TokenKind))
+    eq_(t, TokenKind.LITERAL)
+
+def test_repr():
+    """Ensure repr() works."""
+
+    r = repr(TokenKind.LITERAL)
+    eq_(r, 'TokenKind.LITERAL')
diff --git a/bindings/python/tests/cindex/test_tokens.py \
b/bindings/python/tests/cindex/test_tokens.py new file mode 100644
index 0000000..3c63950
--- /dev/null
+++ b/bindings/python/tests/cindex/test_tokens.py
@@ -0,0 +1,52 @@
+from clang.cindex import CursorKind
+from clang.cindex import Index
+from clang.cindex import SourceLocation
+from clang.cindex import SourceRange
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+
+from .util import get_tu
+
+def test_token_to_cursor():
+    """Ensure we can obtain a Cursor from a Token instance."""
+    tu = get_tu('int i = 5;')
+    r = tu.get_extent('t.c', offsets=(0, 9))
+    tokens = list(tu.get_tokens(extent=r))
+
+    assert len(tokens) == 5
+    assert tokens[1].spelling == 'i'
+    assert tokens[1].kind == TokenKind.IDENTIFIER
+
+    cursor = tokens[1].cursor
+    assert cursor.kind == CursorKind.VAR_DECL
+    assert tokens[1].cursor == tokens[2].cursor
+
+def test_token_location():
+    """Ensure Token.location works."""
+
+    tu = get_tu('int foo = 10;')
+    r = tu.get_extent('t.c', offsets=(0, 11))
+
+    tokens = list(tu.get_tokens(extent=r))
+    eq_(len(tokens), 4)
+
+    loc = tokens[1].location
+    ok_(isinstance(loc, SourceLocation))
+    eq_(loc.line, 1)
+    eq_(loc.column, 5)
+    eq_(loc.offset, 4)
+
+def test_token_extent():
+    """Ensure Token.extent works."""
+    tu = get_tu('int foo = 10;')
+    r = tu.get_extent('t.c', offsets=(0, 11))
+
+    tokens = list(tu.get_tokens(extent=r))
+    eq_(len(tokens), 4)
+
+    extent = tokens[1].extent
+    ok_(isinstance(extent, SourceRange))
+
+    eq_(extent.start.offset, 4)
+    eq_(extent.end.offset, 7)
diff --git a/bindings/python/tests/cindex/test_translation_unit.py \
b/bindings/python/tests/cindex/test_translation_unit.py index f45f9ca..ab1784b 100644
--- a/bindings/python/tests/cindex/test_translation_unit.py
+++ b/bindings/python/tests/cindex/test_translation_unit.py
@@ -1,19 +1,21 @@
+import gc
+import os
+
 from clang.cindex import CursorKind
 from clang.cindex import Cursor
 from clang.cindex import File
 from clang.cindex import Index
 from clang.cindex import SourceLocation
 from clang.cindex import SourceRange
 from clang.cindex import TranslationUnitSaveError
 from clang.cindex import TranslationUnit
 from .util import get_cursor
 from .util import get_tu
-import os
 
 kInputsDir = os.path.join(os.path.dirname(__file__), 'INPUTS')
 
 def test_spelling():
     path = os.path.join(kInputsDir, 'hello.cpp')
     tu = TranslationUnit.from_source(path)
     assert tu.spelling == path
 
@@ -212,8 +214,28 @@ def test_get_source_range():
     end = tu.get_location('t.c', offset=5)
 
     r = tu.get_extent('t.c', locations=(start, end))
     assert isinstance(r, SourceRange)
     assert r.start.offset == 0
     assert r.end.offset == 5
     assert r.start.file.name == 't.c'
     assert r.end.file.name == 't.c'
+
+def test_get_tokens_gc():
+    """Ensures get_tokens() works properly with garbage collection."""
+
+    tu = get_tu('int foo();')
+    r = tu.get_extent('t.c', offsets=(0, 10))
+    tokens = list(tu.get_tokens(extent=r))
+
+    assert tokens[0].spelling == 'int'
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    del tokens[1]
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    # May trigger segfault if we don't do our job properly.
+    del tokens
+    gc.collect()
+    gc.collect() # Just in case.
-- 
1.7.11.1



_______________________________________________
cfe-commits mailing list
cfe-commits@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic