'[pypy-commit] pypy unicode-utf8: DecodeBuffer returns utf8 codepoints'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       pypy-svn
Subject:    [pypy-commit] pypy unicode-utf8: DecodeBuffer returns utf8 codepoints
From:       mattip <pypy.commits () gmail ! com>
Date:       2018-12-24 15:28:25
Message-ID: 5c20fb19.1c69fb81.49892.2d2e () mx ! google ! com
[Download RAW message or body]

Author: Matti Picus <matti.picus@gmail.com>
Branch: unicode-utf8
Changeset: r95527:74c350367634
Date: 2018-12-24 16:25 +0200
http://bitbucket.org/pypy/pypy/changeset/74c350367634/

Log:	DecodeBuffer returns utf8 codepoints

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -302,35 +302,43 @@
     def __init__(self, text=None):
         self.text = text
         self.pos = 0
+        self.upos = 0
 
     def set(self, space, w_decoded):
         check_decoded(space, w_decoded)
         self.text = space.utf8_w(w_decoded)
         self.pos = 0
+        self.upos = 0
 
     def reset(self):
         self.text = None
         self.pos = 0
+        self.upos = 0
 
     def get_chars(self, size):
         if self.text is None:
             return ""
 
-        available = len(self.text) - self.pos
+        lgt = codepoints_in_utf8(self.text)
+        available = lgt - self.upos
         if size < 0 or size > available:
             size = available
         assert size >= 0
 
         if self.pos > 0 or size < available:
             start = self.pos
-            end = self.pos + size
-            assert start >= 0
-            assert end >= 0
-            chars = self.text[start:end]
+            ret = []
+            pos = start
+            for  i in range(size):
+                pos = next_codepoint_pos(self.text, pos)
+                self.upos += 1
+            chars = self.text[start:pos]
+            self.pos = pos
         else:
             chars = self.text
+            self.pos = len(self.text)
+            self.upos = lgt
 
-        self.pos += size
         return chars
 
     def has_data(self):
@@ -342,16 +350,18 @@
     def next_char(self):
         if self.exhausted():
             raise StopIteration
-        ch = self.text[self.pos]
-        self.pos = next_codepoint_pos(self.text, self.pos)
+        newpos = next_codepoint_pos(self.text, self.pos)
+        ch = self.text[self.pos:newpos]
+        self.pos = newpos
+        self.upos += 1
         return ch
 
     def peek_char(self):
         # like next_char, but doesn't advance pos
         if self.exhausted():
             raise StopIteration
-        ch = self.text[self.pos]
-        return ch
+        newpos = next_codepoint_pos(self.text, self.pos)
+        return self.text[self.pos:newpos]
 
     def find_newline_universal(self, limit):
         # Universal newline search. Find any of \r, \r\n, \n
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -1,6 +1,6 @@
 import pytest
 try:
-    from hypothesis import given, strategies as st, settings
+    from hypothesis import given, strategies as st, settings, example
 except ImportError:
     pytest.skip("hypothesis required")
 import os
@@ -63,6 +63,7 @@
     assert buf.exhausted()
 
 @given(st.text(), st.lists(st.integers(min_value=0)))
+@example(u'\x80', [1])
 def test_readn_buffer(text, sizes):
     buf = DecodeBuffer(text.encode('utf-8'))
     strings = []
@@ -80,5 +81,5 @@
     buf = DecodeBuffer(text.encode('utf-8'))
     for i in range(len(text)):
         ch = buf.next_char()
-        assert ch == text[i].encode('utf-8')[0]
+        assert ch == text[i].encode('utf-8')
     assert buf.exhausted()
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
[prev in list] [next in list] [prev in thread] [next in thread]