[prev in list] [next in list] [prev in thread] [next in thread]
List: pypy-svn
Subject: [pypy-commit] pypy unicode-utf8: DecodeBuffer returns utf8 codepoints
From: mattip <pypy.commits () gmail ! com>
Date: 2018-12-24 15:28:25
Message-ID: 5c20fb19.1c69fb81.49892.2d2e () mx ! google ! com
[Download RAW message or body]
Author: Matti Picus <matti.picus@gmail.com>
Branch: unicode-utf8
Changeset: r95527:74c350367634
Date: 2018-12-24 16:25 +0200
http://bitbucket.org/pypy/pypy/changeset/74c350367634/
Log: DecodeBuffer returns whole utf8-encoded codepoints rather than single bytes
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -302,35 +302,43 @@
def __init__(self, text=None):
self.text = text
self.pos = 0
+ self.upos = 0
def set(self, space, w_decoded):
check_decoded(space, w_decoded)
self.text = space.utf8_w(w_decoded)
self.pos = 0
+ self.upos = 0
def reset(self):
self.text = None
self.pos = 0
+ self.upos = 0
def get_chars(self, size):
if self.text is None:
return ""
- available = len(self.text) - self.pos
+ lgt = codepoints_in_utf8(self.text)
+ available = lgt - self.upos
if size < 0 or size > available:
size = available
assert size >= 0
if self.pos > 0 or size < available:
start = self.pos
- end = self.pos + size
- assert start >= 0
- assert end >= 0
- chars = self.text[start:end]
+ ret = []
+ pos = start
+ for i in range(size):
+ pos = next_codepoint_pos(self.text, pos)
+ self.upos += 1
+ chars = self.text[start:pos]
+ self.pos = pos
else:
chars = self.text
+ self.pos = len(self.text)
+ self.upos = lgt
- self.pos += size
return chars
def has_data(self):
@@ -342,16 +350,18 @@
def next_char(self):
if self.exhausted():
raise StopIteration
- ch = self.text[self.pos]
- self.pos = next_codepoint_pos(self.text, self.pos)
+ newpos = next_codepoint_pos(self.text, self.pos)
+ ch = self.text[self.pos:newpos]
+ self.pos = newpos
+ self.upos += 1
return ch
def peek_char(self):
# like next_char, but doesn't advance pos
if self.exhausted():
raise StopIteration
- ch = self.text[self.pos]
- return ch
+ newpos = next_codepoint_pos(self.text, self.pos)
+ return self.text[self.pos:newpos]
def find_newline_universal(self, limit):
# Universal newline search. Find any of \r, \r\n, \n
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -1,6 +1,6 @@
import pytest
try:
- from hypothesis import given, strategies as st, settings
+ from hypothesis import given, strategies as st, settings, example
except ImportError:
pytest.skip("hypothesis required")
import os
@@ -63,6 +63,7 @@
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
+@example(u'\x80', [1])
def test_readn_buffer(text, sizes):
buf = DecodeBuffer(text.encode('utf-8'))
strings = []
@@ -80,5 +81,5 @@
buf = DecodeBuffer(text.encode('utf-8'))
for i in range(len(text)):
ch = buf.next_char()
- assert ch == text[i].encode('utf-8')[0]
+ assert ch == text[i].encode('utf-8')
assert buf.exhausted()
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic