[SPARK-23299][SQL][PYSPARK] Fix __repr__ behaviour for Rows
This PR is meant to replace #20503, which lay dormant for a while. The solution in the original PR is still valid, so this is just that patch rebased onto the current master. Original summary follows. ## What changes were proposed in this pull request? Fix `__repr__` behaviour for Rows. Row's `__repr__` assumes data is a string when the column name is missing. Examples: ``` >>> from pyspark.sql.types import Row >>> Row ("Alice", "11") <Row(Alice, 11)> >>> Row (name="Alice", age=11) Row(age=11, name='Alice') >>> Row ("Alice", 11) <snip stack trace> TypeError: sequence item 1: expected string, int found ``` This is because `Row()`, when called without column names, assumes every value is a string. ## How was this patch tested? Manually tested and a unit test was added to `python/pyspark/sql/tests/test_types.py`. Closes #24448 from tbcs/SPARK-23299. Lead-authored-by: Tibor Csögör <tibi@tiborius.net> Co-authored-by: Shashwat Anand <me@shashwat.me> Signed-off-by: Holden Karau <holden@pigscanfly.ca>
This commit is contained in:
parent
6ef45301a4
commit
eec1a3c286
|
@ -1,3 +1,4 @@
|
|||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -739,6 +740,17 @@ class DataTypeTests(unittest.TestCase):
|
|||
tst = TimestampType()
|
||||
self.assertEqual(tst.toInternal(datetime.datetime.max) % 1000000, 999999)
|
||||
|
||||
# regression test for SPARK-23299
|
||||
def test_row_without_column_name(self):
# A Row built only from positional values (no field names) must have a
# repr that shows each value via its own repr, including non-string
# values such as the int 11 used here.
|
||||
row = Row("Alice", 11)
|
||||
self.assertEqual(repr(row), "<Row('Alice', 11)>")
|
||||
|
||||
# test __repr__ with unicode values: the expected text differs by Python
# major version (plain quoted characters on 3.x, u-prefixed strings with
# \u escapes on 2.x), so each branch asserts its own form.
|
||||
if sys.version_info.major >= 3:
|
||||
self.assertEqual(repr(Row("数", "量")), "<Row('数', '量')>")
|
||||
else:
|
||||
self.assertEqual(repr(Row(u"数", u"量")), r"<Row(u'\u6570', u'\u91cf')>")
|
||||
|
||||
def test_empty_row(self):
# Calling Row() with no arguments must succeed and yield a row of
# length zero.
|
||||
row = Row()
|
||||
self.assertEqual(len(row), 0)
|
||||
|
|
|
@ -1435,13 +1435,24 @@ class Row(tuple):
|
|||
|
||||
>>> Person = Row("name", "age")
|
||||
>>> Person
|
||||
<Row(name, age)>
|
||||
<Row('name', 'age')>
|
||||
>>> 'name' in Person
|
||||
True
|
||||
>>> 'wrong_key' in Person
|
||||
False
|
||||
>>> Person("Alice", 11)
|
||||
Row(name='Alice', age=11)
|
||||
|
||||
This form can also be used to create rows as tuple values, i.e. with unnamed
|
||||
fields. Beware that such Row objects have different equality semantics:
|
||||
|
||||
>>> row1 = Row("Alice", 11)
|
||||
>>> row2 = Row(name="Alice", age=11)
|
||||
>>> row1 == row2
|
||||
False
|
||||
>>> row3 = Row(a="Alice", b=11)
|
||||
>>> row1 == row3
|
||||
True
|
||||
"""
|
||||
|
||||
def __new__(self, *args, **kwargs):
|
||||
|
@ -1549,7 +1560,7 @@ class Row(tuple):
|
|||
return "Row(%s)" % ", ".join("%s=%r" % (k, v)
|
||||
for k, v in zip(self.__fields__, tuple(self)))
|
||||
else:
|
||||
return "<Row(%s)>" % ", ".join(self)
|
||||
return "<Row(%s)>" % ", ".join("%r" % field for field in self)
|
||||
|
||||
|
||||
class DateConverter(object):
|
||||
|
|
Loading…
Reference in a new issue