-
Type: Bug
-
Resolution: Fixed
-
Priority: Unknown
-
Affects Version/s: None
-
Component/s: None
-
None
As reported in https://github.com/mongodb-labs/mongo-arrow/issues/208.
We should be able to handle the following:
# Reproduction script: compare pyarrow's JSON reader with pymongoarrow's
# find_arrow_all on a nested list field that is populated in one document
# and empty in the other.
from pymongo import MongoClient
import pymongoarrow.api as pmaapi
import pyarrow.parquet as papq
import pyarrow.json as pajson
import io
import json
import bson

client = MongoClient()

# Start from an empty collection so only the two documents below exist.
collection = client.testdb.data
collection.drop()
client.testdb.data.insert_many([
    {'_id': 1, 'foo': {'bar': ['1', '2']}},
    {'_id': 2, 'foo': {'bar': []}},
])

# --- Document 1: nested list with string items ---

# get document out of mongo, put it in a file and read it with pyarrow and write it to parquet
doc1 = client.testdb.data.find_one({'_id': 1})
string1 = bson.json_util.dumps(doc1, indent=2)
file1 = io.BytesIO(bytes(string1, encoding='utf-8'))
papatable1 = pajson.read_json(file1)
print(str(papatable1))
papq.write_table(papatable1, 'pyarrow' + str(1) + '.parquet')

# read document with pymongoarrow and write it to parquet
pmapatable1 = pmaapi.find_arrow_all(client.testdb.data, {'_id': {'$eq': 1}})
print(str(pmapatable1))
papq.write_table(pmapatable1, 'pymongoarrow' + str(1) + '.parquet')

# --- Document 2: nested list is empty ---

# Same pyarrow round trip as for document 1.
doc2 = client.testdb.data.find_one({'_id': 2})
string2 = bson.json_util.dumps(doc2, indent=2)
file2 = io.BytesIO(bytes(string2, encoding='utf-8'))
papatable2 = pajson.read_json(file2)
print(str(papatable2))
papq.write_table(papatable2, 'pyarrow' + str(2) + '.parquet')

# Same pymongoarrow read as for document 1, but against the document whose
# nested list is empty. The statement is kept verbatim so it matches the
# source line quoted in the reported traceback.
pmapatable2 = pmaapi.find_arrow_all(client.testdb.data,{'_id': {'$eq': 2}})
papq.write_table(pmapatable2, 'pymongoarrow' + str(2) + '.parquet')
Instead it produces:
$ python repro.py
pyarrow.Table
_id: int64
foo: struct<bar: list<item: string>>
  child 0, bar: list<item: string>
      child 0, item: string
----
_id: [[1]]
foo: [
  -- is_valid: all not null
  -- child 0 type: list<item: string>
[["1","2"]]]
pyarrow.Table
_id: int32
foo: struct<bar: list<item: string>>
  child 0, bar: list<item: string>
      child 0, item: string
----
_id: [[1]]
foo: [
  -- is_valid: all not null
  -- child 0 type: list<item: string>
[["1","2"]]]
pyarrow.Table
_id: int64
foo: struct<bar: list<item: null>>
  child 0, bar: list<item: null>
      child 0, item: null
----
_id: [[2]]
foo: [
  -- is_valid: all not null
  -- child 0 type: list<item: null>
[0 nulls]]
Traceback (most recent call last):
  File "/workspaces/vscode-python/pymongoarrow/repro.py", line 45, in <module>
    pmapatable2 = pmaapi.find_arrow_all(client.testdb.data,{'_id': {'$eq': 2}})
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/Envs/pma1/lib/python3.11/site-packages/pymongoarrow/api.py", line 112, in find_arrow_all
    process_bson_stream(batch, context)
  File "pymongoarrow/lib.pyx", line 159, in pymongoarrow.lib.process_bson_stream
  File "pymongoarrow/lib.pyx", line 246, in pymongoarrow.lib.process_raw_bson_stream
  File "pymongoarrow/lib.pyx", line 133, in pymongoarrow.lib.extract_document_dtype
  File "pymongoarrow/lib.pyx", line 108, in pymongoarrow.lib.extract_field_dtype
  File "pyarrow/types.pxi", line 4452, in pyarrow.lib.list_
TypeError: List requires DataType or Field
- has to be finished together with
-
INTPYTHON-165 Auto schema detection can yield different table on missing values
- Closed