Skip to content

Commit 8aa5cb1

Browse files
committed
Add support for pandas
1 parent 240826c commit 8aa5cb1

File tree

12 files changed

+617
-10
lines changed

12 files changed

+617
-10
lines changed

CHANGELOG.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
------------------------------------------------------------------------------
2+
qPython 1.0 RC1 [2014.10.22]
3+
------------------------------------------------------------------------------
4+
5+
- Introduce support for pandas
6+
17
------------------------------------------------------------------------------
28
qPython 1.0 Beta 6 [2014.10.16]
39
------------------------------------------------------------------------------

doc/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
sphinx>=1.2.3
12
mock>=1.0.1

doc/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Welcome to qPython's documentation!
1212
connection
1313
queries
1414
type-conversion
15+
pandas
1516
usage-examples
1617

1718

doc/source/pandas.rst

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
Pandas integration
2+
==================
3+
4+
`qPython` allows users to use ``pandas.DataFrame`` and ``pandas.Series``
5+
instead of ``numpy.recarray`` and ``numpy.ndarray`` to represent ``q`` tables
6+
and vectors.
7+
8+
In order to instruct `qPython` to use `pandas`_ data types, the user has to set the
9+
``pandas`` flag while:
10+
11+
- creating :class:`.qconnection.QConnection` instance,
12+
- executing synchronous query: :meth:`~qpython.qconnection.QConnection.sync`,
13+
- or retrieving data from q: :meth:`~qpython.qconnection.QConnection.receive`.
14+
15+
For example:
16+
::
17+
18+
>>> with qconnection.QConnection(host = 'localhost', port = 5000, pandas = True) as q:
19+
>>> ds = q('(1i;0Ni;3i)', pandas = True)
20+
>>> print ds
21+
0 1
22+
1 NaN
23+
2 3
24+
dtype: float64
25+
>>> print ds.meta
26+
metadata(qtype=6)
27+
28+
>>> df = q('flip `name`iq`fullname!(`Dent`Beeblebrox`Prefect;98 42 126;("Arthur Dent"; "Zaphod Beeblebrox"; "Ford Prefect"))')
29+
>>> print df
30+
name iq fullname
31+
0 Dent 98 Arthur Dent
32+
1 Beeblebrox 42 Zaphod Beeblebrox
33+
2 Prefect 126 Ford Prefect
34+
>>> print df.meta
35+
metadata(iq=7, fullname=0, qtype=98, name=11)
36+
>>> print q('type', df)
37+
98
38+
39+
>>> df = q('([eid:1001 0N 1003;sym:`foo`bar`] pos:`d1`d2`d3;dates:(2001.01.01;2000.05.01;0Nd))')
40+
>>> print df
41+
pos dates
42+
eid sym
43+
1001 foo d1 2001-01-01
44+
NaN bar d2 2000-05-01
45+
1003 d3 NaT
46+
>>> print df.meta
47+
metadata(dates=14, qtype=99, eid=7, sym=11, pos=11)
48+
>>> print q('type', df)
49+
99
50+
51+
52+
Data conversions
53+
****************
54+
55+
If the ``pandas`` flag is set, `qPython` converts the data according to the following
56+
rules:
57+
58+
- ``q`` vectors are represented as ``pandas.Series``:
59+
60+
- ``pandas.Series`` is initialized with ``numpy.ndarray`` being result of
61+
parsing with ``numpy_temporals`` flag set to ``True`` (to ensure that
62+
temporal vectors are represented as numpy ``datetime64``/``timedelta64``
63+
arrays).
64+
- q nulls are replaced with ``numpy.NaN``. This can result in type promotion
65+
as described in `pandas documentation <http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na>`_.
66+
- ``pandas.Series`` is enriched with custom attribute ``meta``
67+
(:class:`qpython.MetaData`), which contains `qtype` of the vector. Note
68+
that this information is used while serializing a ``pandas.Series`` instance
69+
to IPC protocol.
70+
71+
72+
- tables are represented as ``pandas.DataFrame`` instances:
73+
74+
- individual columns are represented as ``pandas.Series``.
75+
- ``pandas.DataFrame`` is enriched with custom attribute ``meta``
76+
(:class:`qpython.MetaData`), which lists `qtype` for each column in table.
77+
Note that this information is used during ``pandas.DataFrame`` serialization.
78+
79+
- keyed tables are backed as ``pandas.DataFrame`` instances as well:
80+
81+
- index for ``pandas.DataFrame`` is created from key columns.
82+
- ``pandas.DataFrame`` is enriched with custom attribute ``meta``
83+
(:class:`qpython.MetaData`), which lists `qtype` for each column in table,
84+
including index ones. Note that this information is used during
85+
``pandas.DataFrame`` serialization.
86+
87+
88+
.. _pandas: http://pandas.pydata.org/
89+

qpython/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ def __getattr__(self, attr):
3939
def __getitem__(self, key):
    '''
    Return the value stored under `key` in the instance dictionary, or
    ``None`` when there is no such entry.
    '''
    try:
        return self.__dict__[key]
    except KeyError:
        return None
4141

42+
def __setitem__(self, key, value):
    '''
    Store `value` under `key` directly in the instance dictionary.
    '''
    attributes = self.__dict__
    attributes[key] = value
44+
4245
def as_dict(self):
    '''
    Return a shallow copy of the instance dictionary; changing the returned
    dictionary does not modify this object.
    '''
    return dict(self.__dict__)
4447

qpython/_pandas.py

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
#
2+
# Copyright (c) 2011-2014 Exxeleron GmbH
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
import pandas
18+
import struct
19+
20+
from collections import OrderedDict
21+
22+
from qpython import MetaData
23+
from qpython.qreader import QReader, READER_CONFIGURATION, QReaderException
24+
from qpython.qcollection import QDictionary, qlist
25+
from qpython.qwriter import QWriter, QWriterException
26+
from qpython.qtype import *
27+
28+
29+
30+
class PandasQReader(QReader):
    '''
    :class:`.QReader` extension which, when the ``pandas`` option is set,
    returns ``pandas.Series`` for vectors and ``pandas.DataFrame`` for tables
    and keyed tables. Every pandas object produced here carries a ``meta``
    attribute (:class:`qpython.MetaData`) with the source q type(s).

    When the ``pandas`` option is not set, each method delegates to the
    corresponding :class:`.QReader` implementation.
    '''

    # NOTE(review): the mapper wraps QReader._reader_map itself (not a copy),
    # so the handlers registered below presumably end up in the map shared
    # with QReader - confirm against Mapper.
    parse = Mapper(QReader._reader_map)

    @parse(QDICTIONARY)
    def _read_dictionary(self, qtype = QDICTIONARY, options = READER_CONFIGURATION):
        '''
        Read a dictionary or a keyed table.

        :Parameters:
         - `qtype` - type stamp of the object being read
         - `options` - reader options; the ``pandas`` flag selects the
           pandas-specific representation

        :returns: ``pandas.DataFrame`` indexed by the key columns if the keys
                  were read as a ``pandas.DataFrame``, otherwise a
                  :class:`.QDictionary`

        :raises: :class:`.QReaderException` if the keys are a
                 ``pandas.DataFrame`` but the values are not
        '''
        if not options.pandas:
            return QReader._read_dictionary(self, qtype = qtype, options = options)

        keys = self._read_object(options = options)
        values = self._read_object(options = options)

        if isinstance(keys, pandas.DataFrame):
            if not isinstance(values, pandas.DataFrame):
                raise QReaderException('Keyed table creation: values are expected to be of type pandas.DataFrame. Actual: %s' % type(values))

            # remember the key columns before the value columns are appended
            key_columns = list(keys.columns)

            # the keys frame is reused (and extended) as the resulting table
            table = keys
            table.meta.qtype = QKEYED_TABLE

            for column in values.columns:
                table[column] = values[column]
                table.meta[column] = values.meta[column]

            table.set_index(key_columns, inplace = True)
            return table

        # QDictionary is given plain numpy arrays rather than pandas.Series
        if isinstance(keys, pandas.Series):
            keys = keys.values
        if isinstance(values, pandas.Series):
            values = values.values

        return QDictionary(keys, values)


    @parse(QTABLE)
    def _read_table(self, qtype = QTABLE, options = READER_CONFIGURATION):
        '''
        Read a table.

        :Parameters:
         - `qtype` - type stamp of the object being read
         - `options` - reader options; the ``pandas`` flag selects the
           pandas-specific representation

        :returns: ``pandas.DataFrame`` whose ``meta`` attribute holds
                  ``qtype = QTABLE`` plus one entry per column with the q
                  type of that column
        '''
        if not options.pandas:
            return QReader._read_table(self, qtype = qtype, options = options)

        self._buffer.skip()  # ignore attributes
        self._buffer.skip()  # ignore dict type stamp

        columns = self._read_object(options = options)
        data = self._read_object(options = options)

        odict = OrderedDict()
        meta = MetaData(qtype = QTABLE)
        for name, column in zip(columns, data):
            if isinstance(column, str):
                # convert character list (represented as string) to numpy representation
                meta[name] = QSTRING
                odict[name] = numpy.array(list(column), dtype = str)
            elif isinstance(column, (list, tuple)):
                # convert general list to numpy representation
                meta[name] = QGENERAL_LIST
                odict[name] = numpy.array(list(column))
            else:
                # any other column is expected to carry its q type in `meta`
                meta[name] = column.meta.qtype
                odict[name] = column

        df = pandas.DataFrame(odict)
        df.meta = meta
        return df


    def _read_list(self, qtype, options):
        '''
        Read a typed vector.

        :Parameters:
         - `qtype` - type stamp of the vector being read
         - `options` - reader options; the ``pandas`` flag selects the
           pandas-specific representation

        :returns: ``pandas.Series`` with ``meta.qtype`` set to `qtype` if the
                  ``pandas`` option is set, otherwise the result of
                  :meth:`.QReader._read_list`
        '''
        if options.pandas:
            # NOTE: this sets the flag on the options object passed in by the
            # caller, so it stays set for anything read afterwards with the
            # same options instance.
            options.numpy_temporals = True

        vector = QReader._read_list(self, qtype = qtype, options = options)

        if not options.pandas:
            return vector

        series = pandas.Series(data = vector)
        if qtype != QSYMBOL_LIST:
            # substitute the q null of this type with NaN
            null = QNULLMAP[-abs(qtype)][1]
            series = series.replace(null, numpy.nan)

        series.meta = MetaData(qtype = qtype)
        return series
112+
113+
114+
115+
class PandasQWriter(QWriter):
    '''
    :class:`.QWriter` extension which registers serializers for
    ``pandas.Series`` and ``pandas.DataFrame``.
    '''

    # NOTE(review): the mapper wraps QWriter._writer_map itself (not a copy),
    # so the serializers registered below presumably end up in the map shared
    # with QWriter - confirm against Mapper.
    serialize = Mapper(QWriter._writer_map)

    @serialize(pandas.Series)
    def _write_pandas_series(self, data, qtype = None):
        '''
        Serialize a ``pandas.Series``.

        The q type is resolved in the following order: explicit `qtype`
        argument, ``data.meta.qtype``, the dtype of the series and finally the
        Python type of the first element. A ``'|S1'`` dtype always selects
        ``QCHAR``. An empty series with no resolvable type is written as an
        empty general list.

        :Parameters:
         - `data` (``pandas.Series``) - series to be serialized
         - `qtype` - optional q type hint; its sign is ignored

        :raises: :class:`.QWriterException` if the q type cannot be determined
        '''
        if qtype is not None:
            qtype = -abs(qtype)

        if qtype is None and hasattr(data, 'meta'):
            qtype = -abs(data.meta.qtype)

        if data.dtype == '|S1':
            qtype = QCHAR

        if qtype is None:
            qtype = Q_TYPE.get(data.dtype.type, None)

        if qtype is None and data.dtype.type in (numpy.datetime64, numpy.timedelta64):
            qtype = TEMPORAL_PY_TYPE.get(str(data.dtype), None)

        if qtype is None:
            # determine the type based on the first element; the lookup is
            # positional because the index need not contain the label 0
            if len(data) > 0:
                qtype = Q_TYPE.get(type(data.iloc[0]), QGENERAL_LIST)
            else:
                qtype = QGENERAL_LIST

        if qtype is None:
            raise QWriterException('Unable to serialize pandas series %s' % data)

        if qtype == QGENERAL_LIST:
            self._write_generic_list(data.values)
        elif qtype == QCHAR:
            self._write_string(data.values.astype(numpy.string_).tostring())
        elif data.dtype.type not in (numpy.datetime64, numpy.timedelta64):
            # missing values are written as the q null of the target type
            raw = data.fillna(QNULLMAP[-abs(qtype)][1]).values

            if PY_TYPE[qtype] != raw.dtype:
                raw = raw.astype(PY_TYPE[qtype])

            self._write_list(raw, qtype = qtype)
        else:
            raw = data.values.astype(TEMPORAL_Q_TYPE[qtype])
            self._write_list(raw, qtype = qtype)


    @serialize(pandas.DataFrame)
    def _write_pandas_data_frame(self, data, qtype = None):
        '''
        Serialize a ``pandas.DataFrame`` as a table or, if ``data.meta.qtype``
        equals ``QKEYED_TABLE``, as a keyed table whose key columns are taken
        from the index of the frame.

        The frame passed by the caller is not modified.

        :Parameters:
         - `data` (``pandas.DataFrame``) - data frame to be serialized
         - `qtype` - not used by this serializer
        '''
        data_columns = data.columns.values
        has_meta = hasattr(data, 'meta')

        if has_meta and data.meta.qtype == QKEYED_TABLE:
            # data frame represents keyed table
            self._buffer.write(struct.pack('=b', QDICTIONARY))
            self._buffer.write(struct.pack('=bxb', QTABLE, QDICTIONARY))
            index_columns = data.index.names
            self._write(qlist(numpy.array(index_columns), qtype = QSYMBOL_LIST))

            # key columns are read from a flattened copy, so that a failure
            # while writing cannot leave the caller's frame without its index
            flattened = data.reset_index()
            self._buffer.write(struct.pack('=bxi', QGENERAL_LIST, len(index_columns)))
            for column in index_columns:
                self._write_pandas_series(flattened[column], qtype = data.meta[column] if has_meta else None)

        self._buffer.write(struct.pack('=bxb', QTABLE, QDICTIONARY))
        self._write(qlist(numpy.array(data_columns), qtype = QSYMBOL_LIST))
        self._buffer.write(struct.pack('=bxi', QGENERAL_LIST, len(data_columns)))
        for column in data_columns:
            self._write_pandas_series(data[column], qtype = data.meta[column] if has_meta else None)
184+

qpython/qcollection.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,9 @@ class QDictionary(object):
210210
- `values` (`QList`, `QTable`, `tuple` or `list`) - dictionary values
211211
'''
212212
def __init__(self, keys, values):
213-
if not isinstance(keys, (QList, tuple, list)):
213+
if not isinstance(keys, (QList, tuple, list, numpy.ndarray)):
214214
raise ValueError('%s expects keys to be of type: QList, tuple or list. Actual type: %s' % (self.__class__.__name__, type(keys)))
215-
if not isinstance(values, (QTable, QList, tuple, list)):
215+
if not isinstance(values, (QTable, QList, tuple, list, numpy.ndarray)):
216216
raise ValueError('%s expects values to be of type: QTable, QList, tuple or list. Actual type: %s' % (self.__class__.__name__, type(values)))
217217
if len(keys) != len(values):
218218
raise ValueError('Number of keys: %d doesn`t match number of values: %d' % (len(keys), len(values)))

qpython/qreader.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030

3131

3232
READER_CONFIGURATION = MetaData(raw = False,
33-
numpy_temporals = False)
33+
numpy_temporals = False,
34+
pandas = False)
3435

3536

3637

@@ -106,6 +107,19 @@ class QReader(object):
106107
_reader_map = {}
107108
parse = Mapper(_reader_map)
108109

110+
111+
def __new__(cls, *args, **kwargs):
    '''
    Create the reader instance. When `QReader` itself is instantiated and
    the optional pandas binding can be imported, a `PandasQReader` is
    created instead; subclasses are instantiated as requested.

    Constructor arguments are accepted but not forwarded to
    ``object.__new__``: it rejects extra arguments when ``__new__`` is
    overridden (``TypeError`` on Python 3, ``DeprecationWarning`` on
    Python 2.6+).
    '''
    target = cls
    if cls is QReader:
        # try to load optional pandas binding
        try:
            from qpython._pandas import PandasQReader
        except ImportError:
            pass
        else:
            target = PandasQReader

    return super(QReader, cls).__new__(target)
121+
122+
109123
def __init__(self, stream):
110124
self._stream = stream
111125
self._buffer = QReader.BytesBuffer()
@@ -188,7 +202,7 @@ def read_data(self, message_size, is_compressed = False, **options):
188202
:returns: read data (parsed or raw byte form)
189203
'''
190204
options = MetaData(**READER_CONFIGURATION.union_dict(**options))
191-
205+
192206
if is_compressed:
193207
if self._stream:
194208
self._buffer.wrap(self._read_bytes(4))
@@ -290,10 +304,10 @@ def _read_list(self, qtype, options):
290304
data = numpy.fromstring(raw, dtype = conversion)
291305
if not self._is_native:
292306
data.byteswap(True)
293-
307+
294308
if qtype >= QTIMESTAMP_LIST and qtype <= QTIME_LIST and options.numpy_temporals:
295309
data = array_from_raw_qtemporal(data, qtype)
296-
310+
297311
return qlist(data, qtype = qtype, adjust_dtype = False)
298312
else:
299313
raise QReaderException('Unable to deserialize q type: %s' % hex(qtype))
@@ -536,3 +550,4 @@ def get_symbols(self, count):
536550

537551
return raw.split('\x00')
538552

553+

0 commit comments

Comments
 (0)