In [31]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

Table of Contents

Tips

  • Use <tab> for completion
  • Select text, then press ( to wrap the selection in parentheses

Setup

In [32]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display, HTML
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
In [34]:
data = pd.read_csv(
    'data.tsv', 
    parse_dates=['Timestamp'],
    dtype={'Kind': str},
    dialect='excel-tab')
data[:3]
Out[34]:
   sequence_number           Timestamp   Kind  NormalCount  TotalCount
0        484828304 2016-06-18 00:49:34  10.27            3          10
1       1006221952 2016-04-13 17:31:09  10.31            6          14
2        227783421 2016-03-20 11:10:28  10.27            5          13
In [50]:
# supported CSV dialects:
import csv
csv.list_dialects()
Out[50]:
['excel-tab', 'excel', 'unix']

Column selection, transformation

In [35]:
# a column is a Series:
counts = data['TotalCount']

# transformation:
even_counts = counts.apply(lambda x: x % 2 == 0)

# transpose a DataFrame:
transposed = data.transpose()
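
Note: the same transformation could also be written as a vectorized expression, which is usually faster than apply() (a minimal sketch using the counts series above):

even_counts = counts % 2 == 0   # same boolean result as the apply() above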

Filtering

In [36]:
filter_vec = data['Kind'] == "10.20"  # result: [True, True, False, ...]
filter_vec
Out[36]:
0    False
1    False
2    False
3     True
4    False
5     True
6    False
7    False
8    False
Name: Kind, dtype: bool
In [37]:
data[filter_vec]
Out[37]:
   sequence_number           Timestamp   Kind  NormalCount  TotalCount
3       2112888815 2016-06-27 10:28:03  10.20            3          15
5        808318967 2016-04-10 21:11:11  10.20            4          12

You can also construct the filtering vector from an arbitrary function using .apply():

In [38]:
data[ data['Kind'].apply(lambda x: x > '10.27') ]
Out[38]:
   sequence_number           Timestamp   Kind  NormalCount  TotalCount
1       1006221952 2016-04-13 17:31:09  10.31            6          14

Filter vectors can be combined using vec1 | vec2:

In [39]:
data[ (data['Kind'] == '10.31') |
      (data['sequence_number'] == 808318967) ]
Out[39]:
   sequence_number           Timestamp   Kind  NormalCount  TotalCount
1       1006221952 2016-04-13 17:31:09  10.31            6          14
5        808318967 2016-04-10 21:11:11  10.20            4          12
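
Filters that must hold simultaneously are combined with & instead, and ~ negates a filter vector; each comparison needs its own parentheses because & and | bind more tightly than ==. A minimal sketch on the same frame:

# both conditions must hold:
data[ (data['Kind'] == '10.27') & (data['NormalCount'] == 5) ]

# ~ negates a filter vector:
data[ ~(data['Kind'] == '10.27') ]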

Adding and removing columns

In [40]:
data.loc[:, 'NewCol'] = data['Kind'] > '10.27'
# or: data['NewCol'] = data['Kind'] > '10.27'
display(data[:2])

cols = list(data.columns)
cols.remove('NewCol')
seqnums = data['sequence_number'] # remember it in case we need it later
cols.remove('sequence_number')

# keep only the remaining columns (drops NewCol and sequence_number)
data = data[ cols ]
   sequence_number           Timestamp   Kind  NormalCount  TotalCount  NewCol
0        484828304 2016-06-18 00:49:34  10.27            3          10   False
1       1006221952 2016-04-13 17:31:09  10.31            6          14    True
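
An equivalent way to drop the two columns would be DataFrame.drop, which returns a copy without the named labels (a minimal sketch; axis=1 selects columns):

data = data.drop(['NewCol', 'sequence_number'], axis=1)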

Count unique column values

In [41]:
x = data['Kind'].value_counts()
display(x)
display(type(x))
10.27    4
10.20    2
10.24    2
10.31    1
Name: Kind, dtype: int64
pandas.core.series.Series
In [42]:
data['Kind'].unique()
Out[42]:
array(['10.27', '10.31', '10.20', '10.24'], dtype=object)

Column-wise operations

Series.combine(other, func) should serve the general purpose, but it seems the result is coerced to the dtype of the series it is called on.
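
A minimal sketch of combine() on two toy Series (the function receives one aligned value from each series):

s1 = pd.Series([1, 2, 3])
s2 = pd.Series([30, 20, 10])
s1.combine(s2, lambda a, b: max(a, b))   # element-wise max: 30, 20, 10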

Otherwise, division and multiplication are "numpy"-flavored element-wise operators, / and *:

In [43]:
aveCount = data['NormalCount'] / data['TotalCount']
data["AverageCount"] = aveCount
data[:2]
Out[43]:
            Timestamp   Kind  NormalCount  TotalCount  AverageCount
0 2016-06-18 00:49:34  10.27            3          10      0.300000
1 2016-04-13 17:31:09  10.31            6          14      0.428571

Grouping

See the pandas groupby documentation.

In [44]:
grouped = data.groupby('Kind') # groupby also accepts a list of columns
display("Type of grouped is {0}".format(type(grouped)))

# iterating:
for key, group in grouped:
    print("==========")
    print("KEY IS: {0}".format(key))
    print("GROUP CONTENT:")
    print(group)
"Type of grouped is <class 'pandas.core.groupby.DataFrameGroupBy'>"
==========
KEY IS: 10.20
GROUP CONTENT:
            Timestamp   Kind  NormalCount  TotalCount  AverageCount
3 2016-06-27 10:28:03  10.20            3          15      0.200000
5 2016-04-10 21:11:11  10.20            4          12      0.333333
==========
KEY IS: 10.24
GROUP CONTENT:
            Timestamp   Kind  NormalCount  TotalCount  AverageCount
6 2016-05-18 20:29:01  10.24            3          14      0.214286
7 2016-03-24 18:36:10  10.24            6          15      0.400000
==========
KEY IS: 10.27
GROUP CONTENT:
            Timestamp   Kind  NormalCount  TotalCount  AverageCount
0 2016-06-18 00:49:34  10.27            3          10      0.300000
2 2016-03-20 11:10:28  10.27            5          13      0.384615
4 2016-03-29 11:48:52  10.27            5          16      0.312500
8 2016-05-22 20:26:01  10.27            5          16      0.312500
==========
KEY IS: 10.31
GROUP CONTENT:
            Timestamp   Kind  NormalCount  TotalCount  AverageCount
1 2016-04-13 17:31:09  10.31            6          14      0.428571
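
To pull out a single group without iterating, get_group() with one of the keys above should work:

grouped.get_group('10.27')   # the sub-frame where Kind == '10.27'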

Aggregation

It can be tricky; refer to the pandas aggregation documentation.

In [45]:
# Select a single column before agg(); the dict keys become the output column names:
grouped['TotalCount'].agg({
        'sum': lambda group: sum(group),
        'min': lambda group: min(group)})
Out[45]:
       min  sum
Kind           
10.20   12   27
10.24   14   29
10.27   10   55
10.31   14   14
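
Note that newer pandas versions (0.25+) dropped the dict-renaming form used above; as a sketch of the replacement (assuming such a version), named aggregation passes the output names as keywords:

grouped['TotalCount'].agg(total_sum='sum', total_min='min')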
In [46]:
def f(group):
    print(group)
    return "FOO"

# this is wrong: agg() calls f once per column of each group
# and expects a scalar reduction, not an arbitrary per-group result
# grouped.agg(f)
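
For an arbitrary per-group function, apply() is the usual escape hatch: it receives each group as a DataFrame and may return a scalar, a Series, or a DataFrame. A minimal sketch:

grouped.apply(lambda group: len(group))   # one scalar per group: its row count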
In [47]:
# alternatively, construct per-column aggregations, then combine them into a DataFrame:

total = grouped['TotalCount'].agg(sum) # result is a Series
display(total)

normal = grouped['NormalCount'].agg(sum)
display(normal)

new_data = pd.DataFrame({
        'total': total,
        'normal': normal,
        'ave': normal/total})
display(new_data)
Kind
10.20    27
10.24    29
10.27    55
10.31    14
Name: TotalCount, dtype: int64
Kind
10.20     7
10.24     9
10.27    18
10.31     6
Name: NormalCount, dtype: int64
            ave  normal  total
Kind                          
10.20  0.259259       7     27
10.24  0.310345       9     29
10.27  0.327273      18     55
10.31  0.428571       6     14
In [48]:
# To order the columns:
new_data = pd.DataFrame({'total' : total})
new_data['normal'] = normal
new_data['ave'] = normal / total

display(new_data)
       total  normal       ave
Kind                          
10.20     27       7  0.259259
10.24     29       9  0.310345
10.27     55      18  0.327273
10.31     14       6  0.428571
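
Another option (a sketch) is to build the frame and then select the columns in the desired order:

new_data = pd.DataFrame({'total': total, 'normal': normal, 'ave': normal / total})
new_data = new_data[['total', 'normal', 'ave']]   # reorder by selecting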

Visualization

For table coloring you may need seaborn; to install it, use conda install seaborn.

In [49]:
import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)

# conditional formatting: shade cells by value within each column
new_data.style.background_gradient(cmap=cm)
Out[49]:
       total  normal       ave
Kind                          
10.20     27       7  0.259259
10.24     29       9  0.310345
10.27     55      18  0.327273
10.31     14       6  0.428571

(rendered in the notebook as a green background gradient over the table above)