%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display, HTML
# Show (practically) every row and column when a DataFrame is rendered.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

# Load the tab-separated input file. 'Timestamp' is parsed as a datetime and
# 'Kind' is kept as text, so a value such as "10.20" stays a string.
data = pd.read_csv(
    'data.tsv',
    dialect='excel-tab',
    dtype={'Kind': str},
    parse_dates=['Timestamp'],
)
data.head(3)
# Supported CSV dialects: list the dialect names registered with the csv
# module (the 'excel-tab' name passed to read_csv above is one of them).
import csv
csv.list_dialects()
# A single column of a DataFrame is a Series.
counts = data['TotalCount']

# Element-wise transformation: apply() calls the function once per value and
# collects the results into a new Series (here: True where the count is even).
even_counts = counts.apply(lambda count: count % 2 == 0)

# Transposing a DataFrame swaps its rows and columns.
transposed = data.T

# Comparing a column with a scalar yields a boolean Series,
# e.g. [True, True, False, ...]; indexing with it keeps only the True rows.
filter_vec = data['Kind'].eq("10.20")
filter_vec
data.loc[filter_vec]
You can also construct the filter vector from an arbitrary function by using `.apply()`:
# 'Kind' holds strings, so this is a lexicographic comparison, evaluated one
# value at a time by apply().
data[data['Kind'].apply(lambda kind: kind > '10.27')]
Filter vectors can be combined using `vec1 | vec2`:
# Keep the rows where either condition holds. `|` combines the two boolean
# Series element-wise.
data[
    data['Kind'].eq('10.31')
    | data['sequence_number'].eq(808318967)
]
# Add a derived boolean column (equivalent to: data.loc[:, 'NewCol'] = ...).
data['NewCol'] = data['Kind'] > '10.27'
display(data.head(2))

# Keep a handle on the sequence numbers in case they are needed later, then
# remove both columns by re-selecting only the columns we want to keep.
seqnums = data['sequence_number']
cols = [name for name in data.columns if name not in ('NewCol', 'sequence_number')]
data = data[cols]

# value_counts() tallies how often each distinct value occurs.
x = data['Kind'].value_counts()
display(x)
display(type(x))

# The distinct values themselves, without counts.
data['Kind'].unique()
`Series.combine(other, func)` should serve the general purpose, but it seems that the return type of `func` is forced to be the same as that of the Series being operated on.
Otherwise, division and multiplication are both "numpy"-flavored (element-wise): `/` and `*`:
# Arithmetic between two columns is element-wise; the result is stored as a
# new column of the DataFrame.
aveCount = data['NormalCount'].div(data['TotalCount'])
data["AverageCount"] = aveCount
data.head(2)
# Split the rows into one group per distinct 'Kind'.
# A list of column names can be passed instead to group by several columns.
grouped = data.groupby('Kind')
display("Type of grouped is {0}".format(type(grouped)))

# Iterating over the grouped object yields (key, sub-DataFrame) pairs.
for key, group in grouped:
    print("==========")
    print("KEY IS: {0}".format(key))
    print("GROUP CONTENT:")
    print(group)
Aggregation can be tricky. Refer to the pandas aggregation documentation.
# Select a single column first, then aggregate it several ways at once.
# A list of aggregation names yields one output column per name ('sum', 'min').
# (Passing a dict of {name: function} to a single-column agg() was deprecated
# and raises SpecificationError in pandas >= 1.0.)
grouped['TotalCount'].agg(['sum', 'min'])
def f(group):
    """Print ``group`` and return the constant string "FOO"."""
    print(group)
    return "FOO"


# this is wrong:
# grouped.agg(f)
# Alternatively, aggregate one column at a time and combine the results into a
# DataFrame. The aggregation is named by string ('sum') rather than by passing
# the Python builtin: newer pandas versions warn that a builtin callable will
# stop being mapped to the groupby sum.
total = grouped['TotalCount'].agg('sum')  # result is a Series
display(total)
normal = grouped['NormalCount'].agg('sum')
display(normal)

new_data = pd.DataFrame({
    'total': total,
    'normal': normal,
    'ave': normal / total,
})
display(new_data)

# To control the column order, add the columns one at a time:
new_data = pd.DataFrame({'total': total})
new_data['normal'] = normal
new_data['ave'] = normal / total
display(new_data)
# seaborn is used here only to build the colormap.
import seaborn as sns
# Light-to-green palette, requested as a colormap object (as_cmap=True).
cm = sns.light_palette("green", as_cmap=True)
# Conditional formatting: shade each cell's background according to its value.
new_data.style.background_gradient(cmap=cm)