pip install pandas
pip install seaborn
pip install folium
pip install numexpr
Copy fragments of code from slides to python console/IPython
or download complete code using link at a slide corner.
Please ask lots of questions!
After the introductory part,
we will run a freestyle data hacking session.
OPEN DATA: information that is available for anyone to use, for any purpose, at no cost
import pandas
# read CSV file to a data frame
data = pandas.read_csv("payments.csv")
# write data frame to a CSV file
# NOTE: this overwrites the file just read (round-trip demonstration);
# the written file also gains an extra index column unless index=False is passed
data.to_csv("payments.csv")
Other supported I/O formats: JSON, HDF5, LaTeX, HTML, SQL databases, Excel, SAS (.xpt), Stata (.dta), ...
import pandas
# first attempt: read the raw file and inspect column types
data = pandas.read_csv("january_2016.csv")
data.info()
# skip first two rows (file starts with a non-data header)
data = pandas.read_csv("january_2016.csv", skiprows=2)
data.info()
# parse numbers with a thousands separator so amounts become numeric
data = pandas.read_csv("january_2016.csv", skiprows=2, thousands=",")
data.info()
# replace spaces in column names with underscores
# (enables attribute access like data.Paid_Date and query() expressions)
data.columns = data.columns.str.replace(" ", "_")
# show the values in dates column
data["Paid_Date"].unique()
# convert strings to date (DD/MM/YYYY)
data["Paid_Date"] = pandas.to_datetime(data["Paid_Date"], format="%d/%m/%Y")
# convert strings to date, set NaT for invalid/missing values
# (errors="coerce" turns unparseable entries into NaT instead of raising)
data["Paid_Date"] = pandas.to_datetime(data["Paid_Date"], format="%d/%m/%Y", errors="coerce")
data.info()
print(data.Paid_Date[0])
# drop every row that contains at least one missing value
data = data.dropna()
# show basic statistics for each numerical column
data.describe()
# show frequency of each value in the Directorate column
data["Directorate"].value_counts()
# show high value payments
data.query("Total > 1000000")
# complex queries (column names with underscores work inside query strings)
data.query("Group_Description == 'Revenue Income' and Total > 10000")
# alternative syntax: boolean masks combined with & (note the parentheses)
data[(data.Group_Description == "Revenue Income") & (data.Total > 10000)]
# first 5 rows
data[:5] # data.head()
# fifth row
# NOTE: .ix was deprecated in pandas 0.20 and removed in 1.0;
# use .iloc for positional access and .loc for label access
data.iloc[5]
# third column of the fifth row
data.iloc[5, 3]
data.iloc[5].iloc[3]
# default index (a plain RangeIndex of row numbers)
data.index
# use date column as index
indexed_data = data.set_index("Paid_Date")
indexed_data.info()
# get all rows with given index label
indexed_data.loc["2016-01-06"]
# get all rows with index in given range (label slicing is inclusive)
indexed_data.loc["2016-01-12":"2016-01-14"]
# group rows by one of the columns
grouped = data.groupby("Supplier_Name")
# all groups as a dictionary (group key -> row labels)
grouped.groups
# single group
grouped.get_group("ROYAL MAIL")
import numpy
# built-in aggregation functions
grouped.size()
grouped.first()
# custom aggregations: pass function names as strings
# (passing numpy callables like numpy.sum to aggregate is deprecated
# in modern pandas; string names dispatch to the same optimized paths)
grouped.aggregate("sum")
grouped.Total.aggregate(["mean", "std"])
# modify members of the group: fill missing values with the group mean
grouped.transform(lambda x: x.fillna(x.mean()))
# direct transformation (vectorized arithmetic on a whole column)
data.Total * 1000
# transformation with function
data.Total.apply(abs)
import calendar
# add columns using the vectorized .dt accessor
# (Series.dt.week / Timestamp.week were removed from pandas;
# isocalendar().week is the documented replacement and gives the same ISO week)
data["week"] = data.Paid_Date.dt.isocalendar().week
# dt.day_name() yields the same English names as calendar.day_name lookups
data["day"] = data.Paid_Date.dt.day_name()
# sum up payments for each day of week
# (numeric_only=True: pandas >= 2.0 no longer silently drops text columns)
weekdays = data.groupby([data.week, data.day]).sum(numeric_only=True)
# multi-level index!
weekdays.index
# pivot inner level of the index to the column labels
weekdays.unstack()["Total"]
# reshape into a new table
d = pandas.pivot_table(data, values="Total", index="Paid_Date", columns="Directorate")
# unpivot from wide format to long format
pandas.melt(d)
# pivot column labels to the inner level of the index
d.stack()
# read CSV file to a data frame
new_data = pandas.read_csv("february_2016.csv", skiprows=2, thousands=",")
# replace spaces in column names with underscores
new_data.columns = new_data.columns.str.replace(" ", "_")
# convert strings to date, set NaT for invalid/missing values
# (note: February's file uses a different date format, DD-Mon-YY)
new_data.Paid_Date = pandas.to_datetime(new_data.Paid_Date, format="%d-%b-%y",
                                        errors="coerce")
# remove empty column (drop columns that are entirely NaN)
new_data = new_data.dropna(how="all", axis=1)
# remove all rows with missing values
new_data = new_data.dropna()
# concatenate the data frames, use new index
pandas.concat([data, new_data], ignore_index=True)
# add rows
# (DataFrame.append was removed in pandas 2.0 -- concat is the replacement)
pandas.concat([data, more_data])
# add columns
pandas.concat([data, more_data], axis=1)
# merge data on index
data.join(more_data, how="inner")
# merge data on specific attributes
pandas.merge(data, more_data, left_on="label", right_on="tag", how="outer")
import seaborn
# global styling presets for all subsequent plots
seaborn.set_style("whitegrid")
seaborn.set_context("paper")
plot = seaborn.countplot(x="Directorate", data=data)
plot.figure.savefig("count_plot.pdf")
# fix labels overlap, shrink margins
# (seaborn.plt was never public API and has been removed;
# import matplotlib.pyplot directly to control the figure size)
import matplotlib.pyplot
matplotlib.pyplot.subplots(figsize=(8, 4))
plot = seaborn.countplot(x="Directorate", data=data)
plot.figure.savefig("count_plot_improved.pdf", bbox_inches="tight")
plot.figure.clear()
# split into categories, plot bars horizontally
plot = seaborn.countplot(y="Directorate", hue="Cap/Rev", data=data)
plot.figure.savefig("count_plot_categories.pdf", bbox_inches="tight")
plot.figure.clear()
# add extra columns
import calendar
# vectorized .dt accessor (Series.dt.week was removed from pandas;
# isocalendar().week returns the same ISO week number)
data["week"] = data.Paid_Date.dt.isocalendar().week
data["day"] = data.Paid_Date.dt.day_name()
# mean payment value for each day of week (barplot aggregates with mean by default)
plot = seaborn.barplot(x="day", y="Total", data=data)
plot.figure.savefig("bar_plot.pdf", bbox_inches="tight")
plot.figure.clear()
# sum up payments for each day of week
# (numeric_only=True: pandas >= 2.0 refuses to sum text columns)
weekdays = data.groupby([data.week, data.day]).sum(numeric_only=True)
# pivot inner level of the index to the column labels
new_data = weekdays.unstack()["Total"]
# draw a heatmap
plot = seaborn.heatmap(new_data, annot=True, fmt=".0f", linewidths=0.5)
plot.figure.savefig("heatmap.pdf", bbox_inches="tight")
plot.figure.clear()
# specify order of columns
new_data = new_data[["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]]
small_payments = data.query("0 < Total < 10000")
# histogram + kernel density estimation
# (distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the documented replacement)
plot = seaborn.histplot(small_payments.Total, kde=True)
plot.figure.savefig("distribution.pdf", bbox_inches="tight")
plot.figure.clear()
# box plot
plot = seaborn.boxplot(x="Total", y="Directorate", data=small_payments)
plot.figure.savefig("box_plot.pdf", bbox_inches="tight")
plot.figure.clear()
# violin plot
plot = seaborn.violinplot(x="Total", y="Directorate", data=small_payments)
plot.figure.savefig("violin_plot.pdf", bbox_inches="tight")
plot.figure.clear()
small_payments = data.query("0 < Total < 10000")
# grid of distribution plots, one panel per Directorate
grid = seaborn.FacetGrid(small_payments, col="Directorate", hue="Directorate",
                         col_wrap=3, sharex=False, sharey=True)
# distplot was removed from seaborn -- map histplot with KDE instead
grid.map(seaborn.histplot, "Total", kde=True)
grid.set_titles("{col_name}")
grid.set(xlabel="", xlim=(0, 10000))
grid.savefig("grid_distribution.pdf", bbox_inches="tight")
grid.fig.clear()
# DataFrame.sort was removed in pandas 0.20 -- sort_values is the replacement
small_payments = small_payments.sort_values(["week", "day"])
# 2D grid of box plots: columns are days, rows are weeks
grid = seaborn.FacetGrid(small_payments, col="day", row="week",
                         sharex=True, sharey=True)
grid.map(seaborn.boxplot, "Total", "Directorate")
grid.set_xlabels("")
grid.despine(left=True, bottom=True)
grid.savefig("grid_box_plot.pdf", bbox_inches="tight")
grid.fig.clear()
# read data from a CSV file
data = pandas.read_csv("libraries.csv")
import folium
# find geometrical center of all libraries
location = data.Latitude.mean(), data.Longitude.mean()
# draw a map using tiles from the Open Street Map project
osm = folium.Map(location, zoom_start=13)
osm.save("map.html")
# mark libraries on the map, use name as the popup information
for lat, lon, name in zip(data.Latitude, data.Longitude, data.Library):
    folium.Marker((lat, lon), popup=name).add_to(osm)
# overwrites the empty map saved above with the annotated version
osm.save("map.html")
# mark libraries as circles with size proportional to number of computers
osm = folium.Map(location, zoom_start=15)
for lat, lon, name, pcs in zip(data.Latitude, data.Longitude,
                               data.Library, data["No of PCs"]):
    # fill=True is required in current folium, otherwise fill_color is ignored
    folium.CircleMarker((lat, lon), popup=name, radius=pcs,
                        color="red", fill=True,
                        fill_color="green").add_to(osm)
osm.save("map_circles.html")