import pandas as pd df[“a”] = df[“a”].replace(old, new) df = pd.DataFrame(np.random.random(size = (5, 3)), with open(“sample.csv”, “w”) as fp:
import numpy as np columns = list(“abc”)) csv_pointer = csv.writer(fp)
import matplotlib.pyplot as plt Drop Duplicates (CLEAN) df = pd.DataFrame(np.random.randint(0, 2, size = csv_pointer.writerow(“x”)
# %matplotlib inline only in Jupyter df.drop_duplicates(subset = “A”) (10, 3)))
plt.style.use(“ggplot”) subset will narrow the columns df = pd.DataFrame(np.random.RandomState(8765). Creating Directory
import random randint(1, 101, size = (5, 3)), columns = [“a”, “b”, “c”]) if not os.path.exists(“sub”):
import os Addition os.mkdir(“sub”)
import csv print(df.sum()) DateTimeIndex os.rename(“salaries_sample.csv”, “sub” + os.sep +
Subtraction dti = pd.date_range(start = “2015-01-01”, end = “salaries.sample.csv”)
Version of pandas Imported df.sub(constant, axis = 0) or axis = 1 “2015-12-31”, freq = “B”)
print(pd.__version__) df[df.index.weekday == 2].sum() Writing csv
Version Information of All Libraries Column with Smallest Sum Sum of values in df for every Wednesday (Wed = 2) with open(filename, “w”) as fp:
pd.show_versions() print(“Revenue Report\n”, “-“ * 20, file = fp)
print(df.sum().idxmin())
Date Offsets for index, value in enumerate(revenues):
Create DataFrame B: Business day; W: Week; WOM: Week of Month; print(“Revenue for {}: ${:15,.2f} Cumm. Total:
Top Largest Ranking
data = {“x”:[1, 2], “y”:[4, 5, ]} dictionary LWOM: Last Week of Month; M: Month; ${}”.format(index + 1, value,
print(df.nlargest(3).sum()) Top 3
labels = [“a”, “b”] column labels MS: Month Beginning; Q: Quarter; A: Year; D: Day; sum(revenues[:index + 1])), file = fp)
df = pd.DataFrame(data, index = labels) H: Hour; T or min: Min; S: Second; print(“The company has made a total of
Index ${:,.2f}”.format(sum(revenues)), file = fp)
# index is the row label print(df.index.get_loc(“a”)) WOM-3THU: 3rd Thurs of Each Month
data = [[1, 2], [3, 4]] list - r1 = 1,2; r2 = 3,4 Find index of a Sort
Split
columns = list(“ab”) df.sort(reverse = True)
Resample df[“a”].str.split(“_”, expand = True)
df = pd.DataFrame(data, columns = columns)
df.resample(“M”).mean() split by _; expand into multiple columns
Summary of df
mean of values in each month
print(df.info()) Graph Properties
print(df.describe()) Interpolate s: Size basis; c: Color basis; color: Color;
df.interpolate() marker: “x”, “*”, “.” , “o”, “v”, “^”, title: Title of
Selecting Rows/Columns graph; label: Label; linewidth: Width of line;
fills in missing value
print(df.iloc[-1:, [1]]) index no. linestyle: “-“, “--“, “-.”, “:”; legend = True/False;
print(df.loc[[row_name(s)], [column_name(s)]]) kind = “bar”, “hist”, “box”, “density”, “area”,
Change Type
print(df[column_name].between(1, 2)) “scatter”, “pie”
df.astype(int)
Null Values Group
Capitalize/Lower/Upper Case
print(df[(df[“c”].isnull())]) df.groupby(“a”)[“b”].mean()
df[“a”].str.capitalize()
Select row where column c is null. mean of b grouped by column a
df[“a”].str.lower()
Changing Cell Values df[“a”].str.upper()
df.loc[“f”, “age”] = 1.5
Join
df = df.join(a) Append to df
Sum/Mean/Median/Max/Min of Columns
print(df[“f”].sum())
Scatterplot Set Title
df[“f”].mean() df[“f”].median()
df.plot.scatter(x, y) df.set_title(“a”)
df[“f”].max() df.groupby(“a”)[“f”].min()
Barplot Set Labels
Adding New Row/Column
df.plot.bar(x, y) df.set_xlabel(“x”)
df.loc[“a”] = [1, 2, 3] New Row
df[“a”] = [1, 2, 3] New Column df.set_ylabel(“y”)
Lineplot Set Graph Limit
Drop Row/Column df.plot.line(x, Candlestick df.set_xlim((-1, 12))
df = df.drop(“a”, axis = 0) or axis = 1 (col) y) plot_candlestick(df) df.set_ylim((-1, 12))
Sort Values Map
print(df.sort_values(by = [“a”, “b”], ascending = [True, False])) df[“a”] = df[“a”].map({1:”yes”, 4:”No”, 7:”No”})