DataFrame
import pandas as pd # This is the standard way of importing the Pandas library
import numpy as np
wh = pd.read_csv("kumpula-weather-2017.csv")
wh.head() # The head method returns the first 5 rows
wh["Snow depth (cm)"].head()
wh["Air temperature (degC)"].mean() # Mean temperature
wh.drop("Time zone", axis=1).head() # Return a copy with one column removed, the original DataFrame stays intact
wh.head() # Original DataFrame is unchanged
wh["Rainy"] = wh["Precipitation amount (mm)"] > 5
wh.head()
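Since True sums as 1, the new boolean column gives, for instance, a quick count of the rainy days:
wh["Rainy"].sum() # Number of days with more than 5 mm of precipitation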
Series
A Series is the one-dimensional version of a DataFrame:
s=pd.Series([1, 4, 5, 2, 5, 2])
s
s1=pd.Series([1, 4, 5, 2, 5, 2], index=list("abcdef"))
s1
s1.index
We can also attach a name to this series:
s.name = "Grades"
s
The common attributes of the series are the name, dtype, and size:
print(f"Name: {s.name}, dtype: {s.dtype}, size: {s.size}")
s[1] # Indexing
s1["b"]
s2=s[[0,5]] # Fancy indexing
print(s2)
t=s[-2:] # Slicing
t
t[4] # Works because 4 is an explicit index label of t; t[0] would give an error
The DataFrame is essentially a two-dimensional object, and it can be created in three different ways: from a two-dimensional NumPy array, from columns given as Series, or from rows.
df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df
df.index # These are the "row names"
df.columns # These are the "column names"
s1 = pd.Series([1,2,3])
s1
s2 = pd.Series([4,5,6], name="b")
s2
pd.DataFrame(s1, columns=["a"])
Multiple columns can be given as a dictionary, whose keys become the column names:
pd.DataFrame({"a": s1, "b": s2})
In the third way the DataFrame is created row-wise: each row is given as a dict, list, Series, or NumPy array.
df=pd.DataFrame([{"Wage" : 1000, "Name" : "Jack", "Age" : 21}, {"Wage" : 1500, "Name" : "John", "Age" : 29}])
df
df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df
df[0] # Raises a KeyError: plain indexing refers to columns, and there is no column named 0
Rows and individual elements are accessed with two attributes:
loc: uses the explicit indices
iloc: uses the implicit integer indices
df.loc[1, "Wage"]
df.iloc[-1,-1] # Right lower corner of the DataFrame
df.loc[1, ["Name", "Wage"]]
wh.head()
wh2 = wh.drop(["Year", "m", "d", "Time", "Time zone"], axis=1) # taking averages over these is not very interesting
wh2.mean()
The describe method of the DataFrame object gives various summary statistics for each (numeric) column.
wh.describe()
wh["Snow depth (cm)"].unique()
The nan value tells us that the measurement from that day is not available.
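Note that nan is a float: a missing value in an otherwise integer-valued column therefore promotes the whole column to float64, as a quick check shows:
pd.Series([1, 2, np.nan]) # dtype is float64, not int64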
For non-numeric types the special value None is used to denote a missing value, and the dtype is promoted to object.
pd.Series(["jack", "joe", None])
The missing values can be located with the isnull method:
wh[wh.isnull().any(axis=1)]
The notnull method works conversely to the isnull method.
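For instance, we can keep only the rows where the snow depth was actually measured:
wh[wh["Snow depth (cm)"].notnull()].head()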
The dropna method of a DataFrame drops columns or rows that contain missing values from the DataFrame, depending on the axis parameter.
wh.dropna().shape # Default axis is 0
wh.dropna(axis=1).shape # Drops the columns containing missing values
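If only some columns matter, the subset parameter restricts which columns are checked for missing values; for example:
wh.dropna(subset=["Snow depth (cm)"]).shape # Drops only rows where the snow depth is missing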
Type conversions can be performed with the pd.to_numeric function, the map method, or the astype method.
pd.Series(["1","2"]).map(int) # str -> int
pd.Series([1,2]).map(str) # int -> str
pd.to_numeric(pd.Series([1,1.0]), downcast="integer") # float -> int, downcast works since all values are integral
pd.to_numeric(pd.Series([1,"a"]), errors="coerce") # conversion error produces Nan
pd.Series([1,2]).astype(str) # works for a single series
df = pd.DataFrame({"a": [1,2,3], "b" : [4,5,6], "c" : [7,8,9]})
print(df.dtypes)
print(df)
df.astype(float) # Convert all columns
df2 = df.astype({"b" : float, "c" : str}) # different types for columns
print(df2.dtypes)
print(df2)
names = pd.Series(["donald", "theresa", "angela", "vladimir"])
names.str.capitalize()
# names.str.<TAB> # Press the tab key after the dot to list the available string methods
full_names = pd.Series(["Donald Trump", "Theresa May", "Angela Merkel", "Vladimir Putin"])
full_names.str.split() # one column
full_names.str.split(expand=True) # two columns
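The expanded columns can then be given descriptive names, for example:
first_last = full_names.str.split(expand=True)
first_last.columns = ["First", "Last"]
first_last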
def makedf(cols, ind):
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)
a=makedf("AB", [0,1])
a
b=makedf("AB", [2,3])
b
c=makedf("CD", [0,1])
c
d=makedf("BC", [2,3])
d
pd.concat([a,b]) # The default axis is 0
pd.concat([a,c], axis=1)
r=pd.concat([a,a])
r # This is not usually what we want!
## 1. Automatic renumbering of rows
pd.concat([a,a], ignore_index=True)
## 2. Hierarchical indexing
r2=pd.concat([a,a], keys=['first', 'second'])
r2
r2["A"]["first"][0]
pd.concat([a,d])
pd.concat([a,d], sort=False)
pd.concat([a,d], join="inner")
df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df
df2 = pd.DataFrame({"Name" : ["John", "Jack"], "Occupation": ["Plumber", "Carpenter"]})
df2
pd.merge(df, df2)
df3 = pd.concat([df2, pd.DataFrame({ "Name" : ["James"], "Occupation":["Painter"]})], ignore_index=True)
df3
pd.merge(df, df3) # By default an inner join is computed
pd.merge(df, df3, how="outer") # Outer join
wh.head()
wh3 = wh.rename(columns={"m": "Month", "d": "Day", "Precipitation amount (mm)" : "Precipitation",
"Snow depth (cm)" : "Snow", "Air temperature (degC)" : "Temperature"})
wh3.head()
groups = wh3.groupby("Month")
len(groups)
for key, group in groups:
    print(key, len(group))
groups.get_group(2).head() # Group with index two is February
groups["Temperature"].mean()
groups["Precipitation"].sum()
wh4 = wh3.copy()
wh4.loc[wh4.Precipitation == -1, "Precipitation"] = 0
wh4.loc[wh4.Snow == -1, "Snow"] = 0
wh4.head()
wh4.groupby("Month")["Precipitation"].sum()
The apply method is very generic and only requires that for each group’s DataFrame the given function returns a DataFrame, Series, or a scalar.
wh4.groupby("Month").apply(lambda df : df.sort_values("Temperature"))