import numpy as np
import pandas as pd
from pandas import Series, DataFrame
fill_value 参数:用指定的值填充缺失的位置
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),
columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),
columns=list("abcde"))
df1
a | b | c | d | |
---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 | 7.0 |
2 | 8.0 | 9.0 | 10.0 | 11.0 |
df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 | 4.0 |
1 | 5.0 | 6.0 | 7.0 | 8.0 | 9.0 |
2 | 10.0 | 11.0 | 12.0 | 13.0 | 14.0 |
3 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 |
df1+df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | NaN |
1 | 9.0 | 11.0 | 13.0 | 15.0 | NaN |
2 | 18.0 | 20.0 | 22.0 | 24.0 | NaN |
3 | NaN | NaN | NaN | NaN | NaN |
# 使用add方法,填入fill_value参数
df1.add(df2, fill_value=0)
# 以r开头的方法会翻转参数:df1.rdiv(1) 等价于 1 / df1
1 / df1
a | b | c | d | |
---|---|---|---|---|
0 | inf | 1.000000 | 0.500000 | 0.333333 |
1 | 0.250 | 0.200000 | 0.166667 | 0.142857 |
2 | 0.125 | 0.111111 | 0.100000 | 0.090909 |
df1.rdiv(1)
a | b | c | d | |
---|---|---|---|---|
0 | inf | 1.000000 | 0.500000 | 0.333333 |
1 | 0.250 | 0.200000 | 0.166667 | 0.142857 |
2 | 0.125 | 0.111111 | 0.100000 | 0.090909 |
df1.reindex(columns=df2.columns, fill_value=0)
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 | 0 |
1 | 4.0 | 5.0 | 6.0 | 7.0 | 0 |
2 | 8.0 | 9.0 | 10.0 | 11.0 | 0 |
Series和DataFrame之间进行运算时,默认将Series的索引与DataFrame的列对齐,然后沿着行向下广播(即作用于每一行)。
arr = np.arange(12.).reshape((3, 4))
arr
array([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]])
arr[0]
array([0., 1., 2., 3.])
# 每行都要执行减操作
arr - arr[0]
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
# DataFrame和Series之间的运算
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | 0.0 | 1.0 | 2.0 |
Ohio | 3.0 | 4.0 | 5.0 |
Texas | 6.0 | 7.0 | 8.0 |
Oregon | 9.0 | 10.0 | 11.0 |
series = frame.iloc[0]
series
b 0.0
d 1.0
e 2.0
Name: Utah, dtype: float64
frame - series
b | d | e | |
---|---|---|---|
Utah | 0.0 | 0.0 | 0.0 |
Ohio | 3.0 | 3.0 | 3.0 |
Texas | 6.0 | 6.0 | 6.0 |
Oregon | 9.0 | 9.0 | 9.0 |
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2
b 0
e 1
f 2
dtype: int64
# 索引相同的列直接相加;只在DataFrame的列或Series的索引中一方存在的标签,结果填充为NaN
frame + series2
b | d | e | f | |
---|---|---|---|---|
Utah | 0.0 | NaN | 3.0 | NaN |
Ohio | 3.0 | NaN | 6.0 | NaN |
Texas | 6.0 | NaN | 9.0 | NaN |
Oregon | 9.0 | NaN | 12.0 | NaN |
# 在列上进行广播机制
series3 = frame["d"]
series3
Utah 1.0
Ohio 4.0
Texas 7.0
Oregon 10.0
Name: d, dtype: float64
# axis="index"等价于axis=0
frame.sub(series3, axis="index")
b | d | e | |
---|---|---|---|
Utah | -1.0 | 0.0 | 1.0 |
Ohio | -1.0 | 0.0 | 1.0 |
Texas | -1.0 | 0.0 | 1.0 |
Oregon | -1.0 | 0.0 | 1.0 |
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b | d | e | |
---|---|---|---|
Utah | 0.644845 | 0.878886 | 0.505433 |
Ohio | -0.230029 | -0.268866 | -0.107758 |
Texas | 0.429735 | 0.016485 | -0.940844 |
Oregon | 0.318921 | -0.910421 | 0.945280 |
# ufuncs函数
np.abs(frame)
b | d | e | |
---|---|---|---|
Utah | 0.644845 | 0.878886 | 0.505433 |
Ohio | 0.230029 | 0.268866 | 0.107758 |
Texas | 0.429735 | 0.016485 | 0.940844 |
Oregon | 0.318921 | 0.910421 | 0.945280 |
np.exp(frame)
b | d | e | |
---|---|---|---|
Utah | 1.905692 | 2.408215 | 1.657703 |
Ohio | 0.794511 | 0.764246 | 0.897845 |
Texas | 1.536850 | 1.016622 | 0.390298 |
Oregon | 1.375642 | 0.402355 | 2.573535 |
# apply():将自己实现的函数直接传给apply方法,默认(axis=0)作用于每列,返回Series;传入axis='columns'则作用于每行
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods, so a pandas Series
    or a NumPy array both work.
    """
    return x.max() - x.min()
frame.apply(func)
b 0.874874
d 1.789307
e 1.886124
dtype: float64
def f(x):
    """Return the minimum and maximum of ``x`` as a two-element Series.

    The result is labelled ``'min'`` and ``'max'``, in that order.
    """
    # The body must be indented under the def; at column 0 the return
    # statement is a syntax error.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
b | d | e | |
---|---|---|---|
min | -0.230029 | -0.910421 | -0.940844 |
max | 0.644845 | 0.878886 | 0.945280 |
# applymap()方法:DataFrame的元素级方法,对应Series的map方法(pandas 2.1起applymap已弃用,推荐使用DataFrame.map)
def format_x(x):
    """Format ``x`` as a string with two digits after the decimal point."""
    return '{:.2f}'.format(x)
frame.applymap(format_x)
b | d | e | |
---|---|---|---|
Utah | 0.64 | 0.88 | 0.51 |
Ohio | -0.23 | -0.27 | -0.11 |
Texas | 0.43 | 0.02 | -0.94 |
Oregon | 0.32 | -0.91 | 0.95 |
# S数据的map方法
frame['e'].map(format_x)
Utah 0.51
Ohio -0.11
Texas -0.94
Oregon 0.95
Name: e, dtype: object
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
a 1
b 2
c 3
d 0
dtype: int64
# DF数据的排序
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
# 默认是行排序,升序(one, three)
frame.sort_index()
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
# 指定列排序和降序(dcba)
frame.sort_index(axis=1, ascending=False)
d | c | b | a | |
---|---|---|---|---|
three | 0 | 3 | 2 | 1 |
one | 4 | 7 | 6 | 5 |
# sort_values()
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')
b | a | |
---|---|---|
2 | -3 | 0 |
3 | 2 | 1 |
0 | 4 | 0 |
1 | 7 | 1 |
frame.sort_values(by=['a', 'b']) # 同时指定多个列属性
b | a | |
---|---|---|
2 | -3 | 0 |
0 | 4 | 0 |
3 | 2 | 1 |
1 | 7 | 1 |
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
# 根据出现的顺序返回排名
obj.rank(method='first')
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
# 降序排列:[7,7,4,4,2,0,-5];相同元素占据一组连续的名次,由method决定取这组名次中的哪一个
obj.rank(ascending=False, method='min') # 第一个7排名为1,取min值1
0 1.0
1 7.0
2 1.0
3 3.0
4 5.0
5 6.0
6 3.0
dtype: float64
obj.rank(ascending=False, method='max') # 第一个7排名为1,取max值2
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
obj.rank(ascending=False, method='first')
0 1.0
1 7.0
2 2.0
3 3.0
4 5.0
5 6.0
6 4.0
dtype: float64
# dense方法:类似min方法,但相邻组之间的排名只增加1(不跳号),重复元素排名相同
obj.rank(ascending=False, method='dense')
0 1.0
1 5.0
2 1.0
3 2.0
4 3.0
5 4.0
6 2.0
dtype: float64
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
df.sum(axis='columns')
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
# skipna=False:只要该行含有NA值,结果即为NaN(默认skipna=True才会自动排除NA值)
df.mean(axis='columns', skipna=False)
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
df.mean(axis='columns')
a 1.400
b 1.300
c NaN
d -0.275
dtype: float64
df.idxmax()
one b
two d
dtype: object
df
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
df.idxmin()
one d
two b
dtype: object
df.cumsum() # 累积求和
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 8.50 | -4.5 |
c | NaN | NaN |
d | 9.25 | -5.8 |
df.describe()
one | two | |
---|---|---|
count | 3.000000 | 2.000000 |
mean | 3.083333 | -2.900000 |
std | 3.493685 | 2.262742 |
min | 0.750000 | -4.500000 |
25% | 1.075000 | -3.700000 |
50% | 1.400000 | -2.900000 |
75% | 4.250000 | -2.100000 |
max | 7.100000 | -1.300000 |
Stay Foolish Stay Hungry