Created
June 30, 2017 03:44
-
-
Save DadgadCafe/27165ed2b3e18787dac8dd0c4daf960b to your computer and use it in GitHub Desktop.
Notes on NumPy and pandas.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
# Array creation and basic indexing.
arr = np.array([[1, 2, 3],
                [4, 5, 6]], dtype=np.int32)  # nested list -> 2-D array
arr.ndim   # 2 (number of dimensions)
arr.shape  # (2, 3)
arr.size   # 6 (total number of elements)
np.zeros((3, 4), dtype=int)      # 3x4 array of zeros
np.ones((3, 4), dtype=np.int16)  # 3x4 array of ones
np.empty((3, 4), dtype=int)      # 3x4, uninitialized: contents are arbitrary
a = np.arange(10, 20, 2)  # array([10, 12, 14, 16, 18])
a[0]  # 10
a = np.arange(10, 50, 2).reshape((4, 5))  # 10..48 in steps of 2, as 4x5
a[0][0]    # 10
a[0, 0]    # same element, single indexing operation
a[0, 1:3]  # array([12, 14])
# linspace: evenly spaced samples over a closed interval
np.linspace(1, 10, 20).reshape((5, 4))  # 20 points from 1 to 10, as 5x4
# Element-wise arithmetic, matrix product and reductions.
a1 = np.arange(5)       # array([0, 1, 2, 3, 4])
a2 = np.arange(10, 15)  # array([10, 11, 12, 13, 14])
a2 - a1   # array([10, 10, 10, 10, 10])
a2 + a1   # array([10, 12, 14, 16, 18])
a2 * a1   # array([ 0, 11, 24, 39, 56])  (element-wise, not a matrix product)
a1 / a2   # array([0., 0.09090909, 0.16666667, 0.23076923, 0.28571429])
a1 ** 2   # array([ 0,  1,  4,  9, 16])
a1 < 0    # array([False, False, False, False, False])
np.sin(a1)

a = np.array([[1, 1], [0, 1]])
b = np.arange(4).reshape((2, 2))  # [[0, 1], [2, 3]]
np.dot(a, b)  # matrix product: array([[2, 4], [2, 3]])
a.dot(b)      # same
np.dot([1, 2, 3], [4, 5, 6])  # inner product: 1*4 + 2*5 + 3*6 = 32

a = np.random.random((2, 4))
np.sum(a)
np.max(a)
np.min(a)
np.sum(a, axis=0)  # collapse rows    -> one sum per column (4 values)
np.sum(a, axis=1)  # collapse columns -> one sum per row (2 values)
# Aggregations, sorting, transposition and clipping.
A = np.arange(2, 14).reshape((3, 4))
# array([[ 2,  3,  4,  5],
#        [ 6,  7,  8,  9],
#        [10, 11, 12, 13]])
np.argmax(A)  # flat index of the maximum (13): 11
np.argmin(A)  # flat index of the minimum (2): 0
np.mean(A)    # same as A.mean(): 7.5
np.median(A)  # 7.5 (function only: ndarray has no .median() method)
np.median(A, axis=0)  # per column: array([6., 7., 8., 9.])
np.average(A)  # 7.5
np.cumsum(A)   # running total over the flattened array:
# array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
np.diff(A)     # differences between neighbours along the last axis:
# array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
np.nonzero(A)  # (row indices, column indices) of the non-zero entries:
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
#  array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

A = np.arange(14, 2, -1).reshape((3, 4))
# array([[14, 13, 12, 11],
#        [10,  9,  8,  7],
#        [ 6,  5,  4,  3]])
np.sort(A)  # sorts each row independently (last axis)
# array([[11, 12, 13, 14],
#        [ 7,  8,  9, 10],
#        [ 3,  4,  5,  6]])
np.transpose(A)
A.T  # same
# array([[14, 10,  6],
#        [13,  9,  5],
#        [12,  8,  4],
#        [11,  7,  3]])
(A.T).dot(A)
# array([[332, 302, 272, 242],
#        [302, 275, 248, 221],
#        [272, 248, 224, 200],
#        [242, 221, 200, 179]])
np.clip(A, 5, 9)  # values > 9 become 9; values < 5 become 5
# array([[9, 9, 9, 9],
#        [9, 9, 8, 7],
#        [6, 5, 5, 5]])
# Iteration, stacking, splitting and copying.
a = np.arange(4).reshape((2, 2))
a[0, 0]  # 0
for row in a:  # iterating a 2-D array yields its rows
    print(row)
for column in a.T:  # iterate the transpose to walk the columns
    print(column)
a.flatten()  # array([0, 1, 2, 3])
for item in a.flat:  # element-by-element iterator
    print(item)

a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
np.vstack((a, b))  # array([[1, 1, 1], [2, 2, 2]])
np.hstack((a, b))  # array([1, 1, 1, 2, 2, 2])
a[:, np.newaxis]   # array([[1], [1], [1]])
# concatenate: for 2-D inputs axis=0 joins vertically, axis=1 horizontally;
# these inputs are 1-D, so axis 0 is the only axis available.
np.concatenate((a, b, b), axis=0)  # array([1, 1, 1, 2, 2, 2, 2, 2, 2])

a = np.arange(12).reshape((3, 4))
np.hsplit(a, 2)
np.split(a, 2, axis=1)  # same: split into 2 equal column groups
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2, 3], [6, 7], [10, 11]])]
np.vsplit(a, 3)
np.split(a, 3, axis=0)  # same: split into 3 equal row groups
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]
np.array_split(a, 3, axis=1)  # uneven split is allowed (np.split would raise)
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2], [6], [10]]),
#  array([[3], [7], [11]])]

a = np.arange(4)
b = a         # plain assignment binds a second name to the same object
a is b        # True
c = a.copy()  # independent copy of the data
c is a        # False
# pandas: Series / DataFrame construction and basic inspection.
import pandas as pd

s = pd.Series([1, 3, np.nan, 5])
# 0    1.0
# 1    3.0
# 2    NaN
# 3    5.0
# dtype: float64
dates = pd.date_range('20170101', periods=6)
# DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
#                '2017-01-05', '2017-01-06'],
#               dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6, 4), index=dates,
                  columns=['a', 'b', 'c', 'd'])
# Example output (values are random):
#                    a         b         c         d
# 2017-01-01 -0.669733  0.091818  0.581845 -0.290370
# 2017-01-02  0.203958 -0.840011 -1.234419  1.567374
# 2017-01-03  0.761231 -0.712473  0.954426  2.002349
# 2017-01-04  0.477278  0.860596  0.867349  0.438903
# 2017-01-05 -1.431947  0.684325 -0.762821  0.815071
# 2017-01-06 -0.095380 -0.515609  0.184032 -0.482174
pd.DataFrame(np.arange(12).reshape((3, 4)))  # default integer row/column labels
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.arange(4),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  0   test  foo
# 1  1.0 2013-01-02  1.0  1  train  foo
# 2  1.0 2013-01-02  1.0  2   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
df2.dtypes   # per-column dtypes (a DataFrame has no `.type` attribute)
df2.index    # row labels
df2.columns  # column labels
df2.shape    # (4, 6); there is no `.rows` attribute -- rows live in the index
df2.values   # underlying data as a NumPy array
df2.describe()  # summary statistics
df2.T           # transpose
df2.sort_index(axis=1, ascending=False)  # sort columns by name, descending
df2.sort_values(by='B')                  # sort rows by the values in column B
# Data selection.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
#              A   B   C   D
# 2013-01-01   0   1   2   3
# 2013-01-02   4   5   6   7
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15
# 2013-01-05  16  17  18  19
# 2013-01-06  20  21  22  23

# One column: both spellings return the same Series.
df['A']
df.A
# 2013-01-01     0
# 2013-01-02     4
# 2013-01-03     8
# 2013-01-04    12
# 2013-01-05    16
# 2013-01-06    20
# Freq: D, Name: A, dtype: int64

# Rows by positional slice (end excluded).
df[0:3]
#             A  B   C   D
# 2013-01-01  0  1   2   3
# 2013-01-02  4  5   6   7
# 2013-01-03  8  9  10  11

# Rows by label slice (both endpoints included).
df.loc['20130102':'20130104']
#              A   B   C   D
# 2013-01-02   4   5   6   7
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15

# .loc: selection by label.
df.loc['20130102']
# A    4
# B    5
# C    6
# D    7
# Name: 2013-01-02 00:00:00, dtype: int64
df.loc[:, ['A', 'B']]
#              A   B
# 2013-01-01   0   1
# 2013-01-02   4   5
# 2013-01-03   8   9
# 2013-01-04  12  13
# 2013-01-05  16  17
# 2013-01-06  20  21
df.loc['20130102', ['A', 'B']]
# A    4
# B    5
# Name: 2013-01-02 00:00:00, dtype: int64

# .iloc: selection by integer position.
df.iloc[3, 1]  # 13
df.iloc[3:5, 1:3]
#              B   C
# 2013-01-04  13  14
# 2013-01-05  17  18
df.iloc[[1, 3, 5], 1:3]
#              B   C
# 2013-01-02   5   6
# 2013-01-04  13  14
# 2013-01-06  21  22

# Mixed positional rows + labelled columns. The old `.ix` indexer no longer
# exists in pandas, so chain a positional and a label selection instead.
df.iloc[:3][['A', 'C']]
#             A   C
# 2013-01-01  0   2
# 2013-01-02  4   6
# 2013-01-03  8  10

# Boolean indexing.
df[df.A > 8]
#              A   B   C   D
# 2013-01-04  12  13  14  15
# 2013-01-05  16  17  18  19
# 2013-01-06  20  21  22  23
# Setting values.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
df.iloc[2, 2] = 111            # by position
df.loc['20130101', 'B'] = 222  # by label
#              A    B    C   D
# 2013-01-01   0  222    2   3
# 2013-01-02   4    5    6   7
# 2013-01-03   8    9  111  11
# 2013-01-04  12   13   14  15
# 2013-01-05  16   17   18  19
# 2013-01-06  20   21   22  23

# Conditional assignment. Use a single .loc call: the chained form
# `df.B[df.A > 4] = 0` may write to a temporary copy instead of `df`.
df.loc[df.A > 4, 'B'] = 0
#              A    B    C   D
# 2013-01-01   0  222    2   3
# 2013-01-02   4    5    6   7
# 2013-01-03   8    0  111  11
# 2013-01-04  12    0   14  15
# 2013-01-05  16    0   18  19
# 2013-01-06  20    0   22  23

df['F'] = np.nan  # new column filled with a scalar
#              A    B    C   D   F
# 2013-01-01   0  222    2   3 NaN
# 2013-01-02   4    5    6   7 NaN
# 2013-01-03   8    0  111  11 NaN
# 2013-01-04  12    0   14  15 NaN
# 2013-01-05  16    0   18  19 NaN
# 2013-01-06  20    0   22  23 NaN

# New column from a Series: values are aligned on the index labels.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6],
                    index=pd.date_range('20130101', periods=6))
#              A    B    C   D   F  E
# 2013-01-01   0  222    2   3 NaN  1
# 2013-01-02   4    5    6   7 NaN  2
# 2013-01-03   8    0  111  11 NaN  3
# 2013-01-04  12    0   14  15 NaN  4
# 2013-01-05  16    0   18  19 NaN  5
# 2013-01-06  20    0   22  23 NaN  6
# Handling missing data.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
# NaN cannot be stored in an integer column; convert the two target columns
# to float explicitly instead of relying on implicit upcasting on assignment.
df = df.astype({'B': float, 'C': float})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
#              A     B     C   D
# 2013-01-01   0   NaN   2.0   3
# 2013-01-02   4   5.0   NaN   7
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

df.dropna(
    axis=0,     # 0: drop rows; 1: drop columns
    how='any',  # 'any': drop if any NaN is present; 'all': only if all are NaN
)
#              A     B     C   D
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

df.fillna(value=0)
#              A     B     C   D
# 2013-01-01   0   0.0   2.0   3
# 2013-01-02   4   5.0   0.0   7
# 2013-01-03   8   9.0  10.0  11
# 2013-01-04  12  13.0  14.0  15
# 2013-01-05  16  17.0  18.0  19
# 2013-01-06  20  21.0  22.0  23

df.isnull()
#                 A      B      C      D
# 2013-01-01  False   True  False  False
# 2013-01-02  False  False   True  False
# 2013-01-03  False  False  False  False
# 2013-01-04  False  False  False  False
# 2013-01-05  False  False  False  False
# 2013-01-06  False  False  False  False

df.isnull().values.any()  # True if at least one NaN exists anywhere
# Read a CSV file from the working directory into a DataFrame.
data = pd.read_csv('students.csv')
# Serialize the DataFrame to a pickle file.
data.to_pickle('student.pickle')
# Concatenation. `axis` defaults to 0 (stack rows).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
pd.concat([df1, df2, df3], axis=0)  # original row labels are kept
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0
pd.concat([df1, df2, df3], axis=0, ignore_index=True)  # renumber rows 0..n-1
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# Frames with partially overlapping columns.
df1 = pd.DataFrame(np.ones((3, 4)) * 0,
                   columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
pd.concat([df1, df2], axis=0, join='outer')  # union of columns, NaN-filled
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0
pd.concat([df1, df2], axis=0, join='inner')  # only the shared columns
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

# Horizontal concatenation aligned on df1's index.
df1 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[2, 3, 4])
# `join_axes=[df1.index]` is no longer accepted by pd.concat; reindexing the
# result on df1's index selects the same rows.
pd.concat([df1, df2], axis=1).reindex(df1.index)
#      A    B    C    D    A    B    C    D
# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
# 2  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0

# Appending rows. DataFrame.append no longer exists; pd.concat replaces it.
df1 = pd.DataFrame(np.ones((3, 4)) * 0,
                   columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])
pd.concat([df1, df2], ignore_index=True)
#      A    B    C    D
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0

# Appending a single row held in a Series: its index must match the frame's
# column labels (case-sensitive), otherwise new NaN-filled columns appear.
s1 = pd.Series([1, 2, 3, 4],
               index=['A', 'B', 'C', 'D'])
pd.concat([df1, s1.to_frame().T], ignore_index=True)
#      A    B    C    D
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
# Merging (database-style joins).
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# left:
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
# right:
#   key   C   D
# 0  K1  C0  D0
# 1  K2  C1  D1
# 2  K3  C2  D2
# 3  K4  C3  D3
pd.merge(left, right, on='key')  # default how='inner': keys present in both
#   key   A   B   C   D
# 0  K1  A1  B1  C0  D0
# 1  K2  A2  B2  C1  D1
# 2  K3  A3  B3  C2  D2

# Merging on two key columns; `how` selects which key combinations survive.
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left, right, on=['key1', 'key2'], how='inner')  # pairs in both
pd.merge(left, right, on=['key1', 'key2'], how='outer')  # pairs in either
pd.merge(left, right, on=['key1', 'key2'], how='left')   # all pairs of left
pd.merge(left, right, on=['key1', 'key2'], how='right')  # all pairs of right

# indicator: add a column telling where each row came from.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
pd.merge(df1, df2, on='col1', how='outer', indicator=True)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only
pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only

# Merge on the index instead of a column.
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
pd.merge(left, right, left_index=True, right_index=True, how='outer')
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3
pd.merge(left, right, left_index=True, right_index=True, how='inner')
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2

boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# Use suffixes to disambiguate overlapping (non-key) column names.
pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5
# Plotting a random walk.
import matplotlib.pyplot as plt

data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new Series; rebind it so the cumulative sum is what
# gets plotted rather than the raw noise.
data = data.cumsum()
data.plot()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment