pandas基础(2)_多重索引

网友投稿 671 2022-11-25

pandas基础(2)_多重索引

pandas基础(2)_多重索引

1:多重索引的构造

>>> #下面显示构造pd.MultiIndex

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'])

>>> import pandas as pd

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'],index=pd.MultiIndex.from_arrays([['张三','张三','侯少','侯少','a','a'],['M','E','M','E','M','E']]))

>>> df1#因为Python自身的原因,对汉字的识别不是太好,所以汉字被?代替了

java  html5  python

-? M     2     13      76

E   141     67      84

M   116     83       8

E    70    118     125

a    M    74      0      76

E   111     31       8

>>> #使用元组tuple创建

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_tuples([('a','1'),('a','11'),('b','1'),('b','11'),('c','1'),('c','11')]))

>>> df2

java  html  python

a 1     32   144      99

11   104   101      16

b 1     93    98      41

11    59    30      45

c 1     91    17     149

11     9    28      59

>>> #使用product

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_product([['zhangsan ','lisi','wangwu'],['mid','end']]))

>>> df2

java  html  python

zhangsan  mid    50   128      54

end     3     4      91

lisi      mid     4    93     110

end   116   123     122

wangwu    mid    88    25      54

end    48   146      57

>>> #对dataFrame同样可以设置成多重索引

df2=DataFrame(np.random.randint(0,150,size=(3,6)),columns=pd.MultiIndex.from_product([['java','html','python'],['mid','end']]),index=['张三','李四','王五'])

>>> df2

java     html     python

mid end  mid end    mid  end

-?   33  38  112  70    113  110

-?   29  46  132  91    117  128

-?   73  56  118  82    132   39

>>>

>>> df2['java','mid']#查询某一列

-?    33

-?    29

-?    73

Name: (java, mid), dtype: int32

>>> s['zhangsan':'lisi']#其实就是一个Series

Series([], dtype: int64)

>>> s.iloc[0:3]

a  0    1

1    2

b  0    3

dtype: int64

>>> #切片

>>> df2['张三':'王五']

java     html     python

mid end  mid end    mid  end

-?   33  38  112  70    113  110

-?   29  46  132  91    117  128

-?   73  56  118  82    132   39

>>>df2.iloc[0:4]#推荐使用

Df2[‘张三’,‘期中’]和df2.loc[‘张三’].loc[‘期中’]

#如何一级索引有多个,对二级索引会遇到问题,也就是说,无法直接对二级进行索引

必须把二级索引变成一级索引才可以进行索引

>>> df2.stack()

html  java  python

-? end    70    38     110

mid   112    33     113

end    91    46     128

mid   132    29     117

end    82    56      39

mid   118    73     132

>>> #stack =堆----》行

end  mid

-? html     70  112

java     38   33

python  110  113

html     91  132

java     46   29

python  128  117

html     82  118

java     56   73

python   39  132

>>> #默认为-1

2:多重索引的计算

>>> df2

java     html     python

mid end  mid end    mid  end

-?   33  38  112  70    113  110

-?   29  46  132  91    117  128

-?   73  56  118  82    132   39

>>> df1.sum()

java      514

html5     312

python    377

dtype: int64

>>> df1.sum(axis=0)

java      514

html5     312

python    377

dtype: int64

>>> df1.sum(axis=1)#对列

-?  M     91

E    292

M    207

E    313

a     M    150

E    150

dtype: int64

>>> df1.sum(axis=1)#对列求和,得到每行的和

-?  M     91

E    292

M    207

E    313

a     M    150

E    150

dtype: int64

>>> df1.std

-? M     2     13      76

E   141     67      84

M   116     83       8

E    70    118     125

a    M    74      0      76

E   111     31       8>

>>> #求方差

>>> df1.std(axis=1)

-?  M    39.929104

E    38.759945

M    55.344376

E    29.938827

a     M    43.312816

E    54.064776

dtype: float64

>>> df1.max()

java      141

html5     118

python    125

dtype: int32

3多重索引的拼接

>>> nd = np.random.randint(0,10,size=(3,3))

>>> nd

array([[9, 9, 4],

[7, 2, 4],

[1, 6, 1]])

>>> np.concatenate ((nd,nd),axis=0)#在列方向就行拼接

array([[9, 9, 4],

[7, 2, 4],

[1, 6, 1],

[9, 9, 4],

[7, 2, 4],

[1, 6, 1]])

>>> np.concatenate ([nd,nd],axis=1)#在行方向进行拼接

array([[9, 9, 4, 9, 9, 4],

[7, 2, 4, 7, 2, 4],

[1, 6, 1, 1, 6, 1]])

>>> def make_df(cols,inds):

data = {c:[c+str(i) for i in cols]for c in cols}

return DataFrame(data,index=inds,columns=cols)

>>> make_df(['A','B'],[1,2])

A   B

1  AA  BA

2  AB  BB

>>> df1=make_df(list('AB'),[0,1])

>>> df2=make_df(list('AB'),[2,3])

>>> pd.concat ([df1,df2])#默认在列方向进行拼接

A   B

0  AA  BA

1  AB  BB

2  AA  BA

3  AB  BB

>>> #优先增加行数

>>> pd.concat ((df1,df2),axis=1)

A    B    A    B

0   AA   BA  NaN  NaN

1   AB   BB  NaN  NaN

2  NaN  NaN   AA   BA

3  NaN  NaN   AB   BB

>>> #注意index在级联时可以重复

3)

>>> #列名可以相同但是不建议

>>> df3= make_df(list('AB'),[0,1])

>>> df4=make_df(list('VB'),[1,2])

>>> pd.concat((df3,df4))#只能传入一个参数

A   B    V

0   AA  BA  NaN

1   AB  BB  NaN

1  NaN  BV   VV

2  NaN  BB   VB

>>> #3种连接方式

>>> #1:外连接:补NaN(默认模式)

>>> df1= make_df(list('AB'),[1,3])

>>> df2= make_df(list('AB'),[2,4])

>>> df2= make_df(list('BC'),[2,4])

>>> pd.concat ([df1,df2],join='inner')#连接都有的部分

B

1  BA

3  BB

2  BB

4  BC

>>> pd.concat ([df1,df2],join='outer')

A   B    C

1   AA  BA  NaN

3   AB  BB  NaN

2  NaN  BB   CB

4  NaN  BC   CC

>>> #内连接只连接匹配项

>>> #3:连接指定轴 join_axes所以CDF的F便不显示了

>>> df3= make_df(list('ACD'),[0,1,2])

>>> df4= make_df(list('CDF'),[3,4,5])

>>> pd.concat([df3,df4],join_axes=[df3.columns])

A   C   D

0   AA  CA  DA

1   AC  CC  DC

2   AD  CD  DD

3  NaN  CC  DC

4  NaN  CD  DD

5  NaN  CF  DF

>>> #join_axes 某一个DataFrame列索引为新的列索引值

>>> #3使用append()函数添加

>>> #concat方法属于pandas

>>> #append()在后面添加

>>> #concat([df1,df2])

>>> #df1.append(df2)

>>> #merge与concat的区别是,merge需要依据某一共同的行或列来进行合并

>>> #使用pd.merge()合并时,会自动根据两者相同column名称的那一属性,作为key来进行合并,注意每一列的顺序不要求一致

>>> #一对一合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df1

employee       group

0       po        sail

1     sara    counting

2    danis  marcketing

>>> df2

employee  work_time

0       po          2

1     sara          3

2    danis          1

>>> pd.merge (df1,df2)

employee       group  work_time

0       po        sail          2

1     sara    counting          3

2    danis  marcketing          1

>>> pd.concat([df1,df2])

employee       group  work_time

0       po        sail        NaN

1     sara    counting        NaN

2    danis  marcketing        NaN

0       po         NaN        2.0

1     sara         NaN        3.0

2    danis         NaN        1.0

>>> df3 = DataFrame({'employee':['po','sara','liulei'],'work_time':[2,3,1]})

>>> pd.merge(df1,df3)

employee     group  work_time

0       po      sail          2

1     sara  counting          3

>>> #merge只合并相同属性里面都有的项

>>> #下面是merge的多对一的合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df2 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> pd.merge(df1,df2)

employee  work_time       group

0       po          2        sail

1       po          2    counting

2    danis          1  marcketing

>>> #出现了两个po

>>> #下面是多对多的合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df1 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','po','danis'],'work_time':[2,3,1]})

>>> pd.merge(df1,df2)

employee       group  work_time

0       po        sail          2

1       po        sail          3

2       po    counting          2

3       po    counting          3

4    danis  marcketing          1

>>> #1*2*2的模式

>>> #使用merge多对多可以来处理重名等数据的情况

>>> df3= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> df3

WorkTime employee    group

0         1       po     sail

1         2   Summer  marking

2         3   Flower    serch

>>> df4

employee    group  salary

0       po     sail   12000

1   Summer  marking   20000

2   Flower    serch   10002

>>> pd.merge(df3,df4)

WorkTime employee    group  salary

0         1       po     sail   12000

1         2   Summer  marking   20000

2         3   Flower    serch   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> pd.merge(df3,df4)

WorkTime employee  group  salary

0         3   Flower  serch   10002

>>> pd.merge(df3,df4,on='employee')

WorkTime employee    group_x group_y  salary

0         1       po  marketing    sail   12000

1         3   Flower      serch   serch   10002

>>> #出现两行数据的原因是指定了employee相同就可以合并

>> pd.merge(df3,df4,on='group')

WorkTime employee_x    group employee_y  salary

0         2     Winter  marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> pd.merge(df3,df4,on='group',suffixes=['_A','_B'])

WorkTime employee_A    group employee_B  salary

0         2     Winter  marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employer':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

WorkTime employee    group employer  salary

0         2   Winter  marking   Summer   20000

1         3   Flower    serch   Flower   10002

>>> pd.merge(df3,df4,left_on='employee',right_on='employer')

WorkTime employee    group_x employer group_y  salary

0         1       po  marketing       po    sail   12000

1         3   Flower      serch   Flower   serch   10002

>>> #df3主键key为employee和df4主键为employer,两者不同但又想相互匹配时,可以指定前者的left_on为employee和后者的right_on为employer这时两者可以进行匹配

>>> #内合并与外合并

>>> #内合并只保留两者都有的数据

>>> df1=DataFrame({'age':[18,22,33],'height':[175,169,180]})

>>> df1=DataFrame({'age':[18,23,32],'height':[175,169,180]})

>>> df2=DataFrame({'age':[18,22,33],'weight':[175,169,180]})

>>> pd.merge(df1,df2)

age  height  weight

0   18     175     175

>>> pd.merge(df1,df2,how='outer')

age  height  weight

0   18   175.0   175.0

1   23   169.0     NaN

2   32   180.0     NaN

3   22     NaN   169.0

4   33     NaN   180.0

>>> #默认为内合并,通过how可以指定合并类型

>>>

>>> pd.merge(df1,df2,how='left')

age  height  weight

0   18     175   175.0

1   23     169     NaN

2   32     180     NaN

>>> pd.merge(df1,df2,how='right')

age  height  weight

0   18   175.0     175

1   22     NaN     169

2   33     NaN     180

>>> #left保留前者的数据,right保留后者数据

>>> #left保留前者df1的数据,right保留后者df2数据

>>> #下面是列冲突

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

WorkTime employee  group  salary

0         3   Flower  serch   10002

>>> pd.merge(df3,df4,on='employee',suffixes=['_李','_王'])

WorkTime employee   group_?? group_??  salary

0         1       po  marketing     sail   12000

1         3   Flower      serch    serch   10002

>>> #因为两者的employee和group相同,当指定employee为主键时,suffixes修改的就是group

4:总结:

多重索引也是pandas里非常重要的知识点,要牢牢掌握

作者:​​你的雷哥​​

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:Scipy的应用
下一篇:python集合介绍
相关文章

 发表评论

暂时没有评论,来抢沙发吧~