数据分析第一篇:Pandas基础
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline
数据导出导入
'''1、创建数据表'''
df = pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},
columns =['id','date','city','category','age','price'])
df
id | date | city | category | age | price | |
---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | Beijing | 100-A | 23 | 1200.0 |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN |
2 | 1003 | 2013-01-04 | guangzhou | 110-A | 54 | 2133.0 |
3 | 1004 | 2013-01-05 | Shenzhen | 110-C | 32 | 5433.0 |
4 | 1005 | 2013-01-06 | shanghai | 210-A | 34 | NaN |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 |
'''2、导出数据表'''
# df.to_excel('pandas_test.xls')
df.to_csv('pandas_test.csv')
'''3、导入数据表'''
df = pd.read_csv('pandas_test.csv',index_col=1)#可指定索引
df = pd.read_excel('pandas_test.xls',header=0)#可指定表头
数据查看
'''1、数据维度:数据由几行几列'''
df.shape
(6, 6)
'''2、数据信息:数据维度、列名称、数据格式和所占空间等'''
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 5
Data columns (total 6 columns):
id 6 non-null int64
date 6 non-null datetime64[ns]
city 6 non-null object
category 6 non-null object
age 6 non-null int64
price 4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 288.0+ bytes
'''3、数据格式'''
df.dtypes
id int64
date datetime64[ns]
city object
category object
age int64
price float64
dtype: object
'''4、查看空值,可以对整个数据表,也可以单独对某一列进行空值检查'''
df.isnull()
df['price'].isnull()
df['price'].isnull().value_counts()#可查看到底有几个空值出现
df.isnull.sum() #直接查看所有特征是否存在空值
0 False
1 True
2 False
3 False
4 True
5 False
Name: price, dtype: bool
'''5、去重复查看所有唯一值,只能对数据表中的某一列进行,类似去重复功能'''
a = df['age']
a.unique()#两种方法
a.drop_duplicates()
array([23, 44, 54, 32, 34], dtype=int64)
'''6、查看数据值,不包含表头'''
df.values
array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,
1200.0],
[1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],
[1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A', 54,
2133.0],
[1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,
5433.0],
[1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,
nan],
[1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,
4432.0]], dtype=object)
'''7、查看列名'''
df.columns
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
'''8、查看前几行数据,默认前10行'''
df.head(3)
id | date | city | category | age | price | |
---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | Beijing | 100-A | 23 | 1200.0 |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN |
2 | 1003 | 2013-01-04 | guangzhou | 110-A | 54 | 2133.0 |
'''9、查看后几行数据,默认后10行'''
df.tail(3)
id | date | city | category | age | price | |
---|---|---|---|---|---|---|
3 | 1004 | 2013-01-05 | Shenzhen | 110-C | 32 | 5433.0 |
4 | 1005 | 2013-01-06 | shanghai | 210-A | 34 | NaN |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 |
数据清洗
一、处理空值
'''1、删除空值'''
df.dropna(how='any')#参数任意位置为空
id | date | city | category | age | price | |
---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | Beijing | 100-A | 23 | 1200.0 |
2 | 1003 | 2013-01-04 | guangzhou | 110-A | 54 | 2133.0 |
3 | 1004 | 2013-01-05 | Shenzhen | 110-C | 32 | 5433.0 |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 |
#2、填充空值,用特定值如0或均值
df.fillna(df['price'].mean())#用均值填充
id | date | city | category | age | price | |
---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | Beijing | 100-A | 23 | 1200.0 |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | 3299.5 |
2 | 1003 | 2013-01-04 | guangzhou | 110-A | 54 | 2133.0 |
3 | 1004 | 2013-01-05 | Shenzhen | 110-C | 32 | 5433.0 |
4 | 1005 | 2013-01-06 | shanghai | 210-A | 34 | 3299.5 |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 |
二、处理空格
#1、清理空格,清除字符中空格
df['city'] = df['city'].map(str.strip)
0 BEIJING
1 SH
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
5 BEIJING
Name: city, dtype: object
三、大写小写转化
#1、大写转小写
df['city'].str.lower()#转小写
df['city'] =df['city'].str.upper()#转大写
0 BEIJING
1 SH
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
5 BEIJING
Name: city, dtype: object
四、改变数据格式
#1、修改数据格式
df['age'].astype('int')
0 23
1 44
2 54
3 32
4 34
5 32
Name: age, dtype: int32
五、更改列名称
#1、更改列名称
df.rename(columns={'age':'age_new'})
id | date | city | category | age_new | price | |
---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 |
六、删除重复值
#1、删除重复值,可选择保留最先或者最后的值
df['city'].drop_duplicates()#默认
df['city'].drop_duplicates(keep='last')
1 SH
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
5 BEIJING
Name: city, dtype: object
七、数据修改或替换
'''非中文时可直接使用,中文时需.str.replace方可成功'''
df['city'].replace('SH','BEIJING')
0 BEIJING
1 BEIJING
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
5 BEIJING
Name: city, dtype: object
数据预处理
df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})
一、数据合并
#1、数据合并
df_new = pd.merge(df,df1,how='inner')#共有列名匹配合并,不匹配的丢弃
df_new = pd.merge(df,df1,how='left')#左合并,保留df
df_new = pd.merge(df,df1,how='right')#右合并,保留df1
df_new = pd.merge(df,df1,how='outer')#全合并,全部保留df,df1
二、设置索引列
#1、将某列设置为索引,该列数据不能重复
df_new.set_index('id')
date | city | category | age | price | gender | m-point | pay | |
---|---|---|---|---|---|---|---|---|
id | ||||||||
1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y |
1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N |
1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y |
1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y |
1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N |
1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y |
三、排序
#1、按索引排序
df_new.sort_index()
id | date | city | category | age | price | gender | m-point | pay | |
---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y |
#1、按数值排序
df_new = df_new.sort_values(by=['age','id'])#可设置多级排序
id | date | city | category | age | price | gender | m-point | pay | |
---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y |
#★★ 应添加数据列的新增、删除、修改 #应添加数据列划分,loc、iloc、ix的使用区别
四、数据分组
#1、单条件分组标记列
df_new['group'] = np.where(df_new['age'] > 40,'high','low')
df_new
id | date | city | category | age | price | gender | m-point | pay | group | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N | high |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high |
#2、多条件复合,生成分组标记列
df_new.loc[(df_new['age'] >= 40) & (df_new['price']>= 2000), 'sign']=1
df_new
id | date | city | category | age | price | gender | m-point | pay | group | sign | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 |
五、数据分列
# 1、依据分割符号对某个列进行分列,然后通过index与原df进行合并
split = pd.DataFrame([x.split('-') for x in df_new['category']],index=df_new.index,columns=['category','size'])
df_new = pd.merge(df_new,split,left_index=True,right_index=True)
id | date | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN | 210 | A |
1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 | 110 | A |
数据提取和切片
一、按索引标签值进行提取:loc
#1、按索引标签提取单行
df_new.loc[3]
id 1004
date 2013-01-05 00:00:00
city SHENZHEN
category_x 110-C
age 32
price 5433
gender female
m-point 40
pay Y
group low
sign NaN
category_y 110
size C
Name: 3, dtype: object
#2、按索引标签提取多行
df_new.loc[3:4]#★★注意此处,是取标签值间的
id | date | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN | 210 | A |
#3、恢复顺序索引
df_new.reset_index()
index | id | date | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1001 | 2013-01-02 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
1 | 3 | 1004 | 2013-01-05 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
2 | 5 | 1006 | 2013-01-07 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
3 | 4 | 1005 | 2013-01-06 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN | 210 | A |
4 | 1 | 1002 | 2013-01-03 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
5 | 2 | 1003 | 2013-01-04 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 | 110 | A |
#4、设置日期为索引
df_new = df_new.set_index('date')
id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
2013-01-05 | 1004 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
2013-01-06 | 1005 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN | 210 | A |
2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
2013-01-04 | 1003 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 | 110 | A |
#5、按日期标签提取
df_new.loc[:'2013-01-03']
id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
二、按实际位置提取数据:iloc
#1、按实际位置提取
df_new.iloc[0:2,1:4]
city | category_x | age | |
---|---|---|---|
date | |||
2013-01-02 | BEIJING | 100-A | 23 |
2013-01-05 | SHENZHEN | 110-C | 32 |
#2、按实际位置提取特定数据
df_new.iloc[[1,3,5],1:4]
city | category_x | age | |
---|---|---|---|
date | |||
2013-01-05 | SHENZHEN | 110-C | 32 |
2013-01-06 | SHANGHAI | 210-A | 34 |
2013-01-04 | GUANGZHOU | 110-A | 54 |
三、按标签和位置提取:ix
#1、混合提取,规则:先按标签,再按位置
df_new.ix[:'2013-01-04',:3]
id | city | category_x | |
---|---|---|---|
date | |||
2013-01-02 | 1001 | BEIJING | 100-A |
2013-01-03 | 1002 | SH | 100-B |
2013-01-04 | 1003 | GUANGZHOU | 110-A |
四、按条件提取(值或区域)
#1、直接逻辑条件提取
df_new[df_new['city'].isin(['SH','BEIJING'])]
id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
#2.嵌套函数逻辑值提取
df_new.loc[df_new['city'].isin(['SH','BEIJING'])]
id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
#3、嵌套函数逻辑值提取
df_new.ix[df_new['city'].isin(['SH','BEIJING'])]
id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
五、类似分列的提取方式
#1、观察数值组成结构,选择性提取
df_new['category_x'].str[:3]
date
2013-01-02 100
2013-01-05 110
2013-01-07 130
2013-01-06 210
2013-01-03 100
2013-01-04 110
Name: category_x, dtype: object
数据筛选
# df_new.to_excel('pandas_test2.xls')
df_new = pd.read_excel('pandas_test2.xls')
df_new
date | id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-01-02 | 1001 | BEIJING | 100-A | 23 | 1200.0 | male | 10 | Y | low | NaN | 100 | A |
1 | 2013-01-05 | 1004 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
2 | 2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
3 | 2013-01-06 | 1005 | SHANGHAI | 210-A | 34 | NaN | male | 40 | N | low | NaN | 210 | A |
4 | 2013-01-03 | 1002 | SH | 100-B | 44 | NaN | female | 12 | N | high | NaN | 100 | B |
5 | 2013-01-04 | 1003 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 | 110 | A |
按条件筛选(或与非)
#1、与条件联合筛选
df_new[(df_new['age']>30) & (df_new['category_y']>200)][['city','age','category_y']]
df_new.loc[ (df_new['age']>30) & (df_new['category_y']>200),['city','age','category_y']]
city | age | category_y | |
---|---|---|---|
3 | SHANGHAI | 34 | 210 |
#2、或条件联合筛选
df_new[(df_new['city']=='BEIJING')|(df_new['age']>40)][['city','age']]
df_new.loc[(df_new['city']=='BEIJING')|(df_new['age']>40),['city','age']]
city | age | |
---|---|---|
0 | BEIJING | 23 |
2 | BEIJING | 32 |
4 | SH | 44 |
5 | GUANGZHOU | 54 |
#3、非条件筛选
df_new[df_new['city']!='BEIJING'][['city','age']]
city | age | |
---|---|---|
1 | SHENZHEN | 32 |
3 | SHANGHAI | 34 |
4 | SH | 44 |
5 | GUANGZHOU | 54 |
#4、通用筛选函数,简化代码
df_new.query('age > 30 & city == ["BEIJING"]')#★★注意括号里的引号要用双引号
date | id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 2013-01-07 | 1006 | BEIJING | 130-F | 32 | 4432.0 | female | 40 | Y | low | NaN | 130 | F |
#5、类似sumif、countif功能
df_new[df_new['city']!='BEIJING'][['city','age']].age.sum()#求某列和
df_new[df_new['city']=='BEIJING'][['city','age']].city.count()#求某列技术
2
数据汇总
分类汇总:ground by
#1、指定字段或方式进行汇总,取字段,然后计数或求和
df_new.groupby('pay')['age'].count()
pay
N 2
Y 4
Name: age, dtype: int64
#2、多字段多级别分组进行汇总 df_new.groupby([‘pay’,’size’])[‘age’].count()
#3、多级别分组后,可多级运算
a = df_new.groupby(['pay','size'])['age']
a.agg([np.sum,np.mean,len])#agg函数,可添加多个函数
sum | mean | len | ||
---|---|---|---|---|
pay | size | |||
N | A | 34 | 34.0 | 1 |
B | 44 | 44.0 | 1 | |
Y | A | 77 | 38.5 | 2 |
C | 32 | 32.0 | 1 | |
F | 32 | 32.0 | 1 |
数据透视 pivot_table
#1、数据透视表,设置行、列、数据值、运算函数、空值填充
pd.pivot_table(df_new,index=['city'],columns=['size'],values='price',aggfunc=[len,np.mean,np.sum],fill_value=0)
len | mean | sum | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
size | A | B | C | F | A | B | C | F | A | B | C | F |
city | ||||||||||||
BEIJING | 1 | 0 | 0 | 1 | 1200 | 0 | 0 | 4432 | 1200 | 0 | 0 | 4432 |
GUANGZHOU | 1 | 0 | 0 | 0 | 2133 | 0 | 0 | 0 | 2133 | 0 | 0 | 0 |
SH | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SHANGHAI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SHENZHEN | 0 | 0 | 1 | 0 | 0 | 0 | 5433 | 0 | 0 | 0 | 5433 | 0 |
数据统计
数据抽样 sample
#1、数据简单随机抽样
df_new.sample(2,replace=False)#重要参数是否放回,即重复或不重复抽样,默认不重复
date | id | city | category_x | age | price | gender | m-point | pay | group | sign | category_y | size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 2013-01-04 | 1003 | GUANGZHOU | 110-A | 54 | 2133.0 | male | 20 | Y | high | 1.0 | 110 | A |
1 | 2013-01-05 | 1004 | SHENZHEN | 110-C | 32 | 5433.0 | female | 40 | Y | low | NaN | 110 | C |
描述统计 Describe
#2、描述性统计
df_new.describe().round(2).T#统计后去二位小数,并转置
E:\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 6.0 | 1003.50 | 1.87 | 1001.0 | 1002.25 | 1003.5 | 1004.75 | 1006.0 |
age | 6.0 | 36.50 | 10.88 | 23.0 | 32.00 | 33.0 | 41.50 | 54.0 |
price | 4.0 | 3299.50 | 1966.64 | 1200.0 | NaN | NaN | NaN | 5433.0 |
m-point | 6.0 | 27.00 | 14.63 | 10.0 | 14.00 | 30.0 | 40.00 | 40.0 |
sign | 1.0 | 1.00 | NaN | 1.0 | NaN | NaN | NaN | 1.0 |
category_y | 6.0 | 126.67 | 42.27 | 100.0 | 102.50 | 110.0 | 125.00 | 210.0 |
'''平均数'''
df_new['age'].mean()
'''中位数'''
df_new['age'].median()
'''众数'''
df_new['age'].mode()
#3、标准差
df_new['age'].std()
10.876580344942981
#4、协方差
df_new.cov()#整体间
df_new['age'].cov(df_new['price'])#两个字段间
-2255.833333333333
#5、相关分析系数
df_new.corr()#整体相关系数
df_new['age'].corr(df_new['m-point'])#两个字段间相关
-0.19483296280358281