数据分析第一篇:Pandas基础

2017/07/07 数据分析

数据分析第一篇:Pandas基础

import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

数据导出导入

'''1、创建数据表'''
df = pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
                   "date":pd.date_range('20130102', periods=6),
                   "city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
                   "age":[23,44,54,32,34,32],
                   "category":['100-A','100-B','110-A','110-C','210-A','130-F'],
                   "price":[1200,np.nan,2133,5433,np.nan,4432]},
                  columns =['id','date','city','category','age','price'])
df
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
'''2、导出数据表'''
# df.to_excel('pandas_test.xls')
df.to_csv('pandas_test.csv')

'''3、导入数据表'''
df = pd.read_csv('pandas_test.csv',index_col=1)#可指定索引
df = pd.read_excel('pandas_test.xls',header=0)#可指定表头

数据查看

'''1、数据维度:数据由几行几列'''
df.shape
(6, 6)
'''2、数据信息:数据维度、列名称、数据格式和所占空间等'''
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 288.0+ bytes
'''3、数据格式'''
df.dtypes
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object
'''4、查看空值,可以对整个数据表,也可以单独对某一列进行空值检查'''
df.isnull()
df['price'].isnull()
df['price'].isnull().value_counts()#可查看到底有几个空值出现
df.isnull.sum() #直接查看所有特征是否存在空值
0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool
'''5、去重复查看所有唯一值,只能对数据表中的某一列进行,类似去重复功能'''
a = df['age']
a.unique()#两种方法
a.drop_duplicates()
array([23, 44, 54, 32, 34], dtype=int64)
'''6、查看数据值,不包含表头'''
df.values
array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,
        1200.0],
       [1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],
       [1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A', 54,
        2133.0],
       [1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,
        5433.0],
       [1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,
        nan],
       [1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,
        4432.0]], dtype=object)
'''7、查看列名'''
df.columns
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
'''8、查看前几行数据,默认前10行'''
df.head(3)
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
'''9、查看后几行数据,默认后10行'''
df.tail(3)
id date city category age price
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0

数据清洗

一、处理空值

'''1、删除空值'''
df.dropna(how='any')#参数任意位置为空
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
#2、填充空值,用特定值如0或均值
df.fillna(df['price'].mean())#用均值填充
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 3299.5
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 3299.5
5 1006 2013-01-07 BEIJING 130-F 32 4432.0

二、处理空格

#1、清理空格,清除字符中空格
df['city'] = df['city'].map(str.strip)
0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

三、大写小写转化

#1、大写转小写
df['city'].str.lower()#转小写
df['city'] =df['city'].str.upper()#转大写
0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

四、改变数据格式

#1、修改数据格式
df['age'].astype('int')
0    23
1    44
2    54
3    32
4    34
5    32
Name: age, dtype: int32

五、更改列名称

#1、更改列名称
df.rename(columns={'age':'age_new'})
id date city category age_new price
0 1001 2013-01-02 BEIJING 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0

六、删除重复值

#1、删除重复值,可选择保留最先或者最后的值
df['city'].drop_duplicates()#默认
df['city'].drop_duplicates(keep='last')
1             SH
2     GUANGZHOU
3       SHENZHEN
4       SHANGHAI
5       BEIJING
Name: city, dtype: object

七、数据修改或替换

'''非中文时可直接使用,中文时需.str.replace方可成功'''
df['city'].replace('SH','BEIJING')
0      BEIJING
1      BEIJING
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

数据预处理

df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
                  "gender":['male','female','male','female','male','female','male','female'],
                  "pay":['Y','N','Y','Y','N','Y','N','Y',],
                  "m-point":[10,12,20,40,40,40,30,20]})

一、数据合并

#1、数据合并
df_new = pd.merge(df,df1,how='inner')#共有列名匹配合并,不匹配的丢弃
df_new = pd.merge(df,df1,how='left')#左合并,保留df
df_new = pd.merge(df,df1,how='right')#右合并,保留df1
df_new = pd.merge(df,df1,how='outer')#全合并,全部保留df,df1

二、设置索引列

#1、将某列设置为索引,该列数据不能重复
df_new.set_index('id')
date city category age price gender m-point pay
id
1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y
1002 2013-01-03 SH 100-B 44 NaN female 12 N
1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y
1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y
1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N
1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y

三、排序

#1、按索引排序
df_new.sort_index()
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y
1 1002 2013-01-03 SH 100-B 44 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y
#1、按数值排序
df_new = df_new.sort_values(by=['age','id'])#可设置多级排序
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N
1 1002 2013-01-03 SH 100-B 44 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y

#★★ 应添加数据列的新增、删除、修改 #应添加数据列划分,loc、iloc、ix的使用区别

四、数据分组
#1、单条件分组标记列
df_new['group'] = np.where(df_new['age'] > 40,'high','low')
df_new
id date city category age price gender m-point pay group
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y low
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y low
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y low
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N low
1 1002 2013-01-03 SH 100-B 44 NaN female 12 N high
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y high
#2、多条件复合,生成分组标记列
df_new.loc[(df_new['age'] >= 40) & (df_new['price']>= 2000), 'sign']=1
df_new
id date city category age price gender m-point pay group sign
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y low NaN
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y low NaN
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N low NaN
1 1002 2013-01-03 SH 100-B 44 NaN female 12 N high NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0
五、数据分列
# 1、依据分割符号对某个列进行分列,然后通过index与原df进行合并
split = pd.DataFrame([x.split('-') for x in df_new['category']],index=df_new.index,columns=['category','size'])
df_new = pd.merge(df_new,split,left_index=True,right_index=True)
id date city category_x age price gender m-point pay group sign category_y size
0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N low NaN 210 A
1 1002 2013-01-03 SH 100-B 44 NaN female 12 N high NaN 100 B
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0 110 A

数据提取和切片

一、按索引标签值进行提取:loc

#1、按索引标签提取单行
df_new.loc[3]
id                           1004
date          2013-01-05 00:00:00
city                     SHENZHEN
category_x                  110-C
age                            32
price                        5433
gender                     female
m-point                        40
pay                             Y
group                         low
sign                          NaN
category_y                    110
size                            C
Name: 3, dtype: object
#2、按索引标签提取多行
df_new.loc[3:4]#★★注意此处,是取标签值间的
id date city category_x age price gender m-point pay group sign category_y size
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N low NaN 210 A
#3、恢复顺序索引
df_new.reset_index()
index id date city category_x age price gender m-point pay group sign category_y size
0 0 1001 2013-01-02 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
1 3 1004 2013-01-05 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C
2 5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
3 4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N low NaN 210 A
4 1 1002 2013-01-03 SH 100-B 44 NaN female 12 N high NaN 100 B
5 2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0 110 A
#4、设置日期为索引
df_new = df_new.set_index('date')
id city category_x age price gender m-point pay group sign category_y size
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C
2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
2013-01-06 1005 SHANGHAI 210-A 34 NaN male 40 N low NaN 210 A
2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B
2013-01-04 1003 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0 110 A
#5、按日期标签提取
df_new.loc[:'2013-01-03']
id city category_x age price gender m-point pay group sign category_y size
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B

二、按实际位置提取数据:iloc

#1、按实际位置提取
df_new.iloc[0:2,1:4]
city category_x age
date
2013-01-02 BEIJING 100-A 23
2013-01-05 SHENZHEN 110-C 32
#2、按实际位置提取特定数据
df_new.iloc[[1,3,5],1:4]
city category_x age
date
2013-01-05 SHENZHEN 110-C 32
2013-01-06 SHANGHAI 210-A 34
2013-01-04 GUANGZHOU 110-A 54

三、按标签和位置提取:ix

#1、混合提取,规则:先按标签,再按位置
df_new.ix[:'2013-01-04',:3]
id city category_x
date
2013-01-02 1001 BEIJING 100-A
2013-01-03 1002 SH 100-B
2013-01-04 1003 GUANGZHOU 110-A

四、按条件提取(值或区域)

#1、直接逻辑条件提取
df_new[df_new['city'].isin(['SH','BEIJING'])]
id city category_x age price gender m-point pay group sign category_y size
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B
#2.嵌套函数逻辑值提取
df_new.loc[df_new['city'].isin(['SH','BEIJING'])]
id city category_x age price gender m-point pay group sign category_y size
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B
#3、嵌套函数逻辑值提取
df_new.ix[df_new['city'].isin(['SH','BEIJING'])]
id city category_x age price gender m-point pay group sign category_y size
date
2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B

五、类似分列的提取方式

#1、观察数值组成结构,选择性提取
df_new['category_x'].str[:3]
date
2013-01-02    100
2013-01-05    110
2013-01-07    130
2013-01-06    210
2013-01-03    100
2013-01-04    110
Name: category_x, dtype: object

数据筛选

# df_new.to_excel('pandas_test2.xls')
df_new = pd.read_excel('pandas_test2.xls')
df_new
date id city category_x age price gender m-point pay group sign category_y size
0 2013-01-02 1001 BEIJING 100-A 23 1200.0 male 10 Y low NaN 100 A
1 2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C
2 2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
3 2013-01-06 1005 SHANGHAI 210-A 34 NaN male 40 N low NaN 210 A
4 2013-01-03 1002 SH 100-B 44 NaN female 12 N high NaN 100 B
5 2013-01-04 1003 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0 110 A

按条件筛选(或与非)

#1、与条件联合筛选
df_new[(df_new['age']>30) & (df_new['category_y']>200)][['city','age','category_y']]
df_new.loc[ (df_new['age']>30) & (df_new['category_y']>200),['city','age','category_y']]
city age category_y
3 SHANGHAI 34 210
#2、或条件联合筛选
df_new[(df_new['city']=='BEIJING')|(df_new['age']>40)][['city','age']]
df_new.loc[(df_new['city']=='BEIJING')|(df_new['age']>40),['city','age']]
city age
0 BEIJING 23
2 BEIJING 32
4 SH 44
5 GUANGZHOU 54
#3、非条件筛选
df_new[df_new['city']!='BEIJING'][['city','age']]
city age
1 SHENZHEN 32
3 SHANGHAI 34
4 SH 44
5 GUANGZHOU 54
#4、通用筛选函数,简化代码
df_new.query('age > 30 & city == ["BEIJING"]')#★★注意括号里的引号要用双引号
date id city category_x age price gender m-point pay group sign category_y size
2 2013-01-07 1006 BEIJING 130-F 32 4432.0 female 40 Y low NaN 130 F
#5、类似sumif、countif功能
df_new[df_new['city']!='BEIJING'][['city','age']].age.sum()#求某列和
df_new[df_new['city']=='BEIJING'][['city','age']].city.count()#求某列技术
2

数据汇总

分类汇总:ground by

#1、指定字段或方式进行汇总,取字段,然后计数或求和
df_new.groupby('pay')['age'].count()
pay
N    2
Y    4
Name: age, dtype: int64

#2、多字段多级别分组进行汇总 df_new.groupby([‘pay’,’size’])[‘age’].count()

#3、多级别分组后,可多级运算
a = df_new.groupby(['pay','size'])['age']
a.agg([np.sum,np.mean,len])#agg函数,可添加多个函数
sum mean len
pay size
N A 34 34.0 1
B 44 44.0 1
Y A 77 38.5 2
C 32 32.0 1
F 32 32.0 1
数据透视 pivot_table
#1、数据透视表,设置行、列、数据值、运算函数、空值填充
pd.pivot_table(df_new,index=['city'],columns=['size'],values='price',aggfunc=[len,np.mean,np.sum],fill_value=0)
len mean sum
size A B C F A B C F A B C F
city
BEIJING 1 0 0 1 1200 0 0 4432 1200 0 0 4432
GUANGZHOU 1 0 0 0 2133 0 0 0 2133 0 0 0
SH 0 1 0 0 0 0 0 0 0 0 0 0
SHANGHAI 1 0 0 0 0 0 0 0 0 0 0 0
SHENZHEN 0 0 1 0 0 0 5433 0 0 0 5433 0

数据统计

数据抽样 sample

#1、数据简单随机抽样
df_new.sample(2,replace=False)#重要参数是否放回,即重复或不重复抽样,默认不重复
date id city category_x age price gender m-point pay group sign category_y size
5 2013-01-04 1003 GUANGZHOU 110-A 54 2133.0 male 20 Y high 1.0 110 A
1 2013-01-05 1004 SHENZHEN 110-C 32 5433.0 female 40 Y low NaN 110 C

描述统计 Describe

#2、描述性统计
df_new.describe().round(2).T#统计后去二位小数,并转置
E:\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
count mean std min 25% 50% 75% max
id 6.0 1003.50 1.87 1001.0 1002.25 1003.5 1004.75 1006.0
age 6.0 36.50 10.88 23.0 32.00 33.0 41.50 54.0
price 4.0 3299.50 1966.64 1200.0 NaN NaN NaN 5433.0
m-point 6.0 27.00 14.63 10.0 14.00 30.0 40.00 40.0
sign 1.0 1.00 NaN 1.0 NaN NaN NaN 1.0
category_y 6.0 126.67 42.27 100.0 102.50 110.0 125.00 210.0
'''平均数'''
df_new['age'].mean()
'''中位数'''
df_new['age'].median()
'''众数'''
df_new['age'].mode()
#3、标准差
df_new['age'].std()
10.876580344942981
#4、协方差
df_new.cov()#整体间
df_new['age'].cov(df_new['price'])#两个字段间
-2255.833333333333
#5、相关分析系数
df_new.corr()#整体相关系数
df_new['age'].corr(df_new['m-point'])#两个字段间相关
-0.19483296280358281
Show Disqus Comments

Search

    欢迎关注我的微信公众号

    闷骚的程序员

    Table of Contents