Dataquest学习总结[5]

来源:互联网 发布:人卫 网络增值服务 编辑:程序博客网 时间:2024/06/05 16:43

Step 2: Intermediate Python And Pandas/Data Cleaning

>>表格的合并,pandas.concat()  

z = pd.concat([x,y], axis=0)  #合并x和y,y在x下面(对每一列操作得到行)

>>pd.read_csv() 也可以读取txt文件,根据传入的参数,对编码格式和分隔符进行设置

all_survey=pandas.read_csv("schools/survey_all.txt",delimiter="\t",encoding="windows-1252")d75_survey=pandas.read_csv("schools/survey_d75.txt",delimiter="\t",encoding="windows-1252")survey=pandas.concat([all_survey,d75_survey],axis=0)
>>关于pd.DataFrame.apply()的用法:

如果是DataFrame.apply(func) 则传入的func的参数一般是Series,即DataFrame的某一行或列

如果是Series.apply(func1) 则传入func1的参数是某个元素,例子如下:

data['hs_directory']['DBN']=data['hs_directory']['dbn']def padded(ele):    ele=str(ele)    num=len(ele)    if num==1:        ele='0'+ele    return ele data['class_size']["padded_csd"]=data['class_size']['CSD'].apply(padded)data['class_size']['DBN']=data['class_size']['padded_csd']+data['class_size']['SCHOOL CODE']print(data['class_size']['DBN'].head(10))
data是一个存入多个数据集的字典,data['class_size']相当于取出class_size数据集DataFrame

>> pandas.to_numeric() 将字符串类型的数据转换为数值类型

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']for c in cols:    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]print(data['sat_results']['sat_score'].head())
>>对数据集数据进行提取:

#从数据集提取经纬度信息#例:从字符串"1110 Boston Road\nBronx, NY 10456\n(40.8276026690005, -73.90447525699966)"提取出经纬度信息40.8276026690005import redef extract_titude(info):    loc=re.findall(r'\(.+\)',info)    return loc[0].split(',')[0].replace('(','')data['hs_directory']['lat']=data['hs_directory']['Location 1'].apply(extract_titude)    
#提取经度,并转化为数值def extra_lon(info):    loc=re.findall(r'\(.+\)',info)    return loc[0].split(',')[1].replace(')','')data['hs_directory']['lon']=data['hs_directory']['Location 1'].apply(extra_lon)data['hs_directory']['lon']=pd.to_numeric(data['hs_directory']['lon'],errors="coerce")data['hs_directory']['lat']=pd.to_numeric(data['hs_directory']['lat'],errors="coerce")print(data['hs_directory'])

>>对数据集中某列有相同数值的行进行分组 pandas.DataFrame.groupby()   

import numpy as npclass_size=class_size.groupby("DBN")class_size=class_size.agg(np.mean)class_size.reset_index(inplace=True)data['class_size']=class_sizeprint(class_size.head())

>>对不同的DataFrame进行连接pandas.DataFrame.merge() 

有四种方式left, right, inner, outer

combined = data["sat_results"]combined=combined.merge(data['ap_2010'],how='left',on='DBN')combined=combined.merge(data['graduation'],how='left',on='DBN')print(combined.head())print(combined.shape)
>>对连接后的数据集中存在元素为空Nan的情况,进行填充,其中一种就是填充均值pandas.DataFrame.mean()  pandas.DataFrame.fillna()  :

ave=combined.mean()combined=combined.fillna(ave)combined=combined.fillna(0)print(combined.head())

>>求解DataFrame中列与列的相关系数pandas.DataFrame.corr()  :

correlations=combined.corr()correlations=correlations['sat_score']print(correlations)
>>利用DataFrame结构绘制散点图 pandas.DataFrame.plot()  :

import matplotlib.pyplot as pltcombined.plot.scatter(y='sat_score',x='total_enrollment')plt.show()
>>根据各个学校的经纬度坐标绘制地形图:

from mpl_toolkits.basemap import Basemapm = Basemap(    projection='merc',     llcrnrlat=40.496044,     urcrnrlat=40.915256,     llcrnrlon=-74.255735,     urcrnrlon=-73.700272,    resolution='i')m.drawmapboundary(fill_color='#85A6D9')m.drawcoastlines(color='#6D5F47', linewidth=.4)m.drawrivers(color='#6D5F47', linewidth=.4)longitudes=combined['lon'].tolist()latitudes=combined['lat'].tolist()m.scatter(longitudes,latitudes,s=20,zorder=2,latlon=True)plt.show()

将scatter作图段改为其他:m.scatter(longitudes,latitudes,s=20,zorder=2,latlon=True,c=combined['ell_percent'],cmap="summer")


>>关于进一步分析 Analyzing NYC High School Data的代码在:https://github.com/dataquestio/solutions/blob/master/Mission217Solutions.ipynb 


Guided Project: Star Wars Survey部分工作,数据集their GitHub repository   ;代码the project's GitHub repository:

用到的函数: pandas.Series.map() pandas.Series.value_counts()   

ser = ["Yes", "No", NaN, "Yes"]yes_no = {"Yes": True,"No": False}ser = ser.map(yes_no)>>[True, False, NaN, True]
pandas.DataFrame.rename()  给columns重命名

star_wars = star_wars.rename(columns={ "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1"})
pandas.DataFrame.astype()进行类型转换

该工程的代码:

# -*- coding: utf-8 -*-"""Spyder Editorhttps://github.com/dataquestio/solutions/blob/master/Mission201Solution.ipynbThis is a temporary script file."""import pandas as pdstar_wars = pd.read_csv("StarWars.csv", encoding="ISO-8859-1")print(star_wars.head(10))print(star_wars.columns)star_wars=star_wars[pd.notnull(star_wars['RespondentID'])]print(star_wars.head(10))yes_no={'Yes':True,'No':False}star_wars["Have you seen any of the 6 films in the Star Wars franchise?"]=star_wars["Have you seen any of the 6 films in the Star Wars franchise?"].map(yes_no)star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"]=star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"].map(yes_no)#Convert each column above so that it only contains the values True and False,star_wars.columns[3:9]movie_bool={    "Star Wars: Episode I  The Phantom Menace": True,    "Star Wars: Episode II  Attack of the Clones":True,    "Star Wars: Episode III  Revenge of the Sith":True,    "Star Wars: Episode IV  A New Hope":True,    "Star Wars: Episode V The Empire Strikes Back":True,    "Star Wars: Episode VI Return of the Jedi":True,    None: False}def con_vt(s):    return s.map(movie_bool)star_wars.iloc[:,3:9]=star_wars.iloc[:,3:9].apply(con_vt)#for col in star_wars.columns[3:9]:#    star_wars[col]=star_wars[col].map(movie_bool)#print(star_wars.head(10))#Rename each of the above columnscolumn_cov={        "Which of the following Star Wars films have you seen? Please select all that apply.":"seen_1",        "Unnamed: 4": "seen_2",        "Unnamed: 5": "seen_3",        "Unnamed: 6": "seen_4",        "Unnamed: 7": "seen_5",        "Unnamed: 8": "seen_6" }star_wars=star_wars.rename(columns=column_cov)print(star_wars.head(10))#Convert each of the columns star_wars.columns[9:15] to a float typestar_wars.iloc[:,9:15]=star_wars.iloc[:,9:15].astype(float)#Give each column a more descriptive name. We suggest ranking_1, ranking_2, and so onrank_cov={        "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.":"ranking_1",        "Unnamed: 10":"ranking_2",        "Unnamed: 11":"ranking_3",        "Unnamed: 12":"ranking_4",        "Unnamed: 13":"ranking_5",        "Unnamed: 14":"ranking_6",        }star_wars=star_wars.rename(columns=rank_cov)print(star_wars.iloc[:,9:15])#compute the mean of each of the ranking columns and bar the valuemean_value=star_wars.iloc[:,9:15].mean()import matplotlib.pyplot as pltmean_value.plot.bar()plt.show()#compute the sum of each of the seen columns and bar the valuesum_value=star_wars.iloc[:,3:9].sum()sum_value.plot.bar()plt.show()
0 0