Python修改数据中的字符串数据列

来源:互联网 发布:施耐德可编程编程软件 编辑:程序博客网 时间:2024/06/08 01:22

有时候,我们想修改数据中的字符串数据列。下面的方法供参考:

  • str.extract()
  • str.upper()
  • str.lower()
  • str.len()
  • str.split()
  • str.replace()

参考实例:

>>>import pandas as pd>>>df = pd.DataFrame([['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],['RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT'],['COHOES CITY SCHOOL DISTRICT'],['COHOES CITY SCHOOL DISTRICT']])>>>df.columns = ['AREA NAME']>>>df----------                                         AREA NAME0  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3                      COHOES CITY SCHOOL DISTRICT4                      COHOES CITY SCHOOL DISTRICT

str.extract()

>>>df['AREA NAME'].str.extract('(\w+)')#提取AREA NAME列字符串中的第一个单词----------0 RAVENA1 RAVENA2 RAVENA3 COHOES4 COHOESName: AREA NAME, dtype: object>>>df['AREA NAME'].str.extract('(\w+)\s(\w+)')#将AREA NAME列中的前两个单词分别提取为单独的列----------       0         10  RAVENA  COEYMANS1  RAVENA  COEYMANS2  RAVENA  COEYMANS3  COHOES      CITY4  COHOES      CITY

str.upper()

>>>df['AREA NAME'].str.upper()#因为数据已经是大写,所以没有改变----------0  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2  RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3                      COHOES CITY SCHOOL DISTRICT4                      COHOES CITY SCHOOL DISTRICTName: AREA NAME, dtype: object

str.lower()

>>>df['AREA NAME'].str.lower()#将AREA NAME列中的字符串转化为小写----------0    ravena coeymans selkirk central school district1    ravena coeymans selkirk central school district2    ravena coeymans selkirk central school district3                        cohoes city school district4                        cohoes city school districtName: AREA NAME, dtype: object

str.len()

>>>df['AREA NAME'].str.len()#AREA NAME列中每个元素的长度----------0    471    472    473    274    27Name: AREA NAME, dtype: int64

str.split()

>>>df['AREA NAME'].str.split(' ')#用空格分割AREA NAME列中的字符串----------0    [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...1    [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...2    [RAVENA, COEYMANS, SELKIRK, CENTRAL, SCHOOL, D...3                     [COHOES, CITY, SCHOOL, DISTRICT]4                     [COHOES, CITY, SCHOOL, DISTRICT]dtype: object

str.replace()

>>>df['AREA NAME'].str.replace('DISTRICT$', 'DIST')#将AREA NAME列中每个元素末尾的DISTRICT替换为DIST(此处按正则表达式匹配;pandas 2.0及以上版本需显式传入regex=True)----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DIST3                        COHOES CITY SCHOOL DIST4                        COHOES CITY SCHOOL DISTName: AREA NAME, dtype: object

str.cat()

>>>df['AREA NAME'].str.cat(['a', 'b', 'c', 'd', 'e'],sep=' ')#拼接字符串,在原有字符串后各拼接一个字符串----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT a1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT b2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT c3                        COHOES CITY SCHOOL DISTRICT d4                        COHOES CITY SCHOOL DISTRICT eName: AREA NAME, dtype: object>>>df['AREA NAME'].str.cat([['a', 'b', 'c', 'd', 'e'], ['1', '2', '3', '4', '5']], sep=' ')#在原有字符串后各拼接两个字符串----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRIC...3                      COHOES CITY SCHOOL DISTRICT d 44                      COHOES CITY SCHOOL DISTRICT e 5Name: AREA NAME, dtype: object>>>df['AREA NAME'].str.cat(sep=',')#将某一列拼接成一个完整的字符串----------RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT,COHOES CITY SCHOOL DISTRICT

str.get()

>>>df['AREA NAME'].str.get(0)#获取指定位置的字符串----------0    R1    R2    R3    C4    CName: AREA NAME, dtype: object

str.contains()

>>>df['AREA NAME'].str.contains('RAVENA')#是否包含表达式----------0     True1     True2     True3    False4    FalseName: AREA NAME, dtype: bool

str.pad()

>>>df['AREA NAME'].str.pad(47, fillchar='?')#左补齐----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3    ????????????????????COHOES CITY SCHOOL DISTRICT4    ????????????????????COHOES CITY SCHOOL DISTRICTName: AREA NAME, dtype: object>>>df['AREA NAME'].str.pad(47, side='right', fillchar='?')#右补齐----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3    COHOES CITY SCHOOL DISTRICT????????????????????4    COHOES CITY SCHOOL DISTRICT????????????????????Name: AREA NAME, dtype: object

str.center()

>>>df['AREA NAME'].str.center(47, fillchar=' ')#中间补齐----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3              COHOES CITY SCHOOL DISTRICT          4              COHOES CITY SCHOOL DISTRICT          Name: AREA NAME, dtype: object

str.ljust()

>>>df['AREA NAME'].str.ljust(47, fillchar='?')#右边补齐----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3    COHOES CITY SCHOOL DISTRICT????????????????????4    COHOES CITY SCHOOL DISTRICT????????????????????Name: AREA NAME, dtype: object

str.rjust()

>>>df['AREA NAME'].str.rjust(47, fillchar='?')#左边补齐----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3    ????????????????????COHOES CITY SCHOOL DISTRICT4    ????????????????????COHOES CITY SCHOOL DISTRICTName: AREA NAME, dtype: object

str.zfill()

>>>df['AREA NAME'].str.zfill(47)#左边补0----------0    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT1    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT2    RAVENA COEYMANS SELKIRK CENTRAL SCHOOL DISTRICT3    00000000000000000000COHOES CITY SCHOOL DISTRICT4    00000000000000000000COHOES CITY SCHOOL DISTRICTName: AREA NAME, dtype: object

str.slice()

>>>df['AREA NAME'].str.slice(8,23)#按给定的开始结束位置切割字符串----------0    OEYMANS SELKIRK1    OEYMANS SELKIRK2    OEYMANS SELKIRK3    ITY SCHOOL DIST4    ITY SCHOOL DISTName: AREA NAME, dtype: object

str.slice_replace()

>>>df['AREA NAME'].str.slice_replace(8, 23, '??')#使用给定的字符串,替换指定位置的字符----------0    RAVENA C?? CENTRAL SCHOOL DISTRICT1    RAVENA C?? CENTRAL SCHOOL DISTRICT2    RAVENA C?? CENTRAL SCHOOL DISTRICT3                        COHOES C??RICT4                        COHOES C??RICTName: AREA NAME, dtype: object

str.count()

>>>df['AREA NAME'].str.count('A')#计算给定模式(此处为字符A)出现的次数----------0    41    42    43    04    0Name: AREA NAME, dtype: int64

str.startswith()

>>>df['AREA NAME'].str.startswith('R')#判断是否以给定的字符串开头----------0     True1     True2     True3    False4    FalseName: AREA NAME, dtype: bool

str.endswith()

>>>df['AREA NAME'].str.endswith('T')#判断是否以给定的字符串结束----------0    True1    True2    True3    True4    TrueName: AREA NAME, dtype: bool

str.findall()

>>>df['AREA NAME'].str.findall('[A-D]')#查找所有符合正则表达式的字符,以数组形式返回----------0    [A, A, C, A, C, A, C, D, C]1    [A, A, C, A, C, A, C, D, C]2    [A, A, C, A, C, A, C, D, C]3                [C, C, C, D, C]4                [C, C, C, D, C]Name: AREA NAME, dtype: object

str.match()

>>>df['AREA NAME'].str.match('[A-D]')#检测字符串开头是否匹配给定的正则表达式(并非完全匹配)----------0    False1    False2    False3     True4     TrueName: AREA NAME, dtype: bool

str.isalnum()

>>>df['AREA NAME'].str.isalnum()#是否全部是数字和字母组成----------0    False1    False2    False3    False4    FalseName: AREA NAME, dtype: bool

str.swapcase()

>>>df['AREA NAME'].str.swapcase()#大小写互换----------0    ravena coeymans selkirk central school district1    ravena coeymans selkirk central school district2    ravena coeymans selkirk central school district3                        cohoes city school district4                        cohoes city school districtName: AREA NAME, dtype: object
原创粉丝点击