dataframe实践

来源:互联网 发布:六壬排盘软件 编辑:程序博客网 时间:2024/05/19 09:16



"""1:将抓取的红色标记字符串中,sold数字提取2:end model 分组3 按照年份分类汇总  ,汇总方式有:ebay_id 均值,sold和,个数最多"""import pandas as pdfrom collections import Counterimport redf = pd.DataFrame(pd.read_excel("make_model_市场年代1108.xlsx"))#处理sold字段--------dig_sold = pd.DataFrame((re.findall("(\d{1,}) Sold", str(x)) for x in df["sold"]),columns=["dig_sold"], index=df.index)df = pd.merge(df,dig_sold,left_index=True , right_index= True) #按索引合并df.loc[df["dig_sold"].isnull(),"dig_sold"] = 0df.loc[:,"dig_sold"] = df["dig_sold"].astype("int") #强制转化为int#---------------end_models = df[["vio_end","model"]].drop_duplicates() #去重result = []for models in end_models.values:    vio_end = models[0]    model = models[1]    print(model)    subdf = df.loc[(df.model == model) & (df.vio_end == vio_end),["vio_end","model","ebay_id","dig_sold","ebay_end"]].dropna(how="any")    subdf.loc[:, "ebay_end"] = subdf["ebay_end"].astype("int")  # 强制转化为int    # print(subdf.info())    #对年份分类汇总,并按照sold和计算,返回最大和的年份    gsold = subdf.groupby("ebay_end",as_index=False)["dig_sold"].sum() #Dataframe    sold_year = gsold.sort_values(by="dig_sold",ascending=False).values[0][0]    #-----    #返回ebay_id均值最小的年份    gid = subdf.groupby("ebay_end", as_index= False)["ebay_id"].mean()    id_year = int(gid.sort_values(by="ebay_id").values[0][0])    # print(id_year)    glen = subdf.groupby("ebay_end", as_index = False)["ebay_end"].agg([len])    # print(glen)    glen= glen.sort_values(by="len",ascending=False)    first_year = glen.index[0]    first_count = round(glen.values[0][0] / len(subdf),2)    years =[vio_end , model ,len(subdf), sold_year , id_year,first_year , first_count ]    if len(glen) > 1 :        second_year = glen.index[1]        second_count = round(glen.values[1][0] / len(subdf),2)        years.extend([second_year,second_count])    # print(first_year , first_count , second_year, second_count)    result.append(years)df = pd.DataFrame(result,columns=["vio_end","model","sum","sold预测","id_预测","统计预测1","统计1比率","统计预测2","统计2比率"])df.to_excel("make_model_ebayno_市场统计1108.xlsx",index=False)


import pandas  as pddf = pd.DataFrame(pd.read_excel("系统sku的fitment信息1108.xlsx"))geration_df = pd.DataFrame(pd.read_excel("车型最后代系整理1030.xlsx"))vio_model = df["model"]ge_models = geration_df["model"].astype(str) #更改数据类型print(geration_df.info())print(df.info())df.loc[df["product_sku"].str.find("RN")!= -1 , "note"] = "通用车型"   # 在字符串中查找子串df.loc[df["vio_end"] < 2013, "note"] = "2013前"nulldf = df[df["note"].isnull()] #提取为空的, 未处理的vio_modles = nulldf[["model","vio_end"]].drop_duplicates().sort_values(by="model").values #删除重复项,并按列值排序for end_model in vio_modles:    model = end_model[0]    vio_end = end_model[1]    subdf = nulldf[(df.model == model) & (df.vio_end == vio_end)]    index = subdf.index    strmodel = str(model)    if strmodel not in ge_models.values:        print(model)        print("nomodel")        df.loc[index,"note"] = "no车型"        continue    ge_start = geration_df.loc[geration_df.model.astype(str) == strmodel , "start"].values[0]    ge_end = geration_df.loc[geration_df.model.astype(str) == strmodel, "end"].values[0]    # print(ge_start , ge_end)    df.loc[index,"最后代系"]= "-".join([str(ge_start),str(ge_end)])    if  (vio_end == ge_end):        df.loc[index,"note"] ="同代"    elif vio_end >= ge_start and vio_end < ge_end:        df.loc[index,"note"]="同代扩展"    elif vio_end < ge_start:        if vio_end + 1 == ge_start:            df.loc[index,"note"]="重启一代"        else:            df.loc[index,"note"]="前代扩展"df.to_excel("晴雨挡最后代系分析1108.xlsx",columns=["product_id","product_sku","product_name","make","model","vio_start","vio_end","note","最后代系"], index = False)



原创粉丝点击