dataframe实践
来源:互联网 发布:六壬排盘软件 编辑:程序博客网 时间:2024/05/19 09:16
"""1:将抓取的红色标记字符串中,sold数字提取2:end model 分组3 按照年份分类汇总 ,汇总方式有:ebay_id 均值,sold和,个数最多"""import pandas as pdfrom collections import Counterimport redf = pd.DataFrame(pd.read_excel("make_model_市场年代1108.xlsx"))#处理sold字段--------dig_sold = pd.DataFrame((re.findall("(\d{1,}) Sold", str(x)) for x in df["sold"]),columns=["dig_sold"], index=df.index)df = pd.merge(df,dig_sold,left_index=True , right_index= True) #按索引合并df.loc[df["dig_sold"].isnull(),"dig_sold"] = 0df.loc[:,"dig_sold"] = df["dig_sold"].astype("int") #强制转化为int#---------------end_models = df[["vio_end","model"]].drop_duplicates() #去重result = []for models in end_models.values: vio_end = models[0] model = models[1] print(model) subdf = df.loc[(df.model == model) & (df.vio_end == vio_end),["vio_end","model","ebay_id","dig_sold","ebay_end"]].dropna(how="any") subdf.loc[:, "ebay_end"] = subdf["ebay_end"].astype("int") # 强制转化为int # print(subdf.info()) #对年份分类汇总,并按照sold和计算,返回最大和的年份 gsold = subdf.groupby("ebay_end",as_index=False)["dig_sold"].sum() #Dataframe sold_year = gsold.sort_values(by="dig_sold",ascending=False).values[0][0] #----- #返回ebay_id均值最小的年份 gid = subdf.groupby("ebay_end", as_index= False)["ebay_id"].mean() id_year = int(gid.sort_values(by="ebay_id").values[0][0]) # print(id_year) glen = subdf.groupby("ebay_end", as_index = False)["ebay_end"].agg([len]) # print(glen) glen= glen.sort_values(by="len",ascending=False) first_year = glen.index[0] first_count = round(glen.values[0][0] / len(subdf),2) years =[vio_end , model ,len(subdf), sold_year , id_year,first_year , first_count ] if len(glen) > 1 : second_year = glen.index[1] second_count = round(glen.values[1][0] / len(subdf),2) years.extend([second_year,second_count]) # print(first_year , first_count , second_year, second_count) result.append(years)df = pd.DataFrame(result,columns=["vio_end","model","sum","sold预测","id_预测","统计预测1","统计1比率","统计预测2","统计2比率"])df.to_excel("make_model_ebayno_市场统计1108.xlsx",index=False)
import pandas as pddf = pd.DataFrame(pd.read_excel("系统sku的fitment信息1108.xlsx"))geration_df = pd.DataFrame(pd.read_excel("车型最后代系整理1030.xlsx"))vio_model = df["model"]ge_models = geration_df["model"].astype(str) #更改数据类型print(geration_df.info())print(df.info())df.loc[df["product_sku"].str.find("RN")!= -1 , "note"] = "通用车型" # 在字符串中查找子串df.loc[df["vio_end"] < 2013, "note"] = "2013前"nulldf = df[df["note"].isnull()] #提取为空的, 未处理的vio_modles = nulldf[["model","vio_end"]].drop_duplicates().sort_values(by="model").values #删除重复项,并按列值排序for end_model in vio_modles: model = end_model[0] vio_end = end_model[1] subdf = nulldf[(df.model == model) & (df.vio_end == vio_end)] index = subdf.index strmodel = str(model) if strmodel not in ge_models.values: print(model) print("nomodel") df.loc[index,"note"] = "no车型" continue ge_start = geration_df.loc[geration_df.model.astype(str) == strmodel , "start"].values[0] ge_end = geration_df.loc[geration_df.model.astype(str) == strmodel, "end"].values[0] # print(ge_start , ge_end) df.loc[index,"最后代系"]= "-".join([str(ge_start),str(ge_end)]) if (vio_end == ge_end): df.loc[index,"note"] ="同代" elif vio_end >= ge_start and vio_end < ge_end: df.loc[index,"note"]="同代扩展" elif vio_end < ge_start: if vio_end + 1 == ge_start: df.loc[index,"note"]="重启一代" else: df.loc[index,"note"]="前代扩展"df.to_excel("晴雨挡最后代系分析1108.xlsx",columns=["product_id","product_sku","product_name","make","model","vio_start","vio_end","note","最后代系"], index = False)
阅读全文
0 0
- dataframe实践
- Python之Pandas中Series、DataFrame实践
- python脚本实践——python批量读取txt文件为DataFrame
- dataFrame操作
- sparkSQL:dataframe
- DataFrame案例
- Pandas DataFrame
- Pandas(DataFrame)
- DataFrame 排序
- Series && DataFrame
- sparksql dataframe
- SparkSQL---DataFrame
- pandas-dataframe
- DataFrame操作
- dataframe sample
- DataFrame格式化和RDD/Dataset/DataFrame互转
- Python:dataframe.to_sql和dataframe.read_sql
- pandas.DataFrame.any与pandas.DataFrame.all
- Java里面Calendar获取上个月最后一天
- Centos下查看cpu、磁盘、内存使用情况以及如何清理内存
- MySQL数据表的切分原则
- eazyui学习总章
- redis详细入门
- dataframe实践
- 当你纠结某个问题很难解决的时候,很可能是你的设计出了问题
- ActiveMq--03
- 2017.11.06
- java日期操作常见使用
- 正则表达式_小记
- Oracle修改字段类型方法总结
- [WARNING] 'dependencies.dependency.(groupId:artifactId:type:classifier)' must be unique
- javax.net.ssl.SSLException: Read error: ssl=0xdeae5100: I/O error during system call, Connection res