通过使用python来处理数据的技巧持续更新:【内向即失败--王奕君】

来源:互联网 发布:基站定位软件 编辑:程序博客网 时间:2024/06/05 19:37
# -*- coding: utf-8 -*-
"""Small helpers for reshaping lists, dicts and strings used while scraping."""
import collections
import heapq
import re
import time
from collections import Counter

# The project-local and third-party imports below keep their original relative
# order; NOTE(review): grequests is understood to apply gevent monkey-patching
# at import time, so reordering it could change behaviour — confirm before
# regrouping.
from function.data_tool.extract_data import extract_index
from furl import furl
import grequests
from function.data_tool import transform_data


def padding_list(tabulation: list, length: int) -> list:
    """Pad ``tabulation`` in place with empty strings up to ``length`` items.

    A list that already holds ``length`` or more items is left untouched.

    Returns:
        The same list object that was passed in.
    """
    # A negative shortfall multiplies to an empty list, so nothing is added.
    tabulation.extend([''] * (length - len(tabulation)))
    return tabulation


def cut_onelength(tabulation):
    """Drop every element whose string form is at most one character long.

    Args:
        tabulation: a list, or a string holding the Python source of a list
            (for example ``"['a', 'bc']"``).

    Returns:
        A new filtered list, or ``None`` when ``tabulation`` is neither a
        list nor a string.
    """
    if isinstance(tabulation, str):
        # NOTE(review): eval() executes arbitrary code. Only pass trusted
        # strings; ast.literal_eval would be the safe replacement if the
        # inputs are always plain literals.
        tabulation = eval(tabulation)
    elif not isinstance(tabulation, list):
        return None
    return [item for item in tabulation if len(str(item)) > 1]


def adjoin_data(tabulation: list) -> list:
    """Group consecutive elements into 2-tuples.

    When the input has an odd number of elements, the last tuple is completed
    with the placeholder string ``'dirty_data'``.

    Example:
        ``['i', 'love', 'you', 1, 3, 1, 4, 'wu', 'er']`` becomes
        ``[('i', 'love'), ('you', 1), (3, 1), (4, 'wu'), ('er', 'dirty_data')]``.
    """
    # Work on a copy so the placeholder never leaks into the caller's list.
    items = list(tabulation)
    if len(items) % 2:
        items.append('dirty_data')
    return list(zip(items[::2], items[1::2]))


def count_data(tabulation):
    """Count occurrences in a list or a string.

    Args:
        tabulation: a list (elements are counted by their ``str()`` form) or a
            string (each character is counted).

    Returns:
        A ``collections.Counter``, or ``None`` for any other input type.
    """
    if isinstance(tabulation, list):
        return Counter(str(item) for item in tabulation)
    if isinstance(tabulation, str):
        return Counter(tabulation)
    return None


def sort_length(tabulation: list, reverse: bool = True) -> list:
    """Return a new list sorted by the length of each element's string form.

    Args:
        tabulation: the elements to sort.
        reverse: longest first when true (the default), shortest first
            otherwise.
    """
    return sorted(tabulation, key=lambda tab: len(str(tab)), reverse=reverse)


def twolist_todict(one: list, two: list):
    """Zip two lists into an ``OrderedDict`` (``one`` -> keys, ``two`` -> values).

    Pairing stops at the shorter of the two lists.
    """
    return collections.OrderedDict(zip(one, two))


def define_data(text: str, container: list, one: list = None, two: list = None) -> str:
    r"""Apply a sequence of single-occurrence replacements to ``text``.

    Each dict in ``container`` contributes one ``{old: new}`` pair (its last
    inserted item); the first remaining occurrence of ``old`` in ``text`` is
    replaced by ``new``. The dicts in ``container`` are not modified.

    If ``one`` is non-empty, ``extract_index(text, one)`` is called and, for
    every index ``i`` it yields, the first occurrence of ``one[i]`` is replaced
    by ``two[i]``. (Presumably ``extract_index`` returns the positions in
    ``one`` whose values occur in ``text`` — confirm against that helper.)

    Example:
        text = r'2017-11-12-Sunday\PM\13-01-46-'
        container = [{'-': '年'}, {'-': '月'}, {'-': '日'},
                     {'-': '点'}, {'-': '分'}, {'-': '秒'}]
        result: r'2017年11月12日Sunday\PM\13点01分46秒'

    Raises:
        KeyError: if a dict in ``container`` is empty.
    """
    for mapping in container:
        # popitem() on a copy reads the last pair without emptying the
        # caller's dict, so the same container can be reused across calls.
        old, new = mapping.copy().popitem()
        text = text.replace(old, new, 1)
    if one:
        for i in extract_index(text, one):
            text = text.replace(one[i], two[i], 1)
    return text


def str_split(text: str) -> list:
    """Split ``text`` on runs of whitespace.

    Example: ``'周日 周六'`` becomes ``['周日', '周六']``.
    """
    return text.split()


def len_twolist(text) -> int:
    """Return the total number of elements in a list of lists.

    Example: ``[['1', '2', '3'], ['4', '5', '6']]`` gives ``6``.
    """
    return sum(1 for inner in text for _ in inner)


def len_dictlist(text: dict) -> int:
    """Return the total number of elements across all list values of a dict.

    Example: ``{'a': ['1', '2'], 'b': ['3', '4']}`` gives ``4``.
    """
    # The count is returned, as the name and example promise; use
    # valuelist_tolist() when the flattened elements themselves are needed.
    return sum(1 for values in text.values() for _ in values)


def valuelist_tolist(text: dict) -> list:
    """Flatten all list values of a dict into one list, in dict order.

    Example: ``{'a': ['1', '2'], 'b': ['3', '4']}`` gives
    ``['1', '2', '3', '4']``.
    """
    return [item for values in text.values() for item in values]


def reform_url(container: list) -> list:
    """Build 58.com listing URLs from a city, a category and filter segments.

    Args:
        container: ``[city, position, filter_1, filter_2, ...]``. ``city`` is
            the sub-domain (e.g. ``'shouguang'``), ``position`` the category
            segment (e.g. ``'qzbaoxianxuqi'``), and every remaining entry is
            one filter segment (e.g. ``'pve_5569_0'``). The list is not
            modified.

    Returns:
        One URL string per filter segment.

    Raises:
        IndexError: if ``container`` has fewer than two entries.
    """
    # TODO: flagged by the original author as needing improvement.
    city = container[0] + '.58.com'
    position = container[1]
    filters = container[2:]

    site = furl('http://shouguang.58.com').set(netloc=city)
    urls = []
    for segment in filters:
        # The segments are written into the URL fragment and the leading '#'
        # is then swapped for '/', turning the fragment into a path.
        site.fragment.path.segments = [position, segment]
        urls.append(site.url.replace('#', '/', 1))
    return urls


def treelist_tolist(container: dict, number=3) -> list:
    """Combine the list values of ``container`` into every possible pairing.

    Args:
        container: dict whose values are lists of strings. It must hold
            exactly three values when ``number == 3`` and exactly two
            otherwise.
        number: selects the output format (see Returns).

    Returns:
        * ``number == 3``: one gkcx.eol.cn query URL per combination, with the
          first list feeding ``province``, the second ``recomschprop`` and the
          third ``argschtype``.
        * otherwise: ``x + '_' + y`` for every ``x`` in the first list and
          ``y`` in the second.

    Raises:
        ValueError: if the number of values does not match ``number``.
    """
    if number == 3:
        a, b, c = list(container.values())
        return [
            'http://gkcx.eol.cn/soudaxue/queryProvinceScore.html?&argschtype='
            + z + '&recomschprop=' + y + '&province=' + x
            for x in a for y in b for z in c
        ]
    a, b = list(container.values())
    return [x + '_' + y for x in a for y in b]


# Original author's note: "used for checks".
def fetch_one(container: list, number: int = 0) -> list:
    """Return element ``number`` of every item in ``container``."""
    return [item[number] for item in container]


def scan_url(urls: list, judge: str = '中鹰黑森林万杰') -> list:
    """Fetch ``urls`` concurrently and report pages lacking the ``judge`` text.

    Each response body is passed through ``transform_data.correct_encode``
    before the check. Requests that failed (``grequests.map`` yields ``None``
    for them) are skipped.

    NOTE(review): the original author's note says pages that *contain* the
    marker should be collected, but the code collects those that do *not*.
    The existing behaviour is kept — confirm the intent.
    TODO: original author planned further tuning, e.g. adding a thread pool.

    Returns:
        The URLs of the successful responses whose text does not contain
        ``judge``.
    """
    pending = [grequests.get(u) for u in urls]
    missing = []
    for response in grequests.map(pending):
        if response is None:
            continue
        if judge not in transform_data.correct_encode(response.text):
            missing.append(response.url)
    return missing
阅读全文
0 0
原创粉丝点击