python多线程实践

来源:互联网 发布:软件概要设计说明书 编辑:程序博客网 时间:2024/05/21 06:53

背景:

海量数据任务的并发处理,IO较为频繁,所以采用多线程的方式进行处理

方案:

一个主线程进行任务的分发,另外再独立创建一定数量的并发工作线程

代码:

# -*- coding: utf-8 -*-
"""Fetch lyric pages concurrently and record lyricist/composer/signature info.

The job is I/O bound, so it is handled with threads: the main thread feeds
input lines into a bounded queue and a fixed number of worker threads consume
them.

Note: this version of the code only works on Python 3.
"""
__author__ = 'jasonliu'

import codecs
import datetime
import hashlib
import json
import queue  # named differently in Python 2
import re
import sys
import threading
import time
from threading import Thread

import requests

# Patterns searched for in the fetched page text.
# NOTE(review): the character class below lists ':' twice; one of them was
# possibly a full-width colon before the text was pasted -- confirm.
lyricer_pat = r'[作]{0,1}词[::]{1}(.+)'
comp_pat = r'[作]{0,1}曲[::]{1}(.+)'
sign_pat = r'\[mywriting:([^\]]+)\]'

# Locks serialising writes to the two shared output files.
mu = threading.Lock()
sign_mu = threading.Lock()

MS = 20  # number of worker threads to start
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks a worker forever

wq = queue.Queue(maxsize=20)

# Records lyricist / composer information.
composer_lyrics_file = codecs.open("composer_lyrics.txt", 'w', 'utf-8')
sign_file = codecs.open("sign.txt", 'w', 'utf-8')

net_class = "publish"

# Threads started by start_workers(), kept so stop_workers() can join them.
_workers = []


def _process_line(line):
    """Fetch the page for one ``krcid<TAB>scid`` line and log what is found.

    Malformed lines and failed requests are reported on stderr and skipped so
    that a single bad record cannot kill the calling worker thread.
    """
    fields = line.split("\t")
    if len(fields) < 2:
        print('skip malformed line: %r' % line, file=sys.stderr)
        return
    krcid, scid = fields[0], fields[1]

    if net_class == "publish":
        url = "http://you_ip1/id="
    else:
        url = "http://you_ip2/id="
    url = url + str(krcid)

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed for %s: %s' % (url, exc), file=sys.stderr)
        return
    text = response.text

    sign_match = re.search(sign_pat, text)
    if sign_match:
        sign_content = sign_match.group(1).strip()
        sign_record = str(krcid) + "\t" + str(sign_content) + "\n"
        # "with" guarantees the lock is released even if the write raises.
        with sign_mu:
            sign_file.write(sign_record)
            sign_file.flush()

    lyricer_match = re.search(lyricer_pat, text)
    composer_match = re.search(comp_pat, text)
    # A record is written only when both the lyricist and the composer
    # were found in the page.
    if lyricer_match and composer_match:
        lyricer_man = lyricer_match.group(1).strip()
        com_man = composer_match.group(1).strip()
        record = "\t".join(
            [str(krcid), str(scid), str(lyricer_man), str(com_man)]) + "\n"
        with mu:
            composer_lyrics_file.write(record)
            composer_lyrics_file.flush()


def loop_worker():
    """Worker loop: process queued lines until a ``None`` sentinel arrives."""
    while True:
        line = wq.get()
        try:
            # Only None means "stop"; an empty input line must not be
            # mistaken for the sentinel.
            if line is None:
                break
            _process_line(line)
        finally:
            # Mark the item done only after it has really been handled.
            wq.task_done()


def start_workers():
    """Start ``MS`` daemon worker threads running ``loop_worker``."""
    for _ in range(MS):
        t = Thread(target=loop_worker)
        t.daemon = True
        t.start()
        _workers.append(t)


def stop_workers():
    """Send one stop sentinel per worker and wait for the workers to exit."""
    for _ in range(MS):
        wq.put(None)
    # Join instead of sleeping a fixed time: requests still in flight would
    # otherwise be lost when the daemon threads die with the interpreter.
    for t in _workers:
        t.join()
    del _workers[:]


def run():
    """Start the workers, enqueue every line of the input file, then stop."""
    print(datetime.datetime.now(), '===>begin')
    start_workers()
    try:
        with open('all_krcid_scid.txt', 'r') as fi:
            for line in fi:
                line = line.strip('\n')
                print(line)
                wq.put(line)
    finally:
        # Shut the workers down even if reading the input fails.
        stop_workers()
    print(datetime.datetime.now(), '===>end')


if __name__ == '__main__':
    time1 = time.time()
    try:
        run()
    finally:
        composer_lyrics_file.close()
        sign_file.close()
    time2 = time.time()
    print('cost=', (time2 - time1))

注意:程序执行过程中,requests 请求可能会出现连接失败的情况,因此需要对这类异常做处理(例如使用 try 语句捕获),以增强代码的健壮性。

在程序执行之后,如果是在windows的话,我们可以通过设置:
这里写图片描述
从而看到进程下的线程数。当然也可以采用tasklist命令查看。

原创粉丝点击