多线程快速抓取网页

来源:互联网 发布:ccn是什么网络 编辑:程序博客网 时间:2024/05/02 04:43
一段简单的代码,用于抓取维基百科数据,也是一个简单的多线程编程例子:占用内存很少,线程数调大后抓取效率很高。
# Multi-threaded fetcher for zh.wikipedia.org pages.
#
# Usage: python SCRIPT INPUT_FILE OUTPUT_PREFIX NUM_THREADS
#
# INPUT_FILE holds one page title per line.  Worker ``i`` handles the lines
# whose zero-based index is congruent to ``i`` modulo NUM_THREADS and writes
# its results to ``OUTPUT_PREFIX.part<i>``.
#
# Written to run unchanged on Python 2.7 and Python 3: files are handled as
# raw bytes, and only ``threading``/``subprocess`` are used (not the
# Python 2-only ``thread``/``commands`` modules).
import os
import shutil
import subprocess
import sys
import threading

# Number of workers that have returned, successfully or not.
# Always modified while holding ``mutex``.
finish_num = 0
mutex = threading.Lock()

# The URL is passed to wget without a scheme, exactly as the header line
# written to the output file.
_URL_PREFIX = b"zh.wikipedia.org/zh-hans/"
# Scratch directory for the per-worker download file.
_TMP_DIR = "fetch_wiki"


def _ensure_tmp_dir():
    """Create the scratch directory if it does not exist yet."""
    try:
        os.makedirs(_TMP_DIR)
    except OSError:
        # Another worker may have created it between the check and the call;
        # only re-raise when the directory is really unavailable.
        if not os.path.isdir(_TMP_DIR):
            raise


def _append_page(query, tmp_path, fout):
    """Download the page for *query* and append ``URL\\n<page bytes>`` to *fout*.

    *query* is the raw (bytes) title taken from the input file.  The page is
    downloaded by ``wget`` into *tmp_path*, copied to *fout* and the scratch
    file is removed afterwards.  Raises ``OSError``/``IOError`` when ``wget``
    cannot be started or the downloaded file cannot be read.
    """
    _ensure_tmp_dir()
    url = _URL_PREFIX + query
    try:
        # An argument list (no shell) means characters such as quotes, `$` or
        # backticks in the title are never interpreted as shell syntax.
        with open(os.devnull, "wb") as devnull:
            status = subprocess.call(
                ["wget", url, "-O", tmp_path], stdout=devnull, stderr=devnull
            )
        if status != 0:
            sys.stderr.write("wget exited with status %d for %r\n" % (status, query))
        # Open the download first so that nothing is written to the output
        # when the scratch file is missing.
        with open(tmp_path, "rb") as tmp_fin:
            fout.write(url + b"\n")
            # Stream the copy instead of loading the whole page into memory.
            shutil.copyfileobj(tmp_fin, fout)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was downloaded, so there is nothing to clean up


def extract_qid(id, num_of_thread):
    """Fetch the pages assigned to worker *id* out of *num_of_thread* workers.

    Reads titles from ``sys.argv[1]`` and writes to
    ``sys.argv[2] + ".part" + str(id)``.  A line is handled by this worker
    when ``line_index % num_of_thread == id``.  Blank lines are skipped (they
    still count towards the line index).  A failure on a single title is
    reported on stderr and the worker moves on to the next title.

    Returns ``True`` when the input was processed to the end, ``False`` when
    a fatal error occurred (the error is printed).  In both cases the global
    ``finish_num`` is incremented under ``mutex`` before returning.
    """
    global finish_num
    tmp_path = os.path.join(_TMP_DIR, "tmp_search_" + str(id))
    try:
        with open(sys.argv[1], "rb") as fin, \
                open(sys.argv[2] + ".part" + str(id), "wb") as fout:
            for index, line in enumerate(fin):
                if index % num_of_thread != id:
                    continue
                query = line.strip()
                if not query:
                    continue
                try:
                    _append_page(query, tmp_path, fout)
                except (IOError, OSError) as err:
                    sys.stderr.write(
                        "worker %s: skipping %r: %s\n" % (id, query, err)
                    )
        return True
    except Exception as e:  # worker boundary: report instead of dying silently
        print(e)
        return False
    finally:
        # Count the worker as finished on every exit path so that anything
        # waiting on ``finish_num`` cannot block forever after a failure.
        with mutex:
            finish_num += 1


def _main(argv):
    """Validate the command line, run the workers and wait for all of them."""
    usage = "usage: %s INPUT_FILE OUTPUT_PREFIX NUM_THREADS\n" % argv[0]
    if len(argv) < 4:
        sys.stderr.write(usage)
        return 2
    try:
        num_threads = int(argv[3])
    except ValueError:
        sys.stderr.write(usage)
        return 2
    if num_threads < 1:
        sys.stderr.write("NUM_THREADS must be a positive integer\n")
        return 2

    workers = [
        threading.Thread(target=extract_qid, args=(i, num_threads))
        for i in range(num_threads)
    ]
    for worker in workers:
        worker.start()
    # join() blocks until each worker returns; no polling loop is needed.
    for worker in workers:
        worker.join()
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))