python 扒取网站

来源:互联网 发布:python多进程读取文件 编辑:程序博客网 时间:2024/05/21 05:39
# coding=utf-8
"""Mirror one demo page and the static assets it references into ``get_demo``.

Running the script fetches ``get_path``, downloads every stylesheet, script,
``.webm`` source, ``<img>`` source and ``<video>`` poster found in the page
(plus ``url(../...)`` references inside downloaded stylesheets), and writes
the rewritten page to ``<get_demo>/<get_index>.html``.

The code is written to run unchanged on Python 2.7 and Python 3: page and
stylesheet bodies are handled as bytes, URLs as native strings.
"""
import os
import re
import urllib  # no longer called below; kept so the file's import set is unchanged

import requests

get_path = "http://www.jsdaima.com/Upload/201703/1490074013/index.html"
get_path_path = "http://www.jsdaima.com/Upload/201703/1490074013/"
get_index = "index"
get_demo = "./jsdaima"

# Browser-like User-Agent sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# ``url(../something)`` references inside a stylesheet, optionally quoted.
_CSS_URL = re.compile(br'url\(\s*["\']?\.\./(.*?)["\']?\s*\)', re.S)

# Asset references extracted from the page, in the original extraction order.
_ASSET_PATTERNS = (
    br'href="(.*?\.css)"',
    br'src="(.*?\.js)">',
    br'src="(.*?\.webm)">',
    br'<img.*?src="(.*?)"',
    br'<video.*?poster="(.*?)"',
)


def _site_root(url):
    """Return the ``scheme://host/`` prefix of *url*, or None if it has none."""
    match = re.match(r'(https?://.*?/)', url)
    return match.group(1) if match else None


def _resolve_ref(ref, site_root):
    """Turn a reference found in the page into a downloadable URL.

    Returns None for references that cannot be downloaded (empty strings and
    inline ``data:`` URIs).
    """
    ref = ref.strip()
    if not ref or ref.startswith("data:"):
        return None
    if ref.startswith("//"):
        # Protocol-relative reference: reuse the scheme of the mirrored page.
        return get_path.split("//", 1)[0] + ref
    if "//" in ref or "+" in ref:
        # Already absolute (or otherwise special): passed through unchanged,
        # exactly as the original script did.
        return ref
    if ref.startswith("/") and site_root:
        # Root-absolute reference: it lives at the site root, which is also
        # where the ``="/`` -> ``="./`` rewrite in getHtml expects it locally.
        return site_root + ref.lstrip("/")
    return get_path_path + ref


def download(url):
    """Download *url* into ``get_demo`` unless a local copy already exists.

    The local path is ``get_demo`` plus the URL with its ``scheme://host/``
    prefix removed. Requests that fail or return an error status are reported
    and skipped, so no error page is stored as if it were the asset. When the
    URL ends in ``.css`` the stylesheet's ``url(../...)`` references are
    downloaded as well, resolved against ``get_path_path``.
    """
    out = get_demo + "/" + re.sub(r'^https?://.*?/', '', url)
    if os.path.exists(out):
        return

    print("download:" + url)
    try:
        response = requests.get(url, headers=_HEADERS)
    except requests.RequestException as err:
        print("skip:" + url + " (" + str(err) + ")")
        return
    if not response.ok:
        print("skip:" + url + " (HTTP " + str(response.status_code) + ")")
        return

    # Create the target directory only once there is something to store.
    out_dir = os.path.dirname(out)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # Write and close the file before recursing, so a failure further down
    # can never leave an empty placeholder that blocks a later retry.
    with open(out, 'wb') as fp:
        fp.write(response.content)

    if url.endswith(".css"):
        for raw in _CSS_URL.findall(response.content):
            img = raw.decode("utf-8", "replace").strip()
            if img:
                print("img:" + img)
                download(get_path_path + img)


def getHtml(url):
    """Fetch the page at *url*, download its assets and return the page body.

    The returned value is the raw page bytes with every ``="/`` rewritten to
    ``="./`` so root-absolute references become relative to the mirror root.
    """
    html = requests.get(url, headers=_HEADERS).content
    site_root = _site_root(url)

    refs = []
    for pattern in _ASSET_PATTERNS:
        refs.extend(re.findall(pattern, html))

    for raw in refs:
        target = _resolve_ref(raw.decode("utf-8", "replace"), site_root)
        if target is not None:
            download(target)

    return html.replace(b'="/', b'="./')


def saveHtml(file_name, file_content):
    """Write *file_content* to ``<get_demo>/<file_name>.html``.

    Occurrences of the site root of ``get_path`` inside the content are
    replaced by ``./``. *file_content* may be bytes or text; text is encoded
    as UTF-8 because the file is written in binary mode.
    """
    if not isinstance(file_content, bytes):
        file_content = file_content.encode("utf-8")

    site_root = _site_root(get_path)
    if site_root:
        file_content = file_content.replace(site_root.encode("utf-8"), b'./')

    path = get_demo
    if not os.path.isdir(path):
        os.makedirs(path)
    # '/' is not allowed in file names (notably on Windows), so replace it.
    with open(path + "/" + file_name.replace('/', '_') + ".html", "wb") as f:
        f.write(file_content)


html = getHtml(get_path)
saveHtml(get_index, html)
# To fetch a single asset instead:
# download("http://www.jsdaima.com/Upload/201703/1490074013/css/style.css")
print("结束")