通过网页抓取github仓库的部分文件

来源:互联网 发布:灯光矩阵什么意思 编辑:程序博客网 时间:2024/06/06 18:08

有些 GitHub 仓库非常大,如果网络条件不好、只想下载其中的部分文件,用 git 无法直接实现,通常必须 clone 整个仓库。所以编写了这个脚本,通过抓取 GitHub 网页来提取仓库中指定目录下的文件。

点击打开github

# Download part of a GitHub repository by scraping its web pages.
#
# The directory listing pages on github.com are parsed to discover files and
# sub-directories; each file is then fetched from raw.githubusercontent.com
# and written below ``outputpath``.

import os

import requests
from bs4 import BeautifulSoup

# Repository to fetch, written as "owner/name". Change it with setrepname().
repname = "mahongquan/github-web-file-download"
# Base URL that getpath() prefixes to a sub-path to request its listing page.
# Kept consistent with what setrepname() computes.
reppath = "https://github.com/" + repname + "/tree/master/"
# Local directory that downloaded files are written under.
outputpath = "."

# Seconds to wait for each HTTP request; without a timeout requests.get()
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _repo_relative(href):
    """Return the part of a listing href that follows its first five segments.

    Assumes hrefs shaped like "/owner/repo/<blob|tree>/<branch>/a/b", which
    would yield "a/b" -- TODO confirm against the current GitHub markup.
    """
    return "/".join(href.split("/")[5:])


def getfile(pathf):
    """Download one repository file and save it below ``outputpath``.

    Args:
        pathf: "/"-separated path of the file inside the repository.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never saved as if it were the file.
    """
    print("get file:" + pathf)
    raw_base = "https://raw.githubusercontent.com/" + repname + "/master/"
    print(raw_base)
    res = requests.get(raw_base + pathf, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    parts = pathf.split("/")
    target_dir = os.path.join(outputpath, *parts[:-1])
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # The context manager closes the file even if the write fails.
    with open(os.path.join(target_dir, parts[-1]), "wb") as out:
        out.write(res.content)


def getpath(path):
    """Recursively download every file listed under a repository directory.

    Args:
        path: directory inside the repository, or "" for the repository root.

    Raises:
        requests.HTTPError: if the listing page cannot be fetched.
        IndexError: if the page contains no ``<table>`` to read entries from.
    """
    print("getpath:" + path)
    if path == "":
        url = "https://github.com/" + repname
    else:
        url = reppath + path
        print(url)
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(res.content, "html.parser")
    tables = soup.find_all("table")
    if not tables:
        raise IndexError("no file table found at " + url)
    # html.parser does not synthesize <tbody>; fall back to the table itself.
    body = tables[0].tbody
    if body is None:
        body = tables[0]

    files = []
    subdirs = []
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue  # header or spacer row without an icon and a name cell
        icon = cells[0].svg
        if icon is None:
            continue
        classes = icon.get("class")
        print("class=" + str(classes))
        if not classes or "octicon-alert" in classes:
            continue
        link = cells[1].a
        if link is None:
            continue
        href = link["href"]
        if "octicon-file-directory" in classes:
            print("ispath")
            child = _repo_relative(href)
            print(child)
            subdirs.append(child)
        else:
            print("is file")
            files.append(href)

    # Fetch this directory's files first, then descend into sub-directories.
    for href in files:
        print(href)
        getfile(_repo_relative(href))
    for child in subdirs:
        getpath(child)


def setrepname(nm):
    """Select the repository to download and derive the related settings.

    Args:
        nm: repository written as "owner/name". Files are saved into a local
            directory called ``name``.
    """
    global repname
    global reppath
    global outputpath
    repname = nm
    outputpath = nm.split("/")[1]
    reppath = "https://github.com/" + repname + "/tree/master/"


def main():
    """Example run: download the "examples" directory of facebook/flux."""
    setrepname("facebook/flux")
    getpath("examples")
    # Call getpath() with another directory (or "" for the root) as needed.


if __name__ == "__main__":
    main()


1 0