解题报告

来源:互联网 发布:软件管理器下载 编辑:程序博客网 时间:2024/05/17 01:27
 #!/bin/bash  #function: crawl the webpage, and get urls #data: 2015/5/19 #author: Aleda  #$1 is the website  function curlLinks() {         for ((i=0; i<63; i++))         do                 flag=`cat "pages/page$i" | grep -i 'rel="bookmark" title=' | awk '{print $2}' | awk -F '"' '{print $2}' >> links`         done }  function uniqLinks() {         flag=`sort -u links > newLinks` }  function delSpace() {         flag=`cat newLinks | sed '/^[[:space:]]*$/d' > tureLinks` }  function finalize() {         flag=`rm links`         flag=`rm newLinks` } pages="${1}/page"   #echo $pages  #exist file  #push pages into file  for ((i=1; i<63; i++)) do         flag=`curl -o "pages/page$i" "${pages}/$i/"` done  curlLinks  uniqLinks  delSpace  finalize



 1 #!/bin/bash  2   3 #function: crawl the www.bokra.net  4 #data: 2014/5/20  5 #author: Aleda  6   7 #create folders  8   9 function createFolder() 10 { 11         for ((i=1; i<=5; i++)) 12         do 13                 if [ -e "page$i" ]; then 14                         echo "page${i} exies!" 15                 else 16                         flag=`mkdir "page${i}"` 17                 fi 18         done 19 } 20  21 function crawlPages() 22 { 23         case ${1} in 24                 1) 25                         url="http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html" 26                         for ((i=1; i<=34; i++)) 27                         do 28                                 flag=`curl -o "page1/pages${i}" "${url}/${i}"` 29                         done 30                         #solve the web pages 31                         for ((i=1; i<=34; i++)) 32                         do 33                                 flag=`cat "page1/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links` 34                         done 35                         ;; 36                 2) 37                         url="http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html" 38                         for ((i=1; i<=3; i++)) 39                         do 40                                 flag=`curl -o "page2/pages${i}" "${url}/${i}"` 41                         done 42                         #solve the web pages 43                         for ((i=1; i<=3; i++)) 44                         do 45                                 flag=`cat "page2/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links` 46                         done 47                         ;; 48                 3) 49                         url="http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html" 50                         for ((i=1; i<=2; i++)) 51                         do 52                                 flag=`curl 
-o "page3/pages${i}" "${url}/${i}"` 53                         done 54                         #solve the web pages 55                         for ((i=1; i<=2; i++)) 56                         do 57                                 flag=`cat "page3/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links` 58                         done 59                         ;; 60                 4) 61                         url="http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html" 62                         for ((i=1; i<=15; i++)) 63                         do 64                                 flag=`curl -o "page4/pages${i}" "${url}/${i}"` 65                         done 66                         for ((i=1; i<=15; i++)) 67                         do 68                                 flag=`cat "page4/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links` 69                         done 70                         ;; 71                 5) 72                         url="http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html" 73                         flag=`cd page5` 74                         for ((i=1; i<=1; i++)) 75                         do 76                                 flag=`curl -o "page5/pages${i}" "${url}/${i}"` 77                         done 78                         for ((i=1; i<=1; i++)) 79                         do 80                                 flag=`cat "page5/pages${i}" | grep -i 'class="pic"' | awk '{print $3}' | awk -F '"' '{print $2}' >> links` 81                         done 82                         ;; 83                 *) 84                         echo "Usage: 1, 2, 3, 4, 5" 85                         exit 1 86         esac 87  88 } 89  90 function crawling() 91 { 92         if [ -e links ]; then 93                 flag=`cat /dev/null links` 94         else 95                 flag=`touch links` 96         fi 97         for ((ii=1; ii<=5; ii++)) 98         do 99          
       crawlPages ${ii}100 #               echo "${ii}..................................."101         done102 }103 104 function uniqLinks()105 {106         flag=`sort -u links > tureLinks`107         flag=`rm links`108 }109 110 #createFolder111 112 crawling113 114 uniqLinks

Update — revised version of the crawler, adding a second-level ("deep") crawl over the collected links:

#!/bin/bash
# function: crawl the www.bokra.net video categories, then follow every
#           collected link one level deeper and harvest the final video
#           links into ./Links
# date:     2014/5/20
# author:   Aleda

set -euo pipefail

# Create the per-category download folders page1..page5, skipping any
# that already exist.
createFolder() {
  local i
  for ((i = 1; i <= 5; i++)); do
    if [[ -e "page$i" ]]; then
      echo "page${i} exists!"
    else
      mkdir "page${i}"
    fi
  done
}

# Download one category and harvest its 'class="pic"' links into ./links.
#   $1 - target folder (page1..page5)
#   $2 - category base url
#   $3 - number of pages in the category
fetchCategory() {
  local dir=$1 url=$2 count=$3 i
  for ((i = 1; i <= count; i++)); do
    curl -o "${dir}/pages${i}" "${url}/${i}"
  done
  for ((i = 1; i <= count; i++)); do
    # '|| true': a page without matching links is not an error
    grep -i 'class="pic"' "${dir}/pages${i}" \
      | awk '{print $3}' \
      | awk -F '"' '{print $2}' >> links || true
  done
}

# Crawl one category selected by number (1-5).  The five original case
# arms only differed in URL and page count, so they share fetchCategory.
crawlPages() {
  case ${1} in
    1) fetchCategory page1 "http://www.bokra.net/VideoCategory/42/اm~Am~Dاm~E.html" 34 ;;
    2) fetchCategory page2 "http://www.bokra.net/VideoCategory/39/براm~Eج_تm~Dm~Aزm~Jm~Hm~Fm~Jة.html" 3 ;;
    3) fetchCategory page3 "http://www.bokra.net/VideoCategory/44/m~Eسرحm~Jات.html" 2 ;;
    4) fetchCategory page4 "http://www.bokra.net/VideoCategory/43/m~Eسm~Dسm~Dات.html" 15 ;;
    5) fetchCategory page5 "http://www.bokra.net/VideoCategory/113/بm~Cرا_TV.html" 1 ;;
    *)
      echo "Usage: 1, 2, 3, 4, 5" >&2
      exit 1
      ;;
  esac
}

# Crawl every category into a fresh ./links file.
crawling() {
  # ': > links' truncates an existing file and creates a missing one;
  # the original 'cat /dev/null links' never emptied the file.
  : > links
  local ii
  for ((ii = 1; ii <= 5; ii++)); do
    crawlPages "${ii}"
  done
}

# De-duplicate the first-level links into ./trueLinks.
uniqLinks() {
  # BUG FIX: the original wrote 'tureLinks' here while readLines and
  # finalize used 'trueLinks', so the deep crawl always failed on a
  # missing file.  Standardized on 'trueLinks'.
  sort -u links > trueLinks
  rm links
}

# Fetch every first-level link into deepPages/pages<N>.
readLines() {
  mkdir -p deepPages   # no-op when the folder already exists
  local i=1 line
  # IFS= and -r preserve the url byte-for-byte (no backslash mangling)
  while IFS= read -r line; do
    curl -A Googlebot -o "deepPages/pages${i}" "$line"
    i=$((i + 1))
  done < trueLinks
}

# Harvest the 'class="pic..."' links from every downloaded deep page.
getLinks() {
  local f
  # Iterate over whatever was actually downloaded instead of assuming a
  # hard-coded count of 2530 pages.  Glob order is lexicographic, which
  # is fine because deepCrawl sorts the result anyway.
  for f in deepPages/pages*; do
    [[ -e "$f" ]] || continue   # glob matched nothing
    grep -i '<div class="pic[t]*' "$f" \
      | awk '{print $3}' \
      | awk -F '"' '{print $2}' >> examLinks || true
  done
}

# Second-level crawl: fetch each link, extract, de-duplicate into ./Links.
deepCrawl() {
  readLines
  getLinks
  sort -u examLinks > Links
}

# Remove every intermediate file and folder.
finalize() {
  rm -rf deepPages   # plain 'rm' cannot remove a directory
  rm -f examLinks trueLinks
  rm -rf page[1-9]
}

# createFolder must run before crawling or 'curl -o pageN/...' fails on
# a fresh checkout (it was commented out in the original).
createFolder
crawling
uniqLinks
deepCrawl
#finalize


0 0
原创粉丝点击