解题报告
来源:互联网 发布:软件管理器下载 编辑:程序博客网 时间:2024/05/17 01:27
#!/bin/bash
#
# function: crawl the listing pages of a website and collect the URLs
# date:     2015/5/19
# author:   Aleda
#
# Usage:  pass the website as $1; listing pages are read from "$1/page/<n>/".
# Output: the unique, non-blank links are written to ./tureLinks.

set -u

readonly FIRST_PAGE=1      # first listing page that is downloaded and parsed
readonly LAST_PAGE=62      # last listing page that is downloaded and parsed
readonly PAGE_DIR="pages"  # folder that holds the downloaded listing pages

# Print a message on stderr and abort the script.
function die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Download listing pages FIRST_PAGE..LAST_PAGE into PAGE_DIR.
# Arguments: $1 - base URL of the listing ("<website>/page")
function fetchPages() {
  local base_url="$1"
  local i

  # curl -o does not create missing directories, so make sure it exists.
  mkdir -p -- "${PAGE_DIR}" || die "cannot create directory ${PAGE_DIR}"

  for ((i = FIRST_PAGE; i <= LAST_PAGE; i++)); do
    curl -o "${PAGE_DIR}/page${i}" "${base_url}/${i}/" \
      || printf 'warning: could not fetch %s\n' "${base_url}/${i}/" >&2
  done
}

# Collect the links of every downloaded page into ./links.
# For each line containing 'rel="bookmark" title=' the second
# whitespace-separated field is taken and the text between its first pair of
# double quotes is kept (presumably the href value; verify against the
# site's markup).
function curlLinks() {
  local i page

  # Start from an empty file so links of an earlier run do not leak in.
  : > links

  # Same range as the download loop: there is no page 0.
  for ((i = FIRST_PAGE; i <= LAST_PAGE; i++)); do
    page="${PAGE_DIR}/page${i}"
    if [[ ! -r "${page}" ]]; then
      printf 'warning: %s is missing, skipped\n' "${page}" >&2
      continue
    fi
    grep -i 'rel="bookmark" title=' "${page}" \
      | awk '{print $2}' \
      | awk -F '"' '{print $2}' >> links
  done
}

# Sort ./links and drop duplicates; the result goes to ./newLinks.
function uniqLinks() {
  sort -u links > newLinks
}

# Drop empty / whitespace-only lines from ./newLinks; the result goes to
# ./tureLinks (file name kept as in the original script).
function delSpace() {
  sed '/^[[:space:]]*$/d' newLinks > tureLinks
}

# Remove the intermediate files; ./tureLinks and the pages folder are kept.
function finalize() {
  rm -f -- links newLinks
}

function main() {
  [[ -n "${1:-}" ]] || die "Usage: ${0##*/} <website>"

  fetchPages "${1}/page"
  curlLinks
  uniqLinks
  delSpace
  finalize
}

main "$@"
#!/bin/bash
#
# function: crawl the www.bokra.net
# date:     2014/5/20
# author:   Aleda
#
# Downloads the listing pages of five video categories into page1..page5,
# extracts the quoted value from every 'class="pic"' line and writes the
# de-duplicated list to ./tureLinks.

set -u

# Create the download folders page1..page5, reporting those that already
# exist. Not called by default: crawlPages creates the folder it needs.
function createFolder() {
  local i
  for ((i = 1; i <= 5; i++)); do
    if [ -e "page${i}" ]; then
      echo "page${i} exies!"
    else
      mkdir "page${i}"
    fi
  done
}

# Download and parse one video category.
# Arguments: $1 - category number, 1..5
# Outputs:   pages are stored as page<$1>/pages<n>; links are appended to
#            ./links
# Exits the script with status 1 (after a usage message) for any other $1.
function crawlPages() {
  local url last_page dir i page

  # Category URL and number of listing pages. The Arabic path segments are
  # the category names (films, TV programmes, plays, series, Bokra TV).
  case "${1:-}" in
    1)
      url="http://www.bokra.net/VideoCategory/42/افلام.html"
      last_page=34
      ;;
    2)
      url="http://www.bokra.net/VideoCategory/39/برامج_تلفزيونية.html"
      last_page=3
      ;;
    3)
      url="http://www.bokra.net/VideoCategory/44/مسرحيات.html"
      last_page=2
      ;;
    4)
      url="http://www.bokra.net/VideoCategory/43/مسلسلات.html"
      last_page=15
      ;;
    5)
      url="http://www.bokra.net/VideoCategory/113/بكرا_TV.html"
      last_page=1
      ;;
    *)
      echo "Usage: 1, 2, 3, 4, 5"
      exit 1
      ;;
  esac

  dir="page${1}"
  # curl -o does not create missing directories.
  if ! mkdir -p -- "${dir}"; then
    printf 'cannot create directory %s\n' "${dir}" >&2
    return 1
  fi

  # Fetch every listing page of the category.
  for ((i = 1; i <= last_page; i++)); do
    curl -o "${dir}/pages${i}" "${url}/${i}" \
      || printf 'warning: could not fetch %s\n' "${url}/${i}" >&2
  done

  # Solve the web pages: third field of each 'class="pic"' line, then the
  # text between its first pair of double quotes.
  for ((i = 1; i <= last_page; i++)); do
    page="${dir}/pages${i}"
    if [[ ! -r "${page}" ]]; then
      printf 'warning: %s is missing, skipped\n' "${page}" >&2
      continue
    fi
    grep -i 'class="pic"' "${page}" \
      | awk '{print $3}' \
      | awk -F '"' '{print $2}' >> links
  done
}

# Crawl all five categories, starting from an empty ./links file.
function crawling() {
  local ii

  # Creates the file when missing and truncates it otherwise.
  : > links

  for ((ii = 1; ii <= 5; ii++)); do
    crawlPages "${ii}"
  done
}

# Write the sorted, de-duplicated links to ./tureLinks and drop ./links.
function uniqLinks() {
  # Only remove the raw list once the sorted copy has been written.
  sort -u links > tureLinks && rm -f -- links
}

function main() {
  crawling
  uniqLinks
}

main "$@"
update:
#!/bin/bash
#
# function: crawl the www.bokra.net
# date:     2014/5/20
# author:   Aleda
#
# Stage 1: download the listing pages of five video categories into
#          page1..page5 and collect their links in ./trueLinks.
# Stage 2: download every URL of ./trueLinks into deepPages/ and collect the
#          links found there in ./Links.

set -u

# Create the download folders page1..page5, reporting those that already
# exist. Not called by default: crawlPages creates the folder it needs.
function createFolder() {
  local i
  for ((i = 1; i <= 5; i++)); do
    if [ -e "page${i}" ]; then
      echo "page${i} exies!"
    else
      mkdir "page${i}"
    fi
  done
}

# Download and parse one video category.
# Arguments: $1 - category number, 1..5
# Outputs:   pages are stored as page<$1>/pages<n>; links are appended to
#            ./links
# Exits the script with status 1 (after a usage message) for any other $1.
function crawlPages() {
  local url last_page dir i page

  # Category URL and number of listing pages. The Arabic path segments are
  # the category names (films, TV programmes, plays, series, Bokra TV).
  case "${1:-}" in
    1)
      url="http://www.bokra.net/VideoCategory/42/افلام.html"
      last_page=34
      ;;
    2)
      url="http://www.bokra.net/VideoCategory/39/برامج_تلفزيونية.html"
      last_page=3
      ;;
    3)
      url="http://www.bokra.net/VideoCategory/44/مسرحيات.html"
      last_page=2
      ;;
    4)
      url="http://www.bokra.net/VideoCategory/43/مسلسلات.html"
      last_page=15
      ;;
    5)
      url="http://www.bokra.net/VideoCategory/113/بكرا_TV.html"
      last_page=1
      ;;
    *)
      echo "Usage: 1, 2, 3, 4, 5"
      exit 1
      ;;
  esac

  dir="page${1}"
  # curl -o does not create missing directories.
  if ! mkdir -p -- "${dir}"; then
    printf 'cannot create directory %s\n' "${dir}" >&2
    return 1
  fi

  # Fetch every listing page of the category.
  for ((i = 1; i <= last_page; i++)); do
    curl -o "${dir}/pages${i}" "${url}/${i}" \
      || printf 'warning: could not fetch %s\n' "${url}/${i}" >&2
  done

  # Solve the web pages: third field of each 'class="pic"' line, then the
  # text between its first pair of double quotes.
  for ((i = 1; i <= last_page; i++)); do
    page="${dir}/pages${i}"
    if [[ ! -r "${page}" ]]; then
      printf 'warning: %s is missing, skipped\n' "${page}" >&2
      continue
    fi
    grep -i 'class="pic"' "${page}" \
      | awk '{print $3}' \
      | awk -F '"' '{print $2}' >> links
  done
}

# Crawl all five categories, starting from an empty ./links file.
function crawling() {
  local ii

  # Creates the file when missing and truncates it otherwise.
  : > links

  for ((ii = 1; ii <= 5; ii++)); do
    crawlPages "${ii}"
  done
}

# Write the sorted, de-duplicated links to ./trueLinks (the file readLines
# consumes) and drop ./links.
function uniqLinks() {
  # Only remove the raw list once the sorted copy has been written.
  sort -u links > trueLinks && rm -f -- links
}

# Download every URL listed in ./trueLinks to deepPages/pages<n>, numbering
# the downloads from 1. Returns 1 when the folder or the list is unavailable.
function readLines() {
  local line
  local i=1

  if [ -e deepPages ]; then
    echo "File exists!"
  else
    mkdir deepPages || return 1
  fi

  if [[ ! -r trueLinks ]]; then
    printf 'trueLinks is missing, nothing to download\n' >&2
    return 1
  fi

  # -r keeps backslashes literal; the second test handles a last line
  # without a trailing newline.
  while read -r line || [[ -n "${line}" ]]; do
    # The extraction pipeline can emit empty lines; they are not URLs.
    [[ -n "${line}" ]] || continue
    curl -A Googlebot -o "deepPages/pages${i}" "${line}" \
      || printf 'warning: could not fetch %s\n' "${line}" >&2
    i=$((i + 1))
  done < trueLinks
}

# Extract the links of every downloaded deep page into ./examLinks: third
# field of each matching line, then the text between its first pair of
# double quotes.
function getLinks() {
  local page

  # Start from an empty file so links of an earlier run do not leak in.
  : > examLinks

  # Iterate over what was actually downloaded instead of a fixed count.
  for page in deepPages/pages*; do
    [[ -f "${page}" ]] || continue   # also covers the unmatched-glob case
    grep -i '<div class="pic[t]*' "${page}" \
      | awk '{print $3}' \
      | awk -F '"' '{print $2}' >> examLinks
  done
}

# Run the second crawl stage and write the de-duplicated result to ./Links.
function deepCrawl() {
  readLines
  getLinks
  sort -u examLinks > Links
}

# Remove all intermediate files and folders; ./Links is kept.
# Not called by default so the downloads stay available for inspection.
function finalize() {
  rm -rf -- deepPages
  rm -f -- examLinks trueLinks
  rm -rf -- page[1-9]
}

function main() {
  crawling
  uniqLinks
  deepCrawl
  # finalize
}

main "$@"
0 0
- 解题报告
- 解题报告
- 解题报告
- 解题报告
- 解题报告
- 解题报告
- 解题报告
- Antiprime解题报告
- expr解题报告
- 华容道解题报告
- tju解题报告
- zju1062/pku1095解题报告
- UsacoGate解题报告 --- 序曲
- ZJU 2060 解题报告
- ZJU 1331 解题报告
- ZJU 1115 解题报告
- ZJU1057解题报告
- ZJU1146解题报告
- Tinyxml封装类COperatorXml
- 送给将要找工作的自己
- shell私房菜part2
- NYOJ-16-矩形嵌套
- 用tornado ,Supervisord ,nginx架网站
- 解题报告
- 5 Best Books for Every IT Leader 2014
- 【进程管理】内核中的互斥操作
- expdp从高版本导入到低版本
- leetcode学习笔记:Combinations & Subsets
- cocos2d-x 动画加速与减速
- 关于Jsp页面上传图片和缩略图查看
- Manacher算法:最长回文子串O(n)
- D-Bus学习(二):基本概念