Linux AWK 实现网络下载

来源:互联网 发布:软件招标书模板 编辑:程序博客网 时间:2024/05/09 04:03
  awk是一种优秀的文本处理工具,用它来处理文本中的数据非常方便。我们现在用的绝大部分是gawk,也就是gnu awk,gnu的软件一贯表现不错,跟其他AWK的实现版本比起来,gawk添加了对网络的支持,比如我可以用awk模拟浏览器发送http请求给服务器,然后用正则表达式过滤网页内容,例如这里是一个awk和sed搭配获取五大联赛计分表的shell程序。

gawk编程最权威的资料在其info帮助文件里,这份帮助资料值得称道的并不是它全面的reference,而是里面包含了大量awk应用的实例。虽然用gawk进行网络编程有点类似所谓的奇技淫巧,但是相比用c来完成同样的工作,awk还是颇具生产力的。

下面这个程序是用来获取youku视频的,程序运行起来是这个样子:

程序的基本原理是用gawk发送http请求,获取服务器返回的信息,然后根据这些信息进行一些处理后重新发送,经过三次请求,youku会发送真正的flv地址,根据这个真实地址就可以下载了,由于gawk在I/O这方面功能很弱,所以我在gawk中通过system()调用curl来完成这最后一步的下载。

这个程序可以在命令行下如此调用:

gawk -f get_youku.awk youku.txt

其中youku.txt里每行写一个视频所在的网页地址和下载回来后要保存的名称,两者用空格分隔,其格式可以这样(地址和名称仅为示例):

http://v.youku.com/v_show/id_XXXXXXXX.html myvideo

csdn blog的代码模板没有awk,代码有300多行,下面是代码,可能有点乱,有兴趣仔细研究的可以留下邮箱索取源代码。

  1. #! /usr/bin/gawk -f


  2. ################################################################################
  3. #
  4. #优酷视频下载器
  5. #
  6. #Author: hailongchang@163.com
  7. #
  8. #Date: 11/15/2008
  9. #
  10. ################################################################################
  # Main rule: every line of the list file holds a page URL ($1) and the base
  # name to save the video under ($2).
  {
      # Blank or incomplete lines would only trigger pointless HTTP requests.
      if (NF < 2) {
          if (NF == 1)
              print "skipping line " FNR ": no output name given" > "/dev/stderr";
          next;
      }

      adr = $1;
      fn = $2;

      # The HTTP helpers change RS while talking to the server. Put it back
      # afterwards so the next line of the list file is still split on the
      # separator the program started with.
      saved_rs = RS;
      download_video(adr, fn);
      RS = saved_rs;
  }

  ################################################################################
  # download_video(url, filename)
  #   Download every clip of one video.
  #     url      - address of the web page that shows the video
  #     filename - base name for the saved file(s); ".flv" is appended, and
  #                "_<n>" is inserted first when the video has several clips
  #   Reads the global video_info[] filled in by Get_Info(). The names after
  #   the gap in the parameter list are locals.
  ################################################################################
  function download_video(url, filename,    clip, count, target, real_url, rule)
  {
      rule = "========================================================================================";

      Get_Info(Get_Vid(url));
      print rule;

      count = video_info["clipcn"] + 0;
      for (clip = 1; clip <= count; clip++) {
          # Build the name from the base on every pass; appending to
          # `filename` itself would pile up suffixes ("a_1.flv_2.flv").
          target = filename;
          if (count > 1)
              target = target "_" clip;
          target = target ".flv";

          print "正在为您下载 : " target;
          print "";

          real_url = Identify_video(video_info["url_" clip]);
          if (real_url == "") {
              print "no download address returned for " target > "/dev/stderr";
          } else {
              # Flush our own output so it appears before curl's.
              fflush();
              # Quote both operands: the URL contains '?' and may contain '&',
              # and the file name comes straight from the list file.
              system("curl " _dv_shell_quote(real_url) " > " _dv_shell_quote(target));
          }

          print "";
          print rule;
      }
  }

  # Wrap text in single quotes for /bin/sh, escaping embedded single quotes.
  function _dv_shell_quote(text)
  {
      gsub(/'/, "'\\''", text);
      return "'" text "'";
  }

  ################################################################################
  # Get_url(web_url)
  #   Reduce a page address read from the list file (youku.txt) to the request
  #   path by removing the "http://" scheme and the "v.youku.com" host name.
  #   Returns the remaining text, e.g. "/v_show/id_XXXX.html".
  ################################################################################
  function Get_url(web_url)
  {
      # The slashes and dots must be escaped inside the regex literals.
      gsub(/http:\/\//, "", web_url);
      gsub(/v\.youku\.com/, "", web_url);
      return web_url;
  }
  ################################################################################
  # Get_Vid(web_url)
  #   Fetch the video page from v.youku.com and return the numeric id found in
  #   its "videoId = '<digits>'" assignment, or "" when the page has none.
  #   Side effect: RS is set to CRLF and left that way on return, because the
  #   response is read record by record with that separator.
  #   The names after the gap in the parameter list are locals.
  ################################################################################
  function Get_Vid(web_url,    path, sock, req, line, hit, video_id)
  {
      RS = "\r\n";

      path = Get_url(web_url);
      sock = "/inet/tcp/0/v.youku.com/80";

      # Every header line ends in CRLF; an empty line ends the request.
      req = "GET " path " HTTP/1.1\r\n";
      req = req "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*\r\n";
      req = req "Accept-Language: zh-cn\r\n";
      req = req "UA-CPU: x86\r\n";
      # Ask for an uncompressed body: it is scanned as plain text below.
      req = req "Accept-Encoding: identity\r\n";
      req = req "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT5.1; .NET CLR 1.1.4322)\r\n";
      # Without this an HTTP/1.1 server keeps the socket open and the read
      # loop blocks until the server times out.
      req = req "Connection: close\r\n";
      req = req "Host: v.youku.com\r\n\r\n";

      # printf, not print: print would append ORS after the terminator.
      printf "%s", req |& sock;

      video_id = "";
      # Read into a variable so the caller's $0 and fields stay intact.
      while ((sock |& getline line) > 0) {
          if (match(line, /videoId = '([0-9]+)'/, hit)) {
              video_id = hit[1];
              break;
          }
      }

      close(sock);
      return video_id;
  }
  ################################################################################
  # Get_key(item)
  #   Return the value part of a  "name":"value"  item sent by the server:
  #   the text between the first and second ':' with all double quotes removed.
  #   Uses the global scratch array item_info[].
  ################################################################################
  function Get_key(item)
  {
      split(item, item_info, ":");

      # Strip the quotes in place, then hand back the cleaned value.
      gsub(/"/, "", item_info[2]);

      return item_info[2];
  }
  ################################################################################
  # Get_size(item)
  #   Return the video size: the third ':'-separated piece of the item with
  #   every double quote and every '}' removed.
  #   Uses the global scratch array item_info[].
  ################################################################################
  function Get_size(item)
  {
      split(item, item_info, ":");

      # The two clean-ups are independent of each other.
      gsub(/}/, "", item_info[3]);
      gsub(/"/, "", item_info[3]);

      return item_info[3];
  }
  ################################################################################
  # Get_seed(item)
  #   Return the seed: the text between the first and second ':' of the item,
  #   unmodified. Uses the global scratch array item_info[].
  ################################################################################
  function Get_seed(item)
  {
      split(item, item_info, ":");
      return item_info[2];
  }

  ################################################################################
  # Genrate_rand()
  #   Linear congruential generator: advances the global `seed` with
  #   seed = (seed * 211 + 30031) mod 65536 and returns seed / 65536, a value
  #   in [0, 1). The result is also stored in the global `num`.
  ################################################################################
  function Genrate_rand()
  {
      seed = (211 * seed + 30031) % 65536;
      return num = seed / 65536;
  }

  ################################################################################
  # convert_fileid(fileid)
  #   Decode the '*'-separated list of indices in `fileid` into the real file
  #   id. The 68-character alphabet is shuffled with the generator
  #   seed = (seed * 211 + 30031) mod 65536, started from video_info["seed"];
  #   each index then selects one character of the shuffled alphabet (index 0
  #   is its first character). Returns the decoded string.
  #   The names after the gap in the parameter list are locals, so the
  #   caller's loop counter and the global `seed` are not disturbed.
  ################################################################################
  function convert_fileid(fileid,    parts, total, count, alphabet, shuffled, state, num, pick, k, decoded)
  {
      # Count the leading non-empty pieces; a trailing '*' leaves an empty one.
      total = split(fileid, parts, "*");
      count = 0;
      while (count < total && parts[count + 1] != "")
          count++;

      # The six punctuation characters are  / \ : . _ -
      alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890";
      state = video_info["seed"];

      # Draw characters one at a time until the alphabet is used up.
      shuffled = "";
      while (alphabet != "") {
          state = (state * 211 + 30031) % 65536;
          num = state / 65536;
          pick = int(length(alphabet) * num) + 1;
          shuffled = shuffled substr(alphabet, pick, 1);
          # Cut the drawn character out by position. Splitting on the
          # character itself is fragile when it is '\' or '.'.
          alphabet = substr(alphabet, 1, pick - 1) substr(alphabet, pick + 1);
      }

      decoded = "";
      for (k = 1; k <= count; k++)
          decoded = decoded substr(shuffled, parts[k] + 1, 1);

      return decoded;
  }

  ################################################################################
  # Get_fileid(item)
  #   Return the encoded file id: the text between the first and second ':' of
  #   the item with all double quotes removed.
  #   Uses the global scratch array item_info[] and also splits the result on
  #   '*' into the global array fileid[].
  ################################################################################
  function Get_fileid(item)
  {
      split(item, item_info, ":");
      gsub(/"/, "", item_info[2]);

      # Keep the per-index pieces available in fileid[] as well.
      split(item_info[2], fileid, "*");

      return item_info[2];
  }

  ################################################################################
  # hex_convention(ch)
  #   Convert one hexadecimal digit character to its numeric value.
  #   Letters are accepted in either case ("a" and "A" both give 10).
  #   Anything that is not a single hex letter is returned as ch + 0, so
  #   decimal digits give their value and unknown characters give 0.
  #   `letter` is a local.
  ################################################################################
  function hex_convention(ch,    letter)
  {
      letter = index("abcdef", tolower(ch));
      if (length(ch) == 1 && letter > 0)
          return 9 + letter;
      return ch + 0;
  }

  ################################################################################
  # HexStr_int(str)
  #   Convert a string of hexadecimal digits (either case, with or without a
  #   leading "0x") to its numeric value. An empty string gives 0.
  #   Conversion stops at the first character that is not a hex digit.
  #   Uses gawk's strtonum(), so no loop counter is shared with the caller.
  ################################################################################
  function HexStr_int(str)
  {
      sub(/^0[xX]/, "", str);
      if (str == "")
          return 0;
      return strtonum("0x" str);
  }

  ################################################################################
  # Get_Info(video_id)
  #   Ask v.youku.com for the play list of `video_id` and fill the global
  #   video_info[] with:
  #     "seed", "size", "fileid", "key1", "key2" - values taken from the reply
  #       ("key1" is replaced by hex(key1 XOR 0xA55AA5A5))
  #     "clipcn"                                 - number of clips
  #     "url_1" .. "url_<clipcn>"                - one getFlvPath address per clip
  #   video_info[] is emptied first, so nothing survives from an earlier video.
  #   With an empty id, or a reply without a file id, "clipcn" stays 0.
  #   The names after the gap in the parameter list are locals.
  ################################################################################
  function Get_Info(video_id,    path, sock, req, saved_rs, line, fields, nf, k, file_id, key1, clips, seg, base, tail)
  {
      delete video_info;
      video_info["clipcn"] = 0;
      if (video_id == "")
          return;

      path = "/player/getPlayList/VideoIDS/" video_id "/version/v1.0.0312/source/video/password//Type/flv";
      sock = "/inet/tcp/0/v.youku.com/80";

      # Every header line ends in CRLF; an empty line ends the request.
      req = "GET " path " HTTP/1.1\r\n";
      req = req "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*\r\n";
      req = req "Accept-Language: zh-cn\r\n";
      req = req "UA-CPU: x86\r\n";
      # Ask for an uncompressed body: it is scanned as plain text below.
      req = req "Accept-Encoding: identity\r\n";
      req = req "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT5.1; .NET CLR 1.1.4322)\r\n";
      # Make the server close the socket so the read loop terminates.
      req = req "Connection: close\r\n";
      req = req "Host: v.youku.com\r\n\r\n";

      # Read the reply as CRLF-separated records, whatever RS was on entry.
      saved_rs = RS;
      RS = "\r\n";
      printf "%s", req |& sock;

      # Scan the comma-separated items of every record; a later occurrence
      # of an item overrides an earlier one.
      while ((sock |& getline line) > 0) {
          nf = split(line, fields, ",");
          for (k = 1; k <= nf; k++) {
              if (fields[k] ~ /"seed"/)
                  video_info["seed"] = Get_seed(fields[k]);
              if (fields[k] ~ /"streamsizes"/)
                  video_info["size"] = Get_size(fields[k]);
              if (fields[k] ~ /"fileid"/)
                  video_info["fileid"] = Get_fileid(fields[k]);
              if (fields[k] ~ /"key1"/)
                  video_info["key1"] = Get_key(fields[k]);
              if (fields[k] ~ /"key2"/)
                  video_info["key2"] = Get_key(fields[k]);
          }
      }
      close(sock);
      RS = saved_rs;

      # Debugging aid:
      # printf("seed = %s size = %s fileid = %s key1 = %s key2 = %s\n", video_info["seed"], video_info["size"], video_info["fileid"], video_info["key1"], video_info["key2"]);

      if (video_info["fileid"] == "")
          return;

      file_id = convert_fileid(video_info["fileid"]);

      key1 = HexStr_int(video_info["key1"]);
      video_info["key1"] = sprintf("%x", xor(key1, 0xA55AA5A5));

      # Characters 7-8 of the decoded id are read as the clip count.
      clips = int(substr(file_id, 7, 2));
      video_info["clipcn"] = clips;

      base = "http://f.youku.com/player/getFlvPath/sid/00_00/st/flv/fileid/";
      tail = "?K=" video_info["key2"] video_info["key1"];

      if (clips == 1) {
          video_info["url_1"] = base file_id tail;
      } else {
          for (k = 1; k <= clips; k++) {
              # Characters 9-10 of the id are replaced by the zero-based clip
              # number. For clips 1-10 this is "00".."09".
              # NOTE(review): for more than ten clips the number is written
              # as two upper-case hex digits, which is what other Youku
              # clients appear to do. Confirm against a video with > 10 clips.
              seg = sprintf("%02X", k - 1);
              video_info["url_" k] = base substr(file_id, 1, 8) seg substr(file_id, 11) tail;
          }
      }
      return;
  }


  ################################################################################
  # Identify_video(req)
  #   Send the last HTTP request. `req` is a getFlvPath address on f.youku.com;
  #   the server's reply carries the real address of the video.
  #   Returns that address, or "" when the reply contains no "http://" text.
  #   The address in a "Location:" header is preferred; otherwise the last
  #   reply line containing "http://" is used, from that text to the line end.
  #   The names after the gap in the parameter list are locals.
  ################################################################################
  function Identify_video(req,    sock, msg, saved_rs, line, at, found, fallback)
  {
      if (req == "")
          return "";

      sock = "/inet/tcp/0/f.youku.com/80";
      gsub(/http:\/\/f\.youku\.com/, "", req);

      # Every header line ends in CRLF; an empty line ends the request.
      msg = "GET " req " HTTP/1.1\r\n";
      msg = msg "Accept: */*\r\n";
      msg = msg "Cache-Control: no-cache\r\n";
      msg = msg "Connection: close\r\n";
      msg = msg "Host: f.youku.com\r\n";
      msg = msg "Pragma: no-cache\r\n";
      msg = msg "Referer: http://f.youku.com/player/getFlvPath/sid/00_00/st/flv/fileid/\r\n";
      msg = msg "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; )\r\n";
      msg = msg "\r\n";

      # Read the reply as CRLF-separated records, whatever RS was on entry,
      # so no stray carriage return ends up in the address.
      saved_rs = RS;
      RS = "\r\n";
      printf "%s", msg |& sock;

      found = "";
      fallback = "";
      # Read into a variable so the caller's $0 and fields stay intact.
      while ((sock |& getline line) > 0) {
          at = match(line, /http:\/\//);
          if (at == 0)
              continue;
          if (tolower(line) ~ /^location:/) {
              found = substr(line, at);
              break;
          }
          fallback = substr(line, at);
      }
      close(sock);
      RS = saved_rs;

      if (found == "")
          found = fallback;
      sub(/[ \t\r\n]+$/, "", found);
      return found;
  }
原创粉丝点击