HTML解析---IMDB简介抓取

来源:互联网 发布:淘宝重新注册账号申请 编辑:程序博客网 时间:2024/05/17 18:47

      今天做了个IMDB简介抓取,用的是HTML解析,而不是用提供的API;以前用JavaScript做过一个类似的。这是第一次用HTML解析,可以说是从一窍不通到了略知一二。解析库用的是开源的hpple,它的新版和旧版有不少出入,绕了很多弯路,才找到了点门道。


效果图:


就一个方法

/// Fetches the IMDB page for the title ID typed into `sender` and shows the
/// movie name and its plot summary.
///
/// The title page is scraped first. If it carries the "Add Full Plot" link,
/// there is no separate summary page, so the short description on the title
/// page is shown. Otherwise the detailed summaries are scraped from the
/// title's `plotsummary` page.
///
/// @param sender The control whose `stringValue` holds the IMDB title ID.
- (IBAction)searchIMDB:(id)sender {
    static NSString * const bullet = @"\n\t◉";   // decoration placed before each summary

    // Reset the plot area before showing new results.
    [[self plot] setString:@" "];

    // NOTE: -initWithContentsOfURL: is synchronous, so this action blocks the
    // calling thread for as long as the downloads take.
    NSString *titleID = [sender stringValue];
    NSString *titleAddress = [NSString stringWithFormat:@"http://207.171.166.140/title/%@/", titleID];
    NSURL *titleURL = [NSURL URLWithString:titleAddress];
    if (!titleURL) {
        return;   // the typed ID does not form a valid URL
    }
    NSData *titleData = [[NSData alloc] initWithContentsOfURL:titleURL];
    if (!titleData) {
        return;
    }

    TFHpple *titleParser = [[TFHpple alloc] initWithHTMLData:titleData];
    NSArray *nameNodes = [titleParser searchWithXPathQuery:@"//h1//text()"];
    NSArray *descriptionNodes = [titleParser searchWithXPathQuery:@"//p[@itemprop='description']//text()"];
    NSArray *addPlotNodes = [titleParser searchWithXPathQuery:@"id('maindetails_center_bottom')/div[5]/span[2]/span[1]/a//text()"];

    // -firstObject yields nil for an empty result instead of raising
    // NSRangeException the way -objectAtIndex:0 does.
    TFHppleElement *nameNode = [nameNodes firstObject];
    TFHppleElement *descriptionNode = [descriptionNodes firstObject];

    // Strip the newlines surrounding the movie name and show it in the label.
    NSCharacterSet *newlines = [NSCharacterSet newlineCharacterSet];
    NSString *movieName = [[nameNode content] stringByTrimmingCharactersInSet:newlines];
    [[self movie] setStringValue:(movieName ? movieName : @"")];

    if ([addPlotNodes count]) {
        // "Add Full Plot" was found, so no other summary versions exist:
        // take the short description from this page.
        [[self plot] insertText:bullet];
        NSString *shortPlot = [[descriptionNode content] stringByTrimmingCharactersInSet:newlines];
        if (shortPlot) {
            [[self plot] insertText:shortPlot];
        }
        return;
    }

    // Other versions exist, i.e. the page links to "Plot Summary": scrape the
    // detailed summaries from that page instead.
    NSString *summaryAddress = [NSString stringWithFormat:@"http://207.171.166.140/title/%@/plotsummary", titleID];
    NSURL *summaryURL = [NSURL URLWithString:summaryAddress];
    if (!summaryURL) {
        return;
    }
    NSData *summaryData = [[NSData alloc] initWithContentsOfURL:summaryURL];
    if (!summaryData) {
        return;
    }

    TFHpple *summaryParser = [[TFHpple alloc] initWithHTMLData:summaryData];
    NSArray *summaryNodes = [summaryParser searchWithXPathQuery:@"id('tn15content')/p//text()"];

    [[self plot] insertText:bullet];
    for (TFHppleElement *node in summaryNodes) {
        NSString *rawText = [node content];
        NSString *piece = nil;

        if ([rawText isEqualToString:@"\n"]) {
            // A lone newline separates two summaries: start a new bullet.
            piece = @"\n\n\t◉";
        } else if ([rawText hasSuffix:@"\n "]) {
            // Turn "\nWritten by\n " into "Written by ".
            NSString *trimmed = [rawText stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
            piece = [NSString stringWithFormat:@"%@ ", trimmed];
        } else {
            // Otherwise drop leading/trailing newlines; a string made only of
            // newlines becomes empty.
            piece = [rawText stringByTrimmingCharactersInSet:newlines];
        }

        if ([piece length]) {
            [[self plot] insertText:piece];
        }
    }
}


原创粉丝点击