Generating the verify file
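The Python 2 script below reads a tab-separated list of sites and URLs, runs an external extractor (./awrapper_extract_tool) on every URL to pull structured fields, falls back to fetching the page with wget when the direct crawl fails, and writes a <site>.verify.xml file under struc_file/ that records the expected field values as Equal checks plus a RegexMatch check for the content field.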

# -*- coding: utf-8 -*-
# Python 2 script: build <site>.verify.xml files from a list of URLs.
import os
import sys


def regex_content(content):
    # Truncate the extracted content at the first regex metacharacter so the
    # remaining prefix can be embedded safely in a RegexMatch pattern.
    # Sample input used while testing:
    # content = "小燕<哈否\\住宅+在哪里|---*"
    regexs = ['\\', '<', '>', '^', '$', '*', '+', '?', '{', '}',
              '.', '|', '[', ']', '!', '(', ')']
    aline = []
    con = ""
    flag = False
    # Split every character on every metacharacter; occurrences of a
    # metacharacter turn into empty strings and drop out.
    for regex in regexs:
        bline = []
        for aa in content:
            cline = []
            for bb in aa:
                cc = bb.split(regex)
                cline += cc
            bline += cline
        aline += bline
    # Collect characters until the first remaining metacharacter is hit.
    for dd in aline:
        if flag:
            continue
        for regex in regexs:
            if dd == regex:
                flag = True
                break
        if not flag:
            con += dd
    print len(con)
    # Keep at most len(content) characters of the collected prefix.
    result = ""
    count = 0
    for ee in con:
        if count < len(content):
            result += ee
        else:
            continue
        count = count + 1
    return result


# Read the site / URL list given on the command line.
filename = sys.argv[1]
url_file = open(filename, 'r')
lines = url_file.readlines()
url_flag = False

for line in lines:
    a_line = line.split('\t\t')
    url_name = a_line[0].strip(' \n')
    if not url_name or url_name[0] == '#':   # skip blank and commented-out lines
        continue
    print url_name

    # Create the verify XML for this site and write the header.
    new_file = open('%s/struc_file/%s.verify.xml' % (os.getcwd(), url_name), 'a')
    urls = a_line[1].split('\t')
    new_file.write('<?xml version="1.0" encoding="utf-8"?>' + '\n')
    new_file.write('<verify>' + '\n')

    for url in urls:
        # Extract structured fields from the page with the wrapper tool.
        file_tmp = os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy' % url)
        count_file = 0
        file_tmo = []
        for tt in file_tmp:
            count_file = count_file + 1
            file_tmo.append(tt)
        print count_file

        if count_file <= 2:
            # The direct crawl failed: download the page with wget and run
            # the extractor again on the local copy.
            print 'cannot crawl the url %s' % url
            wrong_file = open('%s/tmp/%s' % (os.getcwd(), url_name), 'a')
            os.popen('wget --user-agent="gxy" "%s" -O %s/tmp/%s' % (url, os.getcwd(), url_name))
            file_tmpp = os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy -d %s/tmp/%s'
                                 % (url, os.getcwd(), url_name))
            file_tmo = file_tmpp
            for ll in file_tmpp:
                print ll
            wrong_file.close()
            print count_file
        else:
            file_tmpp = file_tmp

        # Turn each "field: value" line of the extractor output into a check.
        for rr in file_tmo:
            fields = rr.split(':')
            name = fields[0].strip()
            if name == 'url':
                url_flag = True
                new_file.write('\t' + '<url name="' + url.strip('\n') + '">' + '\n')
            elif name in ('author', 'title', 'category', 'source', 'status', 'chapter_name'):
                new_file.write('\t\t' + '<field name="' + name + '" value="' + fields[1].strip()
                               + '" verify_type="Equal" />' + '\n')
            elif name == 'list_url':
                # The URL value may contain ':' itself, so rejoin everything after the field name.
                del fields[0]
                listurl = ":".join(fields).strip(' \n')
                print "list_url is %s\n" % listurl
                new_file.write('\t\t' + '<field name="list_url" value="' + listurl
                               + '" verify_type="Equal" />' + '\n')
            elif name == 'content':
                # Normalise whitespace, then trim the text at the first regex
                # metacharacter before wrapping it in a RegexMatch pattern.
                del fields[0]
                contents = ":".join(fields).split()
                content = " ".join(contents)
                con = regex_content(content)
                new_file.write('\t\t' + '<field name="content" value=".*' + con.strip('\n')
                               + '.*" verify_type="RegexMatch" />' + '\n')
            else:
                continue

        if url_flag == False:
            # No 'url' field came back, so no <url> element was opened.
            new_file.write('\t' + url.strip('\n') + '\n')
        else:
            new_file.write('\t' + '</url>' + '\n')
            url_flag = False

    new_file.write('</verify>' + '\n')
    new_file.close()
    print 'create %s.verify.xml successfully\n' % url_name

url_file.close()
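For reference, each line of the input file is expected to hold a site name and its URLs: the site name is separated from the URL list by two tabs, the URLs are separated from each other by one tab, and lines starting with '#' are skipped. The names and URLs below are placeholders, not values from the original post (the separators are real tab characters):

# commented-out sites are skipped
example_site		http://www.example.com/book/1	http://www.example.com/book/2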

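The generated file follows the structure produced by the new_file.write() calls above; the field values in this sketch are invented for illustration:

<?xml version="1.0" encoding="utf-8"?>
<verify>
	<url name="http://www.example.com/book/1">
		<field name="title" value="Example Title" verify_type="Equal" />
		<field name="author" value="Example Author" verify_type="Equal" />
		<field name="content" value=".*opening words of the body text.*" verify_type="RegexMatch" />
	</url>
</verify>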

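Assuming the script is saved as gen_verify.py (the original post does not name the file), it takes the URL list as its only argument. It writes into struc_file/ and tmp/ under the current directory and expects ./awrapper_extract_tool and the awrapper_extractor/ template directory to be present, but it never creates those directories itself:

mkdir -p struc_file tmp
python gen_verify.py url.txt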