# Generate verify files
# Source: scraped from a programming blog (originally published on the internet; retrieved 2024-04-29).
#encoding:utf-8import osimport sysimport urllib2def regex_content(content): ###content="小燕<哈否\\住宅+在哪里|---*" #print content #print len(content) regexs=['\\','<','>','^','$','*','+','?','{','}','.','|','[',']','!','(',')'] aline=[] bline=[] cline=[] con="" flag=False for regex in regexs: bline=[] for aa in content: cline=[] for bb in aa: cc=bb.split(regex) cline+=cc bline+=cline aline+=bline #print aline #print len(aline) count=0 for dd in aline: # if flag==True or count==len(content): if flag==True: continue else: for regex in regexs: if flag==True: continue elif dd==regex: flag=True #print con break else: continue if flag==False: con+=dd print len(con) str="" count=0 for ee in con: if count < len(content): str+=ee else: continue count=count+1 return str ##read sth form url.txtfilename=sys.argv[1]url_file=open('%s' % filename,'r')lines=url_file.readlines()file_tmp=[]file_tmpp=[]file_tmo=[]url_flag=Falsefor line in lines: a_line=line.split('\t\t') url_name=a_line[0].strip(' \n') if url_name[0] == '#': continue print url_name #create verify xml new_file=file('%s/struc_file/%s.verify.xml' % (os.getcwd(),url_name),'a') urls=a_line[1].split('\t') #write head and sitename new_file.write('<?xml version="1.0" encoding="utf-8"?>'+'\n') new_file.write('<verify>'+'\n') for url in urls: ##awrapper sth form websites file_tmp=os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy' % url) #do sth if the url cannot be crawled count_file=0 file_tmo=[] for tt in file_tmp: count_file=count_file+1 file_tmo.append(tt) print count_file ##if file_tmp[0].find('crawl')>=0 or count_file<=1: if count_file <= 2: #wget print 'cannot crawl the url %s' % url wrong_file=file('%s/tmp/%s' % (os.getcwd(),url_name),'a') os.popen('wget --user-agent="gxy" "%s" -O %s/tmp/%s' % (url,os.getcwd(),url_name)) file_tmpp=os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy -d %s/tmp/%s' % (url,os.getcwd(),url_name)) file_tmo=file_tmpp for ll in file_tmpp: print ll 
wrong_file.close() print count_file else: file_tmpp=file_tmp coo=0 for pp in file_tmpp: coo=coo+1 #print 'the num of orgin is %s ' % coo #print "gxy" ##print url #print data for rr in file_tmo: ##print "gxy in loop" ##print rr fields=rr.split(':') if fields[0].strip()=='url': url_flag=True new_file.write('\t'+'<url name="'+url.strip('\n')+'">'+'\n') elif fields[0].strip()=='author' or fields[0].strip()=='title' or fields[0].strip()=='category' or fields[0].strip()=='source' or fields[0].strip()=='status' or fields[0].strip()=='chapter_name': new_file.write('\t\t'+'<field name="'+fields[0].strip()+'" value="'+fields[1].strip()+'" verify_type="Equal" />'+'\n') elif fields[0].strip()=='list_url': listurl="" field_name=fields[0] del fields[0] listurl=":".join(fields).strip(' \n') print "list_url is %s\n" % listurl new_file.write('\t\t'+'<field name="'+field_name.strip()+'" value="'+listurl+'" verify_type="Equal" />'+'\n') elif fields[0].strip()=='content': field_name=fields[0].strip() del fields[0] contents=":".join(fields).split() if contents[0]==" ": del contents[0] content=" ".join(contents) con=regex_content(content) new_file.write('\t\t'+'<field name="content" value=".*'+con.strip('\n')+'.*" verify_type="RegexMatch" />'+'\n') else: continue if url_flag==False: new_file.write('\t'+url.strip('\n')+'\n') else: new_file.write('\t'+'</url>'+'\n') url_flag=False new_file.write('</verify>'+'\n') new_file.close() print 'create %s.verify.xml succeedly\n' % url_nameurl_file.close()