Python3 BeautifulSoup pymysql

来源:互联网 发布:淘宝雪花代码在线生成 编辑:程序博客网 时间:2024/05/22 15:44
import http.cookiejar
import logging
import time
import urllib.parse
import urllib.request

import pymysql
import xlrd
from bs4 import BeautifulSoup


class _NameEchoingGlobals(dict):
    """Globals mapping for ``eval`` that resolves every name to its own spelling.

    With this mapping a bare identifier in the evaluated text (``true``,
    ``success``, ...) evaluates to the string ``"true"``, ``"success"``, ...
    """

    def __getitem__(self, name):
        return name


class SalesSpider():
    """Log in to a web site, run one query per key and store the results in MySQL.

    Query keys are read from the first column of an Excel sheet
    (``getQueryKeysFromXls``); ``loginAndQuery`` then submits one query per
    key and saves every non-empty result in the ``sales`` table.

    NOTE(review): the login URL, the job/header/data URLs, the expected page
    title and the credentials are empty strings in this source -- presumably
    redacted. They must be filled in before the spider can run.
    """

    DEFAULT_LOG_FILE = "C:\\Users\\liyang\\Desktop\\salespider.log"
    LOG_FORMAT = "[%(asctime)s][SalesSpider]-----%(message)s------"
    LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
    MAX_RETRIES = 2

    def __init__(self, userId, passWord, logFile=None):
        """Configure logging and build a cookie-aware URL opener.

        Args:
            userId: account name posted to the login form.
            passWord: password posted to the login form.
            logFile: path of the log file; defaults to ``DEFAULT_LOG_FILE``.
        """
        self._configureLogging(logFile or self.DEFAULT_LOG_FILE)
        self.userId = userId
        self.passWord = passWord
        # Initialised here so loginAndQuery() works (as a no-op) even when
        # getQueryKeysFromXls() has not been called yet.
        self.queryKeys = []
        cj = http.cookiejar.LWPCookieJar()
        self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        # Installed globally so plain urllib.request.urlopen() calls share the
        # session cookies obtained by login().
        urllib.request.install_opener(self.opener)

    def _configureLogging(self, logFile):
        """Send INFO-level records to ``logFile``, or to stderr if it cannot be opened."""
        options = {
            "level": logging.INFO,
            # %(asctime)s is evaluated per record; the previous format embedded
            # a timestamp computed once at construction time.
            "format": self.LOG_FORMAT,
            "datefmt": self.LOG_DATE_FORMAT,
        }
        try:
            # The path belongs in ``filename``; ``filemode`` is only the open mode.
            logging.basicConfig(filename=logFile, **options)
        except OSError:
            logging.basicConfig(**options)
            logging.warning("cannot open log file %s, logging to stderr", logFile)

    def request(self, url, postData):
        """Fetch ``url`` and return the decoded response as a Python object.

        Args:
            url: address to fetch.
            postData: url-encoded bytes for a POST, or ``None`` for a GET.

        Returns:
            The object produced by evaluating the UTF-8 response text.
        """
        # urlopen() issues a GET when data is None, so one call covers both cases.
        with urllib.request.urlopen(url, postData) as response:
            text = response.read().decode('utf-8')
        # SECURITY: eval() runs on data received from the network. Names are
        # mapped to strings by _NameEchoingGlobals, but attribute access on
        # literals is still possible.
        # NOTE(review): replace with a real parser (e.g. json.loads) once the
        # response format is confirmed.
        js = eval(text, _NameEchoingGlobals())
        logging.info("url[%s]=%s", url, js)
        return js

    def login(self):
        """Post the credentials and report whether the login page title matches.

        Returns:
            ``True`` when the returned page's ``<title>`` equals the expected
            title, ``False`` otherwise (including when the page has no title).
        """
        params = {
            "userId": self.userId,
            "password": self.passWord,
        }
        loginUrl = ""
        postData = urllib.parse.urlencode(params).encode(encoding='UTF8')
        with self.opener.open(loginUrl, postData) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")
        # A page without <title> counts as a failed login instead of raising
        # AttributeError.
        title = soup.title.string if soup.title is not None else None
        if "" == title:
            logging.info("login success!")
            return True
        logging.info("login failed!")
        return False

    def submit(self, times, recordid):
        """Run the query for one key and store the result unless already stored.

        Args:
            times: 1-based position of the key, used only for logging.
            recordid: query key; also the primary lookup value in ``sales``.
        """
        if self.findFromDB(recordid):
            return
        # Lazy %-formatting also copes with non-string keys (e.g. numeric cells).
        logging.info("No.%s,key=%s", times, recordid)
        # NOTE(review): urlencode() serialises the nested list with str(), i.e.
        # as a Python repr -- confirm the server does not expect JSON here.
        params = {
            "ObjSelect": "50392,50396,50397,50015,50021,50394,50395,50009,50790",
            "viewMonth": "2017-05-01#2017-07-01",
            "viewDay": "2017-06-18#2017-07-18",
            "Slt_Latn_Id": "",
            "paramSetValue": [
                {"AtomId": 50007, "AtomValue": recordid, "TimeWindow": "2017-05-01#2017-05-01", "maxDataDt": "",
                 "atomcopyid": "0", "operatorId": "0", "operatorName": "\u9ed8\u8ba4", "grpId": "", "grpPath": "",
                 "showTypeId": "", "id": ""}]
        }
        jobUrl = ""
        postData = urllib.parse.urlencode(params).encode(encoding='UTF8')
        job = self.request(jobUrl, postData)
        if "true" != job["success"]:
            return
        # NOTE(review): jobId is unused because the URLs below are blank in
        # this source; presumably they are meant to embed it.
        jobId = str(job["queryJobId"])
        headerUrl = ""
        # The header response is not used, but the request is still sent.
        self.request(headerUrl, None)
        saleDataUrl = ""
        saleData = self.request(saleDataUrl, None)
        # str() so a numeric total of 0 is recognised as "no rows" as well.
        if "0" != str(saleData["total"]):
            self.insertIntoDB(recordid, str(saleData))

    def loginAndQuery(self):
        """Log in, then submit every query key, re-logging in on failure.

        A key whose submission raises is retried up to ``MAX_RETRIES`` times,
        each time after a fresh login; after that the key is skipped.
        """
        if not self.login():
            return
        for position, key in enumerate(self.queryKeys, start=1):
            try:
                self.submit(position, key)
            except Exception:
                logging.warning("retry login 2 times!", exc_info=True)
                self._retrySubmit(position, key)

    def _retrySubmit(self, position, key):
        """Re-login and resubmit one key, giving up after ``MAX_RETRIES`` attempts."""
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                self.login()  # log in again
                self.submit(position, key)
                return
            except Exception:
                logging.warning("reTry login " + str(attempt) + " time also is failed!", exc_info=True)

    def getQueryKeysFromXls(self, path, sheetName):
        """Load the query keys from the first column of an Excel sheet.

        Args:
            path: path of the workbook.
            sheetName: name of the sheet to read.

        The first cell of every row is stored, in order, in ``self.queryKeys``.
        """
        workbook = xlrd.open_workbook(path)
        sheet = workbook.sheet_by_name(sheetName)
        self.queryKeys = [sheet.row_values(row)[0] for row in range(sheet.nrows)]

    def _connect(self):
        """Open a connection to the local ``test1`` MySQL database."""
        return pymysql.connect(host='localhost', user='root', passwd='root', db='test1', port=3306, charset='utf8')

    def findFromDB(self, recordid):
        """Return ``True`` when ``sales`` already holds a row for ``recordid``."""
        db = self._connect()
        try:
            with db.cursor() as cursor:
                sql = ' select * from sales where recordid = %s '
                cursor.execute(sql, (recordid,))
                # Fetch while the cursor and connection are still open.
                return cursor.fetchone() is not None
        finally:
            db.close()

    def insertIntoDB(self, recordid, js):
        """Insert one ``(recordid, js)`` row into ``sales`` and commit it."""
        db = self._connect()
        try:
            with db.cursor() as cursor:
                sql = " insert into sales(recordid , js)  values(%s , %s) "
                cursor.execute(sql, (recordid, js))
            db.commit()
        finally:
            db.close()


if __name__ == '__main__':
    spider = SalesSpider("", "")
    logging.info("spider is begin!")
    spider.getQueryKeysFromXls("C:\\Users\\liyang\\Desktop\\buy.xls", "buy")
    spider.loginAndQuery()
    logging.info("spider is end!")