python 抓取淘宝相册并分别下载

来源:互联网 发布:卡盟下单源码 编辑:程序博客网 时间:2024/05/17 02:54
import too
import urllib.request
import re
import os
import json
class Spider:
    """Scrapes a Taobao model listing and downloads every model's photo albums.

    Flow: getUrlContent (listing pages) -> getContent (parse one page) ->
    startSaveUser (album list of one user) -> getAllImages (one album's JSONP
    photo list) -> saveImageByUrl (one picture to disk).
    """

    def __init__(self, siteUrl):
        # siteUrl: base listing URL; pages are fetched as siteUrl + "?page=N".
        self.siteUrl = siteUrl
        # Project-local helper from 'import too' — TODO confirm what Tool provides.
        self.tool = too.Tool()

    def getUrlContent(self, start, end):
        """Fetch listing pages start..end (inclusive) and parse each one."""
        # BUG FIX: original read 'for i inrange(...)' — a SyntaxError.
        for i in range(start, end + 1):
            myUrl = self.siteUrl + "?page=" + str(i)
            context = urllib.request.urlopen(myUrl).read()
            self.getContent(context)

    def getContent(self, context):
        """Parse one listing page (gbk bytes) and process every model on it.

        Each match yields (profile-url, avatar-img, name, age, city).
        """
        context = context.decode("gbk")
        pattern = re.compile(
            '<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',
            re.S)
        items = re.findall(pattern, context)

        # Hoisted out of the loop (original recompiled it per item).
        getNumberPattern = re.compile(r'\d+')
        for item in items:
            print("模特---" + item[2] + ",年龄:" + item[3] + ",地址:" + item[4])
            # The numeric user id is the first run of digits in the profile URL.
            mm_id = getNumberPattern.findall(item[0])[0]
            print(mm_id)
            self.startSaveUser(mm_id, item[2])

    def startSaveUser(self, mm_id, name):
        """Fetch the album list of one user and download each album."""
        albumsUrl = "https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=" + mm_id + "&page=1"
        albumContent = urllib.request.urlopen(albumsUrl).read().decode("gbk")

        # Each album entry lives inside an <h4>...</h4> fragment.
        albums = re.findall(r'<h4>(.*?)</h4>', albumContent, re.S | re.M)
        for oneAlbum in albums:
            # Album URL: content of the first href attribute (single- or double-quoted).
            urlRes = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
            oneAlbumUrl = re.findall(urlRes, oneAlbum, re.S | re.M)[0].strip()
            print(oneAlbumUrl)

            # Album name: anchor text; dots stripped so it is a safe directory name.
            nameRes = r'<a .*?>(.*?)</a>'
            albumName = re.findall(nameRes, oneAlbum, re.S | re.M)[0].strip()
            albumName = albumName.replace(".", "")

            # Directory layout on disk: <model name>/<album name>
            path = name + "/" + albumName
            self.createDir(path)
            self.getAllImages(mm_id, oneAlbumUrl, path)

    def getAllImages(self, userId, oneAlbumUrl, path):
        """Request one album's JSONP photo list and save every picture into path."""
        oneAlbumUrl = "http://" + oneAlbumUrl
        result = urllib.parse.urlparse(oneAlbumUrl)
        param = urllib.parse.parse_qs(result.query, True)
        albumId = param["album_id"][0]
        imagesUrl = "https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=" + userId + "&album_id=" + albumId + "&top_pic_id=0&page=1&callback=jsonp254"

        imagesResult = urllib.request.urlopen(imagesUrl).read().decode("gbk")
        # Strip whitespace, then unwrap the JSONP envelope 'jsonp254( ... )'.
        imagesResult = imagesResult.replace("\r", "").replace("\n", "").replace("\t", "")
        imagesResult = imagesResult.split("(")[1].split(")")[0]

        imagesData = json.loads(imagesResult)
        for index, onImage in enumerate(imagesData["picList"]):
            # picUrl is protocol-relative ("//..."), so prefix the scheme.
            oneImageUrl = "http:" + onImage["picUrl"]
            self.saveImageByUrl(oneImageUrl, index, path)

    def saveImageByUrl(self, imgUrl, imageName, path):
        """Download imgUrl to <path>/<imageName>.jpg, skipping files that exist."""
        path = path + "/" + str(imageName) + ".jpg"
        if os.path.exists(path):
            print(path + "已经存在了")
            return

        print("开始保存" + path)
        imagesData = urllib.request.urlopen(imgUrl).read()
        # BUG FIX: context manager guarantees the handle is closed even on error
        # (original used open/write/close with no try/finally).
        with open(path, "wb") as f:
            f.write(imagesData)

    def createDir(self, path):
        """Create the directory tree `path` unless it already exists."""
        path = path.strip()
        if os.path.exists(path):
            print("该路径已经存在,不用创建了")
        else:
            os.makedirs(path)





if __name__ == "__main__":
    # Script entry: crawl listing pages 1..2 (inclusive). Guarded so that
    # importing this module no longer fires network requests as a side effect.
    s = Spider("https://mm.taobao.com/json/request_top_list.htm")
    s.getUrlContent(1, 2)


原创粉丝点击