看到网上很多教程教大家爬取淘宝的淘女郎图库。看来美女对程序猿们的吸引力不可谓不大啊。于是乎,根据昨晚学习的经历(详见《Python实例:JS爬虫,抓取今日头条“街拍”图库》),自己也写一个爬取淘宝淘女郎图库的代码。
大概看了下网上的教程,很多教程都是通过PhantomJS来爬取 “https://mm.taobao.com/json/request_top_list.htm?page=1” 这个页面来获取数据信息。然而我脑袋不好使,看了许久都不知道他们是如何得到这个html页面的。在很多情况下,如果不知道如何获取到页面数据,那就无从下手了啊。
试试看能不能直接用昨晚的办法来获取数据吧。
主要步骤如下:
我们通过PhantomJS来模拟网页登陆从而获取网页的html信息,获取完数据记得要用 quit() 方法退出PhantomJS。
1 2 3 4 5 6 |
def get_page_source(url):
    """Render *url* with headless PhantomJS and return its HTML source.

    Selenium/PhantomJS is used because the page builds its content with
    JavaScript, so a plain HTTP GET would not see the rendered DOM.

    :param url: page address to load.
    :return: the rendered page source as a string.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if driver.get() raises;
        # otherwise every failure leaks a phantomjs process.
        driver.quit()
    return html
然后分析JS返回的数据,通过zip函数将其整理成字典中待调用。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
def get_mm_dict_list(html):
    """Parse the model-list page HTML into a list of per-model dicts.

    Each dict has the keys 'name', 'city', 'height', 'weight' and 'url'
    (a protocol-relative link to the model's personal page). If the five
    regexes disagree on how many matches they found, the markup is not
    what we expect, and an empty list is returned rather than zipping
    mis-aligned records together.

    :param html: rendered HTML of the listing page.
    :return: list of per-model dicts (possibly empty).
    """
    # NOTE(review): these patterns are tied to the current page markup;
    # 'height'/'weight' rely on the "<span>172CM / 50KG</span>" layout.
    patterns = {
        'name': re.compile(r'class=\"name\">(.*?)</span>'),
        'city': re.compile(r'class=\"city\">(.*?)</span>'),
        'height': re.compile(r'<span>(.*?) '),
        'weight': re.compile(r'/ (.*?)</span>'),
        'url': re.compile(r'<a href=\"(//mm.taobao.com/self/.*?)\"'),
    }
    columns = {key: regex.findall(html) for key, regex in patterns.items()}
    # All five columns must be the same length, otherwise refuse to pair them.
    if len({len(matches) for matches in columns.values()}) != 1:
        return []
    return [dict(zip(columns, row)) for row in zip(*columns.values())]
再将字典中的url再次分析,获取图片路径,然后下载到本地
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
def get_mm_img(path, mm_dict_list):
    """Download every gallery image for each model in *mm_dict_list*.

    For each model a sub-directory named "<name> <city> <height> <weight>"
    is created under *path*, the model's personal page is rendered via
    get_page_source(), and every imgextra picture found in it is saved
    there as "<n>.<ext>".

    :param path: base directory, ending with a path separator.
    :param mm_dict_list: dicts with 'name', 'city', 'height', 'weight'
        and 'url' keys, as produced by get_mm_dict_list().
    """
    # Loop-invariant: compile the image pattern once, not once per model.
    reg_img = re.compile(r'(//img.alicdn.com/imgextra/.*?)\"')
    for mm in mm_dict_list:
        mm_path = '%s%s %s %s %s\\' % (path, mm['name'], mm['city'],
                                       mm['height'], mm['weight'])
        # exist_ok replaces the redundant (and racy) isdir()/mkdir()
        # check-then-act pair of the original.
        os.makedirs(mm_path, exist_ok=True)
        mm_page_source = get_page_source('https:' + mm['url'], mm_path)
        for n, img in enumerate(reg_img.findall(mm_page_source), start=1):
            print('正在下载 %s 的第 %d 张图片' % (mm['name'], n))
            # img[-3:] keeps the original file extension (jpg/png/...).
            urllib.request.urlretrieve('https:' + img,
                                       '%s%s.%s' % (mm_path, n, img[-3:]))
所有的代码如下(增加了部分提示性语句的输出):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# 抓取淘宝淘女郎图册
# Scrape the Taobao model galleries: PhantomJS renders the JS-built pages,
# regexes pull out each model's record, and the images are saved locally.

from selenium import webdriver
import re
import os
import urllib.request
import requests

# Gallery images all live under this CDN path; compile the pattern once at
# module level instead of once per model page.
REG_IMG = re.compile(r'(//img.alicdn.com/imgextra/.*?)\"')


def get_page_source(url, path):
    """Render *url* with headless PhantomJS, archive and return its HTML.

    The page content is generated by JavaScript, so a plain HTTP GET would
    not see it. A copy of the source is written to ``path + 'source_code.txt'``
    for later inspection.

    :param url: page address to load.
    :param path: directory (ending with a separator) for the HTML dump.
    :return: rendered page source as a string.
    """
    print('正在获取 %s 的源码数据' % url)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if driver.get() raises;
        # otherwise every failure leaks a phantomjs process.
        driver.quit()
    # BUG FIX: the dump filename was misspelled 'souce_code.txt'.
    with open(path + 'source_code.txt', 'w', encoding='utf-8') as f:
        f.write(html)
    print('源码数据获取完成')
    return html


def get_mm_dict_list(html, path):
    """Parse the listing page into per-model dicts and archive the result.

    Each dict has the keys 'name', 'city', 'height', 'weight' and 'url'
    (a protocol-relative link to the model's personal page). If the five
    regexes disagree on how many matches they found, the markup is not what
    we expect and an empty list is produced instead of mis-aligned records.

    :param html: rendered HTML of the listing page.
    :param path: directory (ending with a separator) for the mm_list dump.
    :return: list of per-model dicts (possibly empty).
    """
    print('正在获取主页面中MM列表数据')
    # NOTE(review): patterns are tied to the current markup; 'height' and
    # 'weight' rely on the "<span>172CM / 50KG</span>" layout.
    patterns = {
        'name': re.compile(r'class=\"name\">(.*?)</span>'),
        'city': re.compile(r'class=\"city\">(.*?)</span>'),
        'height': re.compile(r'<span>(.*?) '),
        'weight': re.compile(r'/ (.*?)</span>'),
        'url': re.compile(r'<a href=\"(//mm.taobao.com/self/.*?)\"'),
    }
    columns = {key: regex.findall(html) for key, regex in patterns.items()}
    mm_dict_list = []
    # All five columns must be the same length before pairing them up.
    if len({len(matches) for matches in columns.values()}) == 1:
        mm_dict_list = [dict(zip(columns, row))
                        for row in zip(*columns.values())]
    with open(path + 'mm_list.txt', 'w', encoding='utf-8') as f:
        f.write('%s' % mm_dict_list)
    print('%s 个MM列表数据获取完成' % len(mm_dict_list))
    return mm_dict_list


def set_path(path_name, base='F:\\'):
    """Create (if missing) and return the base save directory.

    :param path_name: folder name to create under *base*.
    :param base: drive/base directory; the default keeps the original
        hard-coded 'F:\\' behavior while allowing other locations.
    :return: the directory path, ending with a backslash.
    """
    path = '%s%s\\' % (base, path_name)
    print('正在设置保存路径为:', path)
    # exist_ok replaces the racy isdir()/mkdir() check-then-act pair.
    os.makedirs(path, exist_ok=True)
    return path


def get_mm_img(path, mm_dict_list):
    """Download every gallery image for each model in *mm_dict_list*.

    Creates a sub-directory "<name> <city> <height> <weight>" per model
    under *path*, renders the model's personal page, and saves each
    imgextra picture there as "<n>.<ext>". Images that do not answer
    HTTP 200 are skipped with a message.

    :param path: base directory, ending with a path separator.
    :param mm_dict_list: records produced by get_mm_dict_list().
    """
    for mm in mm_dict_list:
        mm_path = '%s%s %s %s %s\\' % (path, mm['name'], mm['city'],
                                       mm['height'], mm['weight'])
        print('正在设置保存路径为:', mm_path)
        os.makedirs(mm_path, exist_ok=True)
        mm_page_source = get_page_source('https:' + mm['url'], mm_path)
        print('正在获取 %s 的图片数据' % mm['name'])
        mm_img_list = REG_IMG.findall(mm_page_source)
        with open(mm_path + 'mm_img_list.txt', 'w', encoding='utf-8') as f:
            f.write('%s' % mm_img_list)
        for n, img in enumerate(mm_img_list, start=1):
            response = requests.get('https:' + img)
            if response.status_code == 200:
                print('正在下载 %s 的第 %d 张图片:%s' % (mm['name'], n, img))
                # BUG FIX: the original fetched every image TWICE —
                # requests.get for the status check, then urlretrieve for
                # the save. Reuse the bytes we already downloaded.
                # img[-3:] keeps the original file extension (jpg/png/...).
                with open('%s%s.%s' % (mm_path, n, img[-3:]), 'wb') as img_f:
                    img_f.write(response.content)
            else:
                print('下载 %s 的第 %d 张图片失败:%s' % (mm['name'], n, img))


def main():
    """Entry point: fetch the listing page, parse it, download all images."""
    url = 'https://mm.taobao.com/search_tstar_model.htm'
    path = set_path('淘宝MM')
    index_source = get_page_source(url, path)
    mm_dict_list = get_mm_dict_list(index_source, path)
    get_mm_img(path, mm_dict_list)


if __name__ == '__main__':
    main()
执行结果: