Self-Hosted Random Image API, Follow-up 2: Scraping Full-Resolution Originals from Baidu Images

Published on 2022-09-05 · 919 views


After the previous post in this series, in which I set up a random-image endpoint on my personal site, I went on to crawl some more image packs and promptly hit a snag: even with the width and height parameters manually set to 1920 and 1080, my old crawler code still downloaded nothing but thumbnails  ̄□ ̄||.

A quick Baidu search confirmed that someone had already run into the same problem and shared a fix:

In short, what I had been grabbing all along was the value of the thumbURL field, while Baidu actually puts the real original-image URL in the objURL field, obfuscated with a character-substitution scheme on top. Once that clicked, the fix was easy: I dropped the decoding function the blogger shared in the linked post into my existing code, and now I get the full-resolution originals.
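For reference, a single entry in the returned JSON looks roughly like this (the values here are made-up placeholders; only the two fields that matter are shown):

    item = {
        'thumbURL': 'https://img0.baidu.com/it/u=...',  # thumbnail - what my old code downloaded
        'objURL': 'ipprf_z2C$qAzdH3F...'                # obfuscated URL of the full-size original
    }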

The decoding function looks like this:

import re

def baidu_uncomplie(url):
    """Decode an obfuscated objURL into the real image URL."""
    res = ''
    c = ['_z2C$q', '_z&e3B', 'AzdH3F']
    d = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0', '_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
    if url is None or 'http' in url:
        return url  # empty, or already a plain URL: nothing to decode
    # first restore the multi-character tokens (':', '.', '/') ...
    for m in c:
        url = url.replace(m, d[m])
    # ... then map the remaining substituted characters one by one
    for char in url:
        if re.match(r'[a-w\d]', char):
            char = d[char]
        res += char
    return res

As you can see, it's nothing more than a simple character-substitution cipher = =
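As a quick sanity check, here is a round trip on a made-up URL. The encoded string below was produced by hand with the inverse of the mapping above; real objURL values have the same structure but point at actual image hosts:

    # hand-encoded with the inverse mapping, purely for illustration
    encoded = 'ipprf_z2C$qAzdH3FAzdH3Fjxw4rsj_z&e3Bv54AzdH3Ft42_z&e3B3r2'
    print(baidu_uncomplie(encoded))  # -> https://example.com/img.jpg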

Finally, the updated crawler code:

#!  # path to your own Python interpreter
# -*- coding: utf-8 -*-

import random
import re
import os
import time
import requests
import json


ua_list = [
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

class BaiduPicture:

    def __init__(self):
        # requests builds the query string itself from `params`
        self.url = 'https://image.baidu.com/search/acjson'

    def get_image(self, word, page, pn_end, index_start, path):
        """Collect image URLs for a keyword and download them.

        Args:
            word (str)          : search keyword
            page (int)          : first page to fetch
            pn_end (int)        : last page to fetch
            index_start (int)   : starting index for output file names
            path (str)          : save directory
        """
        image_url = []
        for i in range(page, pn_end + 1):
            params = {
                'tn'          : 'resultjson_com',
                'logid'       : '',# fill in your own
                'ipn'         : 'rj',
                'ct'          : '201326592',
                'is'          : '',
                'fp'          : 'result',
                'fr'          : '',
                'word'        : word,
                'queryWord'   : word,
                'cl'          : '2',
                'lm'          : '-1',
                'ie'          : 'utf-8',
                'oe'          : 'utf-8',
                'adpicid'     : '',
                'st'          : '-1',
                'z'           : '',
                'ic'          : '0',
                'hd'          : '',
                'latest'      : '',
                'copyright'   : '',
                's'           : '',
                'se'          : '',
                'tab'         : '',
                'width'       : '1920', # image width, hard-coded here rather than passed as a parameter
                'height'      : '1080', # image height, hard-coded here rather than passed as a parameter
                'face'        : '0',
                'istype'      : '2',
                'qc'          : '',
                'nc'          : '1',
                'expermode'   : '',
                'nojc'        : '',
                'isAsync'     : '',
                'pn'          : str(30 * i),
                'rn'          : '30',
                'gsm'         : '5a',
            }
            response = requests.get(url=self.url, headers={'User-Agent': random.choice(ua_list)}, params=params)
            # Baidu's JSON is not quite valid: it contains backslashes that are
            # not legal escapes, so double every backslash not followed by /, u or "
            regex = re.compile(r'\\(?![/u"])')
            json_data = json.loads(regex.sub(r'\\\\', response.text))

            try:
                for item in json_data['data']:
                    obj_url = item.get('objURL')
                    if not obj_url:
                        continue  # the last entry of a page is often empty
                    image_url.append(BaiduPicture.baidu_uncomplie(obj_url))
            except (KeyError, TypeError):
                pass  # page came back without a usable 'data' field
            time.sleep(random.random()+1)
            print('Page {} fetched'.format(i))

        if len(image_url) > 0:
            print('{} images to download'.format(len(image_url)))
            for index, image in enumerate(image_url):
                file_name = word + '_' + str(index_start + index + 1) + '.jpg'
                time.sleep(random.randint(1, 2))
                self.save_image(image, file_name, word, path)
            print('Download finished')
    # decode an obfuscated objURL into the real image URL
    @staticmethod
    def baidu_uncomplie(url):
        res = ''
        c = ['_z2C$q', '_z&e3B', 'AzdH3F']
        d = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0', '_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
        if url is None or 'http' in url:
            return url  # empty, or already a plain URL: nothing to decode
        # first restore the multi-character tokens (':', '.', '/') ...
        for m in c:
            url = url.replace(m, d[m])
        # ... then map the remaining substituted characters one by one
        for char in url:
            if re.match(r'[a-w\d]', char):
                char = d[char]
            res += char
        return res


    def save_image(self, url, file_name, word, path):
        """Download one image and write it to disk.

        Args:
            url (str)         : image URL
            file_name (str)   : output file name
            word (str)        : search keyword (used as subdirectory name)
            path (str)        : save directory
        """
        data = requests.get(url, headers={'User-Agent': random.choice(ua_list)}).content

        path = os.path.join(path, word)

        if not os.path.exists(path):
            os.makedirs(path)
        file_path = os.path.join(path, file_name)

        with open(file_path, 'wb') as f:
            f.write(data)
        print('{} saved'.format(file_name))

    def run(self):
        """Entry point: read options interactively, then start crawling."""

        word          = input('Keyword to download: ')
        pn            = int(input('First page: '))
        pn_end        = int(input('Last page: '))
        index_start   = int(input('Start index: '))
        path          = input('Save path: ')
        if path == '' or not os.path.exists(path):
            path = os.getcwd()
        self.get_image(word, pn, pn_end, index_start, path)

if __name__ == '__main__':
    requests.adapters.DEFAULT_RETRIES = 5    # retry flaky connections a few times
    baidu = BaiduPicture()
    baidu.run()
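With the hypothetical inputs below, an interactive run would look roughly like this (counts and file names naturally depend on the query):

    Keyword to download: wallpaper
    First page: 0
    Last page: 1
    Start index: 0
    Save path: ./images
    Page 0 fetched
    Page 1 fetched
    58 images to download
    wallpaper_1.jpg saved
    wallpaper_2.jpg saved
    ...
    Download finished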