Self-Hosted Random Image API, Follow-up 2: Scraping Full-Resolution Originals from Baidu Images

Published on 2022-09-05 · 919 views


After the previous post in this series, in which I set up a random-image endpoint on my personal site, I went on to crawl some more image packs and promptly hit a snag: even with the width and height parameters manually set to 1920 and 1080, my old crawler code still downloaded nothing but thumbnails  ̄□ ̄||.

A quick Baidu search confirmed that someone had already run into the same problem and shared a fix:

In short, what I had been grabbing all along was the value of the thumbURL field, while Baidu actually puts the real original-image URL in the objURL field, obfuscated with a character-substitution scheme on top. Once that clicked, the fix was easy: I dropped the decoding function the blogger shared in the linked post into my existing code, and now I get the full-resolution originals.
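For reference, a single entry in the returned JSON looks roughly like this (the values here are made-up placeholders; only the two fields that matter are shown):

    item = {
        'thumbURL': 'https://img0.baidu.com/it/u=...',  # thumbnail - what my old code downloaded
        'objURL': 'ipprf_z2C$qAzdH3F...'                # obfuscated URL of the full-size original
    }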

The decoding function looks like this:

import re

def baidu_uncomplie(url):
    """Decode an obfuscated objURL into the real image URL."""
    res = ''
    c = ['_z2C$q', '_z&e3B', 'AzdH3F']
    d = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0', '_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
    if url is None or 'http' in url:
        return url  # empty, or already a plain URL: nothing to decode
    # first restore the multi-character tokens (':', '.', '/') ...
    for m in c:
        url = url.replace(m, d[m])
    # ... then map the remaining substituted characters one by one
    for char in url:
        if re.match(r'[a-w\d]', char):
            char = d[char]
        res += char
    return res

As you can see, it's nothing more than a simple character-substitution cipher = =
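As a quick sanity check, here is a round trip on a made-up URL. The encoded string below was produced by hand with the inverse of the mapping above; real objURL values have the same structure but point at actual image hosts:

    # hand-encoded with the inverse mapping, purely for illustration
    encoded = 'ipprf_z2C$qAzdH3FAzdH3Fjxw4rsj_z&e3Bv54AzdH3Ft42_z&e3B3r2'
    print(baidu_uncomplie(encoded))  # -> https://example.com/img.jpg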

Finally, the updated crawler code:

#!  # path to your own Python interpreter
# -*- coding: utf-8 -*-

import random
import re
import os
import time
import requests
import json


ua_list = [
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

class BaiduPicture:

    def __init__(self):
        # requests builds the query string itself from `params`
        self.url = 'https://image.baidu.com/search/acjson'

    def get_image(self, word, page, pn_end, index_start, path):
        """Collect image URLs for a keyword and download them.

        Args:
            word (str)          : search keyword
            page (int)          : first page to fetch
            pn_end (int)        : last page to fetch
            index_start (int)   : starting index for output file names
            path (str)          : save directory
        """
        image_url = []
        for i in range(page, pn_end + 1):
            params = {
                'tn'          : 'resultjson_com',
                'logid'       : '',# fill in your own
                'ipn'         : 'rj',
                'ct'          : '201326592',
                'is'          : '',
                'fp'          : 'result',
                'fr'          : '',
                'word'        : word,
                'queryWord'   : word,
                'cl'          : '2',
                'lm'          : '-1',
                'ie'          : 'utf-8',
                'oe'          : 'utf-8',
                'adpicid'     : '',
                'st'          : '-1',
                'z'           : '',
                'ic'          : '0',
                'hd'          : '',
                'latest'      : '',
                'copyright'   : '',
                's'           : '',
                'se'          : '',
                'tab'         : '',
                'width'       : '1920', # image width, hard-coded here rather than passed as a parameter
                'height'      : '1080', # image height, hard-coded here rather than passed as a parameter
                'face'        : '0',
                'istype'      : '2',
                'qc'          : '',
                'nc'          : '1',
                'expermode'   : '',
                'nojc'        : '',
                'isAsync'     : '',
                'pn'          : str(30 * i),
                'rn'          : '30',
                'gsm'         : '5a',
            }
            response = requests.get(url=self.url, headers={'User-Agent': random.choice(ua_list)}, params=params)
            # Baidu's JSON is not quite valid: it contains backslashes that are
            # not legal escapes, so double every backslash not followed by /, u or "
            regex = re.compile(r'\\(?![/u"])')
            json_data = json.loads(regex.sub(r'\\\\', response.text))

            try:
                for item in json_data['data']:
                    obj_url = item.get('objURL')
                    if not obj_url:
                        continue  # the last entry of a page is often empty
                    image_url.append(BaiduPicture.baidu_uncomplie(obj_url))
            except (KeyError, TypeError):
                pass  # page came back without a usable 'data' field
            time.sleep(random.random()+1)
            print('Page {} fetched'.format(i))

        if len(image_url) > 0:
            print('{} images to download'.format(len(image_url)))
            for index, image in enumerate(image_url):
                file_name = word + '_' + str(index_start + index + 1) + '.jpg'
                time.sleep(random.randint(1, 2))
                self.save_image(image, file_name, word, path)
            print('Download finished')
    # decode an obfuscated objURL into the real image URL
    @staticmethod
    def baidu_uncomplie(url):
        res = ''
        c = ['_z2C$q', '_z&e3B', 'AzdH3F']
        d = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0', '_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
        if url is None or 'http' in url:
            return url  # empty, or already a plain URL: nothing to decode
        # first restore the multi-character tokens (':', '.', '/') ...
        for m in c:
            url = url.replace(m, d[m])
        # ... then map the remaining substituted characters one by one
        for char in url:
            if re.match(r'[a-w\d]', char):
                char = d[char]
            res += char
        return res


    def save_image(self, url, file_name, word, path):
        """Download one image and write it to disk.

        Args:
            url (str)         : image URL
            file_name (str)   : output file name
            word (str)        : search keyword (used as subdirectory name)
            path (str)        : save directory
        """
        data = requests.get(url, headers={'User-Agent': random.choice(ua_list)}).content

        path = os.path.join(path, word)

        if not os.path.exists(path):
            os.makedirs(path)
        file_path = os.path.join(path, file_name)

        with open(file_path, 'wb') as f:
            f.write(data)
        print('{} saved'.format(file_name))

    def run(self):
        """Entry point: read options interactively, then start crawling."""

        word          = input('Keyword to download: ')
        pn            = int(input('First page: '))
        pn_end        = int(input('Last page: '))
        index_start   = int(input('Start index: '))
        path          = input('Save path: ')
        if path == '' or not os.path.exists(path):
            path = os.getcwd()
        self.get_image(word, pn, pn_end, index_start, path)

if __name__ == '__main__':
    requests.adapters.DEFAULT_RETRIES = 5    # retry flaky connections a few times
    baidu = BaiduPicture()
    baidu.run()
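With the hypothetical inputs below, an interactive run would look roughly like this (counts and file names naturally depend on the query):

    Keyword to download: wallpaper
    First page: 0
    Last page: 1
    Start index: 0
    Save path: ./images
    Page 0 fetched
    Page 1 fetched
    58 images to download
    wallpaper_1.jpg saved
    wallpaper_2.jpg saved
    ...
    Download finished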