Loss-prevention projects frequently need to collect images for training. These scripts make it easy to download images from the database: supply a SQL statement and run a script to pull the images to the local machine, either into a single folder or sorted into separate folders by product code.
The toolkit consists of three scripts and one text file. To use it, write the query into sql.txt, then run whichever of the two download scripts fits your needs: 分类下载图片.py (download sorted by product code) or 不分类下载图片.py (download into a single folder).
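For example, sql.txt might contain a query along these lines. The table and column names here are hypothetical; what matters is that the first selected column is the image URL and the second, if present, is the product code (or the JSON label for the variant described later):

SELECT image_url, product_code
FROM product_images
WHERE created_at >= '2023-01-01';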
About download_urls.py: this file is imported as a module by the other two scripts. Run on its own, it creates a text file urls.txt and writes the SQL query results into it; you normally do not need to execute it separately.
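The query results land in urls.txt one row per line, columns separated by tabs, with blank and non-URL lines filtered out. The file ends up looking something like this (the URLs and codes below are made up for illustration):

http://example.com/img/0001.jpg	6901234567890
http://example.com/img/0002.jpg	6901234567891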
download_urls.py

import pymysql

url_file = 'urls.txt'

# Read the SQL statement from sql.txt.
with open("sql.txt", 'r') as s:
    query = s.read()

def connect_to_mysql():
    """Open a connection to the MySQL database."""
    try:
        conn = pymysql.connect(
            host='xxxx',
            database='xxxx',
            user='xxxx',
            password='xxxx'
        )
        print("Connected to the MySQL database")
        return conn
    except pymysql.Error as error:
        print("Failed to connect to MySQL: {}".format(error))
        return None

def execute_query_write_to_file(conn, query, filename):
    """Run the query and write each result row to the file, tab-separated."""
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        with open(filename, 'w') as file:
            for row in rows:
                line = '\t'.join(str(col) for col in row) + '\n'
                file.write(line)
        print("Query results written to file: {}".format(filename))
    except pymysql.Error as error:
        print("Failed to execute the query: {}".format(error))

def delete_empty_lines(url_file):
    """Drop blank lines and any line that does not start with an http URL."""
    with open(url_file, "r") as file:
        lines = file.readlines()
    lines = [line for line in lines if line.strip() and line.startswith("http")]
    with open(url_file, "w") as file:
        file.write("".join(lines))

def main():
    conn = connect_to_mysql()
    if not conn:
        return
    execute_query_write_to_file(conn, query, url_file)
    conn.close()
    delete_empty_lines(url_file)

if __name__ == '__main__':
    main()
分类下载图片.py

import os
import asyncio
import aiohttp
import time
import download_urls

start = time.time()
url_file = download_urls.url_file
save_dir = './images/'
os.makedirs(save_dir, exist_ok=True)

async def download_image(session, url, code):
    """Download one image into a subfolder named after its product code."""
    try:
        async with session.get(url) as resp:
            # exist_ok=True avoids a race when several concurrent tasks
            # try to create the same product-code folder.
            os.makedirs(os.path.join(save_dir, code), exist_ok=True)
            dir_path, filename = os.path.split(url)
            download_path = os.path.join(save_dir, code, filename)
            with open(download_path, 'wb') as f:
                # Stream the response to disk in 1 MB chunks.
                while True:
                    chunk = await resp.content.read(1024 * 1024)
                    if not chunk:
                        break
                    f.write(chunk)
        print(f'{url} downloaded!')
    except Exception as e:
        print(f'{url} download failed! Error: {e}')

async def main():
    async with aiohttp.ClientSession() as session:
        with open(url_file, 'r') as f:
            lines = f.readlines()
        # Each line is "<url>\t<product code>".
        tasks = [download_image(session, line.strip().split("\t")[0], line.strip().split("\t")[1])
                 for line in lines]
        await asyncio.gather(*tasks)

# Refresh urls.txt from the database, then download everything.
download_urls.main()
asyncio.run(main())
print('Script finished, total time: {} seconds. You may close the window now!'.format(time.time() - start))
# Keep the console window open for 30 seconds so the message can be read.
time.sleep(30)
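Both download scripts launch one task per URL and hand them all to asyncio.gather at once, which can overwhelm the image server or exhaust local file handles when urls.txt is large. A minimal sketch of capping concurrency with an asyncio.Semaphore, assuming a limit of 50 (an arbitrary number, not something the original scripts use):

import asyncio
import aiohttp

async def download_with_limit(semaphore, session, url):
    # Only as many downloads as the semaphore allows run at once;
    # the remaining tasks wait here until a slot frees up.
    async with semaphore:
        async with session.get(url) as resp:
            return await resp.read()

async def main(urls):
    semaphore = asyncio.Semaphore(50)  # hypothetical cap; tune to your environment
    async with aiohttp.ClientSession() as session:
        tasks = [download_with_limit(semaphore, session, url) for url in urls]
        return await asyncio.gather(*tasks)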
不分类下载图片.py

import asyncio
import os
import time
import aiohttp
import download_urls

start = time.time()
url_file = download_urls.url_file
save_dir = './images/'
os.makedirs(save_dir, exist_ok=True)

async def download_image(session, url):
    """Download one image straight into save_dir."""
    try:
        async with session.get(url) as resp:
            dir_path, filename = os.path.split(url)
            save_path = os.path.join(save_dir, filename)
            with open(save_path, 'wb') as f:
                # Stream the response to disk in 1 MB chunks.
                while True:
                    chunk = await resp.content.read(1024 * 1024)
                    if not chunk:
                        break
                    f.write(chunk)
        print(f'{url} downloaded!')
    except Exception as e:
        print(f'{url} download failed! Error: {e}')

async def main():
    async with aiohttp.ClientSession() as session:
        with open(url_file, 'r') as f:
            lines = f.readlines()
        # Only the first tab-separated column (the URL) is needed here.
        tasks = [download_image(session, line.strip().split("\t")[0]) for line in lines]
        await asyncio.gather(*tasks)

# Refresh urls.txt from the database, then download everything.
download_urls.main()
asyncio.run(main())
print('Script finished, total time: {} seconds. You may close the window now!'.format(time.time() - start))
# Keep the console window open for 30 seconds so the message can be read.
time.sleep(30)
New requirement: download the images without classification, but each image comes with a corresponding JSON document; write each image's JSON into a txt file whose name matches the image's.
import os
import asyncio
import aiohttp
import time
import download_urls

start = time.time()
url_file = download_urls.url_file
save_dir = './images/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs('./labels/', exist_ok=True)

async def download_image(session, url, json_text):
    """Download one image and write its JSON label to ./labels/<name>.txt."""
    try:
        async with session.get(url) as resp:
            dir_path, file_name = os.path.split(url)
            save_path = os.path.join(save_dir, file_name)
            # Strip the image extension so the label file name
            # matches the image name, as the requirement asks.
            label_name = os.path.splitext(file_name)[0]
            with open("./labels/{}.txt".format(label_name), "w") as file:
                file.write(json_text)
            print("JSON written to file")
            with open(save_path, 'wb') as f:
                # Stream the response to disk in 1 MB chunks.
                while True:
                    chunk = await resp.content.read(1024 * 1024)
                    if not chunk:
                        break
                    f.write(chunk)
            print('Image downloaded')
    except Exception as e:
        print(f'{url} download failed! Error: {e}')

async def main():
    async with aiohttp.ClientSession() as session:
        with open(url_file, 'r') as f:
            lines = f.readlines()
        # Each line is "<url>\t<json>"; split only on the first tab
        # so any tabs inside the JSON text survive.
        tasks = [download_image(session, line.strip().split("\t", 1)[0], line.strip().split("\t", 1)[1])
                 for line in lines]
        await asyncio.gather(*tasks)

# Refresh urls.txt from the database, then download everything.
download_urls.main()
asyncio.run(main())
print('Script finished, total time: {} seconds. You may close the window now!'.format(time.time() - start))
# Keep the console window open for 30 seconds so the message can be read.
time.sleep(30)
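The script writes whatever string comes back from the database straight into the label file. If malformed rows are possible, it may be worth parsing the JSON before writing so that bad labels fail loudly; a small optional hardening sketch using the standard json module (not part of the original script):

import json

def write_label(path, json_text):
    # json.loads raises a ValueError on malformed input, so a bad
    # label aborts here instead of silently producing a broken file.
    data = json.loads(json_text)
    with open(path, "w") as f:
        json.dump(data, f, ensure_ascii=False)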