No folder separation: every image is saved into a single images/ directory.

import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

def download_image(img_url):
    # Only download images with a .jpg or .jpeg extension
    if img_url.lower().endswith(('.jpg', '.jpeg')):
        try:
            img_response = requests.get(img_url, stream=True, timeout=10)
            img_response.raise_for_status()
            img_size = int(img_response.headers.get('content-length', 0))
            if img_size > 50 * 1024:  # larger than 50 KB
                filename = os.path.join(images_dir, img_url.split('/')[-1])
                with open(filename, 'wb') as f:
                    for chunk in img_response.iter_content(1024):
                        f.write(chunk)
                print(f"Downloaded {img_url}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
    else:
        print(f"Skipped {img_url} due to file extension")

def download_images_from_page(url):
    page_response = requests.get(url, timeout=10)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    images = page_soup.find_all('img')
    with ThreadPoolExecutor(max_workers=5) as executor:  # adjust max_workers to change the thread count
        for img in images:
            img_url = img.get('src')
            if not img_url:
                continue  # skip <img> tags without a src attribute
            # Resolve relative src values against the page URL
            executor.submit(download_image, urljoin(url, img_url))

def main(base_url, start_path):
    global images_dir  # shared with download_image via module globals (threads see the same value)
    images_dir = 'images'
    os.makedirs(images_dir, exist_ok=True)

    start_url = f"{base_url}/{start_path}"
    response = requests.get(start_url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Thread links on the index page are wrapped in <h3><a href=...> tags
    links = soup.find_all('h3')
    for link in links:
        a_tag = link.find('a', href=True)
        if a_tag:
            # urljoin handles relative, root-relative, and absolute hrefs
            full_url = urljoin(start_url, a_tag['href'])
            download_images_from_page(full_url)

# Base URL and start path used in the example
base_url = 'http://xxxxxxx'
start_path = 'thread6.php?fid=15'

if __name__ == "__main__":
    main(base_url, start_path)
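
One caveat with the version above: the 50 KB filter trusts the Content-Length header, and some servers omit it or report it wrongly (the script then sees 0 and silently skips the file). A fallback is to count the bytes as they are written and delete the file afterwards if it turns out to be too small. A minimal sketch; the helper name download_image_checked is illustrative and not part of the original script:

import os
import requests

def download_image_checked(img_url, dest_path, min_bytes=50 * 1024):
    """Download img_url to dest_path; delete the file if fewer than min_bytes arrive."""
    written = 0
    with requests.get(img_url, stream=True, timeout=10) as resp:
        resp.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
                written += len(chunk)
    if written < min_bytes:
        os.remove(dest_path)  # too small: likely an icon or avatar, not a photo
        return False
    return True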

Organized by folder: images from each thread page go into a sub-folder named after the thread title.

import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ProcessPoolExecutor
from urllib.parse import urljoin
import re

def sanitize_folder_name(name):
    """Sanitize a folder name by replacing characters that are illegal on common file systems."""
    return re.sub(r'[\\/*?:"<>|]', '_', name)

def download_image(data):
    img_url, filename_prefix = data
    # Only download images with a .jpg or .jpeg extension
    if img_url.lower().endswith(('.jpg', '.jpeg')):
        try:
            img_response = requests.get(img_url, stream=True, timeout=10)
            img_response.raise_for_status()
            img_size = int(img_response.headers.get('content-length', 0))
            if img_size > 20 * 1024:  # larger than 20 KB
                filename = f"{filename_prefix}.jpg"
                with open(filename, 'wb') as f:
                    for chunk in img_response.iter_content(1024):
                        f.write(chunk)
                print(f"Downloaded {filename}")
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
    else:
        print(f"Skipped {img_url} due to file extension")

def download_images_from_page(url, base_dir):
    page_response = requests.get(url, timeout=10)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    images = page_soup.find_all('img')
    img_data = []
    for i, img in enumerate(images):
        img_url = img.get('src')
        if not img_url:
            continue  # skip <img> tags without a src attribute
        filename_prefix = os.path.join(base_dir, f"{i:04d}")
        # Resolve relative src values against the page URL
        img_data.append((urljoin(url, img_url), filename_prefix))

    with ProcessPoolExecutor(max_workers=4) as executor:  # adjust max_workers to change the process count
        executor.map(download_image, img_data)

def main(base_url, start_path):
    # images_dir stays local here: worker processes never read it, and module
    # globals set in the parent would not propagate to them anyway
    images_dir = 'images'
    os.makedirs(images_dir, exist_ok=True)

    start_url = f"{base_url}/{start_path}"
    response = requests.get(start_url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    # One sub-folder per thread, named after the sanitized thread title
    links = soup.find_all('h3')
    for link in links:
        a_tag = link.find('a', href=True)
        if a_tag:
            folder_name = sanitize_folder_name(a_tag.text.strip())
            full_url = urljoin(start_url, a_tag['href'])
            page_dir = os.path.join(images_dir, folder_name)
            os.makedirs(page_dir, exist_ok=True)
            download_images_from_page(full_url, page_dir)

# Base URL and start path used in the example
base_url = 'http://xxx/pw'
start_path = 'thread1022.php?fid=15&page=3'

if __name__ == "__main__":
    main(base_url, start_path)
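
A design note on the executor choice: image downloads are I/O-bound, so the ProcessPoolExecutor used here mainly adds process start-up cost and forces the (img_url, filename_prefix) tuples to be pickled, while the workers spend their time waiting on the network rather than using the CPU. The ThreadPoolExecutor from the first version is usually the better fit, and the executor.map structure carries over unchanged. A self-contained sketch; fake_download merely simulates the network wait and stands in for download_image:

from concurrent.futures import ThreadPoolExecutor
import time

def fake_download(data):
    # Stand-in for download_image: the worker just waits on (simulated) I/O
    time.sleep(0.1)
    return data

items = [(f"http://example.com/{i}.jpg", f"images/{i:04d}") for i in range(20)]

# Threads start cheaply and share memory, so nothing has to be pickled;
# with 8 workers the 20 simulated downloads finish in roughly 0.3 s instead of 2 s
with ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(fake_download, items))
print(f"fetched {len(results)} items")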
