在日常开发中,我们经常遇到需要同时运行多个浏览器实例的场景,例如多账号管理、并行自动化测试和分布式数据采集。
默认情况下,浏览器共享相同的缓存和Cookie,这会导致账号冲突、数据污染等问题。本文将深入探讨如何使用Python实现浏览器多开并确保缓存完全隔离。
浏览器缓存通常包括:HTTP 缓存、Cookie、LocalStorage/SessionStorage、IndexedDB 以及 Service Worker 缓存等。
import os
import tempfile
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor
class IsolatedBrowser:
    """A Chrome WebDriver session bound to its own ``--user-data-dir``.

    Every instance points Chrome at a separate profile directory, so the
    browser data stored there is not shared with other instances.
    """

    def __init__(self, user_data_dir=None, headless=False):
        """Prepare (but do not launch) an isolated browser.

        Args:
            user_data_dir: Profile directory for this instance. When falsy, a
                fresh temporary directory is created with ``tempfile.mkdtemp``.
            headless: Whether to start Chrome in headless mode.
        """
        self.user_data_dir = user_data_dir or tempfile.mkdtemp(prefix='chrome_profile_')
        self.options = self._create_options(headless)
        self.driver = None

    def _create_options(self, headless):
        """Build the Chrome ``Options`` used by :meth:`start`.

        Args:
            headless: When true, ``--headless`` and ``--disable-gpu`` are added.

        Returns:
            The configured ``Options`` object.
        """
        options = Options()
        # Core isolation: a dedicated profile directory per instance.
        options.add_argument(f'--user-data-dir={self.user_data_dir}')
        options.add_argument('--no-first-run')
        options.add_argument('--no-default-browser-check')
        # Reduce the visible automation markers.
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        if headless:
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
        return options

    def start(self):
        """Launch Chrome and return the WebDriver.

        Calling this again while a driver is already running returns the
        existing driver instead of launching (and leaking) a second browser.

        Returns:
            The running ``webdriver.Chrome`` instance.
        """
        if self.driver is not None:
            return self.driver
        self.driver = webdriver.Chrome(options=self.options)
        # NOTE(review): execute_script runs in the currently loaded document,
        # so this override presumably does not survive a navigation - confirm
        # whether a per-document injection is needed.
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        return self.driver

    def close(self, cleanup=False):
        """Quit the browser and optionally delete its profile directory.

        Safe to call repeatedly: the driver reference is cleared before
        ``quit()`` is invoked, so a second call does not quit twice.

        Args:
            cleanup: When true, remove ``self.user_data_dir`` after the browser
                has been asked to quit. Defaults to false, which leaves the
                profile on disk so it can be reused.
        """
        driver, self.driver = self.driver, None
        try:
            if driver:
                driver.quit()
        finally:
            if cleanup:
                shutil.rmtree(self.user_data_dir, ignore_errors=True)


import threading
import uuid
from typing import Dict, List
class BrowserManager:
    """Create, track and close multiple isolated browser instances."""

    def __init__(self, max_instances=5, base_profile_dir=None):
        """Set up the registry and make sure the profile root exists.

        Args:
            max_instances: Upper bound on simultaneously registered instances.
            base_profile_dir: Directory under which per-instance profiles are
                placed. Defaults to ``<cwd>/browser_profiles``.
        """
        self.max_instances = max_instances
        self.base_profile_dir = base_profile_dir or os.path.join(os.getcwd(), 'browser_profiles')
        self.active_instances: Dict[str, "IsolatedBrowser"] = {}
        self.lock = threading.Lock()
        os.makedirs(self.base_profile_dir, exist_ok=True)

    def create_instance(self, instance_id=None, headless=False, profile_name=None):
        """Start a new browser and register it.

        The whole operation runs under ``self.lock`` so the instance limit
        cannot be exceeded by concurrent callers.

        Args:
            instance_id: Key for the new instance. When falsy, the first eight
                characters of a random UUID are used.
            headless: Passed through to ``IsolatedBrowser``.
            profile_name: Name of a profile directory under
                ``base_profile_dir``. When falsy, ``instance_<instance_id>``
                is used.

        Returns:
            A ``(instance_id, driver)`` tuple.

        Raises:
            RuntimeError: The instance limit has been reached.
            ValueError: ``instance_id`` is already registered; overwriting the
                entry would leave the earlier browser running untracked.
        """
        with self.lock:
            if len(self.active_instances) >= self.max_instances:
                raise RuntimeError(f"达到最大实例数限制: {self.max_instances}")
            instance_id = instance_id or str(uuid.uuid4())[:8]
            if instance_id in self.active_instances:
                raise ValueError(f"instance_id already in use: {instance_id!r}")
            if profile_name:
                user_data_dir = os.path.join(self.base_profile_dir, profile_name)
                os.makedirs(user_data_dir, exist_ok=True)
            else:
                user_data_dir = os.path.join(self.base_profile_dir, f'instance_{instance_id}')
            browser = IsolatedBrowser(user_data_dir=user_data_dir, headless=headless)
            driver = browser.start()
            # Register only after a successful start so a failed launch
            # leaves no stale entry behind.
            self.active_instances[instance_id] = browser
            return instance_id, driver

    def get_instance(self, instance_id):
        """Return the driver registered under ``instance_id``, or ``None``."""
        browser = self.active_instances.get(instance_id)
        return browser.driver if browser else None

    def close_instance(self, instance_id, cleanup=False):
        """Close one instance and drop it from the registry.

        Unknown ids are ignored.

        Args:
            instance_id: Key of the instance to close.
            cleanup: When true, also delete the instance's profile directory.
        """
        with self.lock:
            browser = self.active_instances.pop(instance_id, None)
            if browser:
                browser.close()
                if cleanup and os.path.exists(browser.user_data_dir):
                    shutil.rmtree(browser.user_data_dir, ignore_errors=True)

    def close_all(self, cleanup=False):
        """Close every registered instance.

        Args:
            cleanup: Forwarded to :meth:`close_instance` for each instance.
        """
        # Iterate over a snapshot because close_instance mutates the registry.
        for instance_id in list(self.active_instances):
            self.close_instance(instance_id, cleanup=cleanup)


# docker_browser.py
import docker
import time
import requests
class DockerizedBrowser:
    """Run each Chrome inside its own Docker container."""

    def __init__(self):
        """Connect to the Docker daemon configured in the environment."""
        self.client = docker.from_env()
        self.containers = {}

    def start_chrome_container(self, container_name, vnc_port=5900, webdriver_port=4444):
        """Start a ``selenium/standalone-chrome`` container and wait for it.

        Args:
            container_name: Name given to the container; also used to derive
                the name of its data volume.
            vnc_port: Host port mapped to the container's ``5900/tcp``.
            webdriver_port: Host port mapped to the container's ``4444/tcp``.

        Returns:
            The remote WebDriver URL for the started container.

        Raises:
            TimeoutError: The WebDriver endpoint did not become ready in time.
        """
        # One named volume per container keeps downloads separate.
        volume_name = f"{container_name}_data"
        container = self.client.containers.run(
            'selenium/standalone-chrome',
            name=container_name,
            ports={
                '4444/tcp': webdriver_port,
                '5900/tcp': vnc_port,
            },
            volumes={volume_name: {'bind': '/home/seluser/Downloads', 'mode': 'rw'}},
            shm_size='2g',
            detach=True,
            remove=True,
            environment=[
                'SE_NODE_MAX_SESSIONS=1',
                'SE_NODE_OVERRIDE_MAX_SESSIONS=true',
                'SCREEN_WIDTH=1920',
                'SCREEN_HEIGHT=1080',
            ],
        )
        self.containers[container_name] = {
            'container': container,
            'webdriver_port': webdriver_port,
        }
        self._wait_for_container_ready(webdriver_port)
        return f'http://localhost:{webdriver_port}/wd/hub'

    def _wait_for_container_ready(self, port, timeout=30):
        """Poll the container's ``/status`` endpoint until it answers 200.

        Args:
            port: Host port on which the WebDriver service is published.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            ``True`` once the endpoint responds with HTTP 200.

        Raises:
            TimeoutError: No 200 response was received within ``timeout``.
        """
        # A monotonic clock is unaffected by wall-clock adjustments.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                # Bound each request so a hung connection cannot outlive the
                # overall deadline by an arbitrary amount.
                response = requests.get(f'http://localhost:{port}/status', timeout=2)
                if response.status_code == 200:
                    return True
            except requests.RequestException:
                # Service not reachable yet; retry after a short pause.
                pass
            time.sleep(1)
        raise TimeoutError("容器启动超时")


# fingerprint_isolation.py
import random
class FingerprintManager:
    """Helpers that vary the fingerprint a browser instance presents."""

    @staticmethod
    def get_random_user_agent():
        """Return one of the built-in User-Agent strings, picked at random."""
        candidates = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        )
        return random.choice(candidates)

    @staticmethod
    def configure_fingerprint_options(options, profile_id):
        """Add fingerprint-related switches and preferences to ``options``.

        Args:
            options: Object exposing ``add_argument`` and
                ``add_experimental_option``; it is modified in place.
            profile_id: Accepted but not used by this implementation.

        Returns:
            The same ``options`` object that was passed in.
        """
        user_agent = FingerprintManager.get_random_user_agent()
        window_size = random.choice(('1920,1080', '1366,768', '1536,864'))
        # NOTE(review): '--timezone' may not be a switch Chrome recognises -
        # confirm, or apply the timezone another way.
        switches = (
            f'--user-agent={user_agent}',
            '--lang=en-US',
            '--timezone=America/New_York',
            f'--window-size={window_size}',
        )
        for switch in switches:
            options.add_argument(switch)
        # WebRTC preferences intended to keep non-proxied UDP off.
        webrtc_prefs = dict([
            ("webrtc.ip_handling_policy", "disable_non_proxied_udp"),
            ("webrtc.multiple_routes_enabled", False),
            ("webrtc.nonproxied_udp_enabled", False),
        ])
        options.add_experimental_option("prefs", webrtc_prefs)
        return options


# multi_account_automation.py
from browser_manager import BrowserManager
import time
import json
class MultiAccountAutomator:
    """Drive one isolated browser per account, in parallel."""

    def __init__(self, accounts_config):
        """Create a manager sized to the number of accounts.

        Args:
            accounts_config: Sequence of account dicts. The methods below read
                the ``'id'``, ``'username'`` and ``'password'`` keys.
        """
        self.manager = BrowserManager(max_instances=len(accounts_config))
        self.accounts = accounts_config
        self.sessions = {}

    def run_parallel_sessions(self):
        """Log in and run the per-account workflow for every account.

        Each account is handled in its own worker thread. Browsers are left
        open afterwards; callers decide when to close them.

        Raises:
            Exception: Re-raises the first error raised by any worker, after
                all workers have finished.
        """
        import concurrent.futures

        # ThreadPoolExecutor rejects max_workers=0, so there is nothing to do.
        if not self.accounts:
            return

        def process_account(account):
            account_id = account['id']
            instance_id, driver = self.manager.create_instance(
                instance_id=account_id,
                profile_name=f"account_{account_id}"
            )
            self._login(driver, account)
            self._perform_actions(driver, account)
            self._save_session_state(account_id, driver)

        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.accounts)) as executor:
            futures = [executor.submit(process_account, account) for account in self.accounts]
        # Leaving the ``with`` block waits for all workers; asking for each
        # result surfaces worker errors instead of discarding them.
        for future in futures:
            future.result()

    def _login(self, driver, account):
        """Open the login page and submit the account's credentials."""
        driver.get("https://example.com/login")
        time.sleep(2)
        driver.find_element('name', 'username').send_keys(account['username'])
        driver.find_element('name', 'password').send_keys(account['password'])
        driver.find_element('tag name', 'form').submit()
        time.sleep(3)
        print(f"账号 {account['username']} 登录成功")

    def _perform_actions(self, driver, account):
        """Hook for account-specific work after login.

        The default implementation does nothing; subclasses override it.
        """

    def _save_session_state(self, account_id, driver):
        """Write the driver's cookies and a timestamp to a JSON file.

        The file is ``session_<account_id>.json`` in the working directory.
        """
        cookies = driver.get_cookies()
        with open(f'session_{account_id}.json', 'w', encoding='utf-8') as f:
            json.dump({
                'cookies': cookies,
                'timestamp': time.time()
            }, f)


# distributed_crawler.py
import redis
import hashlib
from queue import Queue
from threading import Thread
class DistributedCrawler:
    """Crawl URLs using a pool of isolated headless browsers."""

    def __init__(self, redis_host='localhost', redis_port=6379):
        """Create the Redis client and an empty browser pool.

        Args:
            redis_host: Host name of the Redis server.
            redis_port: Port of the Redis server.
        """
        self.redis = redis.StrictRedis(host=redis_host, port=redis_port, decode_responses=True)
        self.browser_pool = []
        self.task_queue = Queue()

    def create_browser_pool(self, pool_size=3):
        """Start ``pool_size`` headless browsers, each with its own profile.

        Args:
            pool_size: Number of browsers to start.
        """
        for i in range(pool_size):
            # A hash of the slot index gives each slot a stable profile name.
            profile_hash = hashlib.md5(f"crawler_{i}".encode()).hexdigest()[:16]
            manager = BrowserManager(max_instances=1)
            instance_id, driver = manager.create_instance(
                profile_name=f"crawler_{profile_hash}",
                headless=True
            )
            self.browser_pool.append({
                'id': instance_id,
                'driver': driver,
                'manager': manager,
                'busy': False
            })

    def _get_available_browser(self):
        """Return the first pool entry that is not busy, or ``None``."""
        return next((entry for entry in self.browser_pool if not entry['busy']), None)

    def assign_task(self, url, callback):
        """Crawl ``url`` on a free browser and pass the result to ``callback``.

        Args:
            url: Address to load.
            callback: Called with the result dict of the crawl.

        Returns:
            ``True`` when the task ran on a browser, ``False`` when no browser
            was free and the task was not executed.
        """
        browser = self._get_available_browser()
        if not browser:
            return False
        browser['busy'] = True
        try:
            result = self._crawl_with_browser(browser['driver'], url)
            callback(result)
        finally:
            # Always release the browser, even when crawling or the callback fails.
            browser['busy'] = False
        return True

    def _crawl_with_browser(self, driver, url):
        """Load ``url`` in ``driver`` and return its page source.

        Returns:
            A dict with the keys ``'url'``, ``'content'`` and ``'status'``.
        """
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        driver.get(url)
        time.sleep(2)
        content = driver.page_source
        return {
            'url': url,
            'content': content,
            'status': 'success'
        }


# resource_manager.py
import psutil
import logging
class BrowserResourceManager:
    """Check resource thresholds and terminate stray browser processes."""

    def __init__(self, max_memory_mb=2048, max_cpu_percent=80):
        """Store the thresholds used by :meth:`check_system_resources`.

        Args:
            max_memory_mb: Memory threshold in megabytes.
            max_cpu_percent: CPU threshold in percent.
        """
        self.max_memory_mb = max_memory_mb
        self.max_cpu_percent = max_cpu_percent
        self.logger = logging.getLogger(__name__)

    def check_system_resources(self):
        """Report whether memory and CPU usage are below the thresholds.

        Sampling the CPU blocks for one second (``interval=1``).

        Returns:
            ``True`` when both values are within limits, otherwise ``False``.
        """
        # NOTE(review): virtual_memory().used looks system-wide rather than
        # browser-only - confirm the threshold is meant for that figure.
        memory = psutil.virtual_memory()
        cpu_percent = psutil.cpu_percent(interval=1)
        memory_usage_mb = memory.used / 1024 / 1024
        self.logger.info(f"内存使用: {memory_usage_mb:.2f}MB, CPU使用: {cpu_percent}%")
        if memory_usage_mb > self.max_memory_mb:
            self.logger.warning("内存使用超过阈值")
            return False
        if cpu_percent > self.max_cpu_percent:
            self.logger.warning("CPU使用超过阈值")
            return False
        return True

    def cleanup_orphan_processes(self):
        """Terminate Chrome processes that no Python process is parenting.

        A process is terminated when its name contains ``chrome``, its command
        line has a ``--user-data-dir`` argument, and it has a parent whose
        name does not contain ``python``. Processes that vanish or deny access
        while being inspected are skipped.
        """
        # NOTE(review): a Chrome started through a driver binary presumably has
        # that binary, not Python, as its parent - confirm this heuristic does
        # not terminate live sessions.
        for proc in psutil.process_iter(['name', 'cmdline']):
            try:
                # Either value may be None when it could not be read.
                name = proc.info['name'] or ''
                if 'chrome' not in name.lower():
                    continue
                cmdline = proc.info['cmdline'] or []
                if not any('--user-data-dir' in arg for arg in cmdline):
                    continue
                parent = proc.parent()
                if parent and 'python' not in parent.name().lower():
                    proc.terminate()
                    self.logger.info(f"终止孤立进程: {proc.pid}")
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass


# config_templates.py
from dataclasses import dataclass
from typing import Optional, List
@dataclass
class BrowserConfig:
    """Declarative settings for one browser profile."""

    # Profile directory; the only field without a default.
    user_data_dir: str
    headless: bool = False
    proxy: Optional[str] = None
    user_agent: Optional[str] = None
    # Width and height joined by a comma.
    window_size: str = "1920,1080"
    disable_images: bool = False
    disable_js: bool = False

    @classmethod
    def create_crawler_config(cls, profile_name: str):
        """Return a headless, image-free configuration for crawling.

        Args:
            profile_name: Suffix used to build the profile directory path.
        """
        settings = {
            "user_data_dir": f"./profiles/crawler_{profile_name}",
            "headless": True,
            "disable_images": True,
            "user_agent": "Mozilla/5.0 (compatible; CrawlerBot/1.0)",
        }
        return cls(**settings)

    @classmethod
    def create_testing_config(cls, profile_name: str):
        """Return a visible-window configuration for testing.

        Args:
            profile_name: Suffix used to build the profile directory path.
        """
        profile_dir = f"./profiles/test_{profile_name}"
        return cls(profile_dir, headless=False, window_size="1366,768")


# Solution: allocate ports dynamically
import socket
def find_free_port():
    """Ask the operating system for an unused TCP port and return its number."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with probe:
        # Binding to port 0 lets the OS choose the port.
        probe.bind(('', 0))
        address = probe.getsockname()
        return address[1]


# Solution: restart periodically and monitor
class BrowserSession:
    """Count operations so a long-running browser can be recycled."""

    def __init__(self, max_operations=100):
        """Start counting from zero.

        Args:
            max_operations: Operation count at which a restart is due.
        """
        self.max_operations = max_operations
        self.operation_count = 0

    def should_restart(self):
        """Record one more operation and report whether the limit is reached.

        Every call increments the counter, so this is not a pure query.

        Returns:
            ``True`` once the counter has reached ``max_operations``.
        """
        performed = self.operation_count + 1
        self.operation_count = performed
        limit_reached = performed >= self.max_operations
        return limit_reached


# Solution: simulate human behaviour and diversify fingerprints
def simulate_human_behavior(driver):
    """Perform a few random mouse moves and a random scroll, then pause.

    Args:
        driver: Selenium WebDriver controlling the page to interact with.
    """
    import random
    import time

    # Imported here because this snippet has no module-level import for it;
    # without it the call below fails with NameError.
    from selenium.webdriver.common.action_chains import ActionChains

    # Queue between two and five relative mouse moves, then run them together.
    action = ActionChains(driver)
    for _ in range(random.randint(2, 5)):
        x = random.randint(0, 100)
        y = random.randint(0, 100)
        action.move_by_offset(x, y)
    action.perform()
    # Scroll down by a random amount.
    driver.execute_script(f"window.scrollBy(0, {random.randint(100, 500)})")
    time.sleep(random.uniform(0.5, 2))


# In this article we have examined in detail several approaches to isolating
# browser caches when running multiple browsers from Python:
关键要点:
- 使用 `--user-data-dir` 为每个浏览器实例指定独立的用户数据目录。
缓存隔离不仅是技术问题,更是系统设计问题。合理的隔离策略可以显著提高系统的稳定性、安全性和可维护性。
可以在GitHub获取完整示例代码:[示例仓库链接]
注意:在实际使用中,请遵守网站的服务条款,合理使用自动化工具,避免对目标服务器造成过大压力。