Scraping Ozon Products with Botasaurus: A Step-by-Step Guide
Introduction
This guide demonstrates how to scrape product data from Ozon, a leading Russian e-commerce site, using the Botasaurus library. Botasaurus is well suited to scraping Ozon because its anti-detection features help bypass anti-bot measures. We’ll create two scripts: one to collect product links and another to extract detailed product information.
Note: Ensure compliance with Ozon’s terms of service and use proxies responsibly.
Setup: .env File
Create a .env file to securely store proxy credentials (optional if not using proxies):
proxy_username=YOUR_DATA
proxy_password=YOUR_DATA
proxy_server_address=YOUR_DATA
proxy_server_port=YOUR_DATA
Load these variables in your scripts using python-dotenv for secure access.
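For reference, here is a minimal sketch of the loading pattern both scripts below use, with the dependencies this stack needs (openpyxl is required by pandas for the Excel export in Script 2):

# pip install botasaurus beautifulsoup4 python-dotenv pandas tqdm openpyxl
import os
from dotenv import load_dotenv

load_dotenv()  # Reads the .env file in the working directory

# Build a proxy URL only if credentials are present
proxy_username = os.getenv("proxy_username")
http_proxy_url = (
    f'http://{proxy_username}:{os.getenv("proxy_password")}'
    f'@{os.getenv("proxy_server_address")}:{os.getenv("proxy_server_port")}'
    if proxy_username else None
)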
Script 1: main.py (Collect Product Links)
This script collects product links from a specified Ozon brand page and saves them to product_links.txt.
import os
import time
from bs4 import BeautifulSoup
from botasaurus.browser import browser, Driver, Wait
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Proxy details (optional)
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}' if proxy_username else None

# Directories for output
base_dir = os.path.dirname(__file__)
html_dir = os.path.join(base_dir, 'html')
json_dir = os.path.join(base_dir, 'json')
os.makedirs(html_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

# Base URL for scraping
base_url = 'https://www.ozon.ru/brand/khadas-100160406/'

@browser(
    block_images=True,  # Speed up loading by blocking images
    window_size=WindowSize.window_size_1280_720,
    proxy=http_proxy_url,
    headless=True  # Run in headless mode for efficiency
)
def scrape_ozon(driver: Driver, data=None):
    page_number = 1
    max_pages = 10  # Adjust as needed
    product_links = set()
    for _ in range(max_pages):
        # The page parameter advances in steps of 3: after scrolling,
        # each view appears to cover several result pages
        url = base_url if page_number == 1 else f"{base_url}?page={3 * (page_number - 1) + 1}"
        print(f"Scraping: {url}")
        try:
            driver.get(url, wait=Wait.LONG)
            # Scroll to the footer to trigger lazy-loaded products
            driver.select("#layoutPage > div.b6 > footer").scroll_into_view()
            html_content = driver.page_html
            # Save HTML for debugging
            with open(os.path.join(html_dir, f'page_{page_number}.html'), 'w', encoding='utf-8') as f:
                f.write(html_content)
            # Parse links
            soup = BeautifulSoup(html_content, 'html.parser')
            for tag in soup.find_all('a', href=True):
                link = tag['href']
                if link.startswith('/product/') and '/?advert' not in link and '/?avtc' not in link:
                    product_links.add(f"https://www.ozon.ru{link}")
            page_number += 1
            time.sleep(2)  # Rate limiting
        except Exception as e:
            print(f"Error loading {url}: {e}")
            continue

    # Save unique links
    links_file_path = os.path.join(base_dir, 'product_links.txt')
    try:
        with open(links_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(product_links))
        print(f"Saved {len(product_links)} links to {links_file_path}")
    except Exception as e:
        print(f"Error saving links: {e}")

if __name__ == "__main__":
    scrape_ozon()
Features: Blocks images for faster loading, supports proxies, scrolls to trigger dynamic content, and filters out sponsored/ad links.
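After a run, you can sanity-check the collected links before moving on to the next step. A minimal sketch, assuming the script above has produced product_links.txt in the same directory:

# Quick sanity check of the collected links (run after main.py)
with open('product_links.txt', encoding='utf-8') as f:
    links = [line.strip() for line in f if line.strip()]

print(f"{len(links)} links collected")
# Every entry should be an absolute Ozon product URL
assert all(link.startswith('https://www.ozon.ru/product/') for link in links)
print(links[:3])  # Peek at a few entries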
Script 2: collect.py (Extract Product Details)
This script reads links from product_links.txt and extracts detailed product info, saving it to an Excel file.
import re
import json
import os
import pandas as pd
from tqdm import tqdm
from botasaurus.browser import browser, Driver, Wait
from botasaurus.soupify import soupify
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Proxy details (optional)
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}' if proxy_username else None

# Directories
base_dir = os.path.dirname(__file__)
json_dir = os.path.join(base_dir, 'json')
os.makedirs(json_dir, exist_ok=True)

def remove_wc_prefix(url):
    # Strip the /wcNNN/ or /wwNNN/ size-variant segment so media URLs point at the original file
    return re.sub(r'/w[wc]\d+/', '/', url)

def extract_characteristics(soup):
    # Short characteristics are embedded as JSON in a data-state attribute
    block = soup.find("div", id=lambda x: x and x.startswith("state-webShortCharacteristics-"))
    if block and block.get('data-state'):
        data = json.loads(block['data-state'])
        return "\n".join(f"{char['title']['textRs'][0]['content']}: {char['values'][0]['text']}"
                         for char in data.get("characteristics", []) if char.get("title") and char.get("values"))
    return ""

def extract_media_links(soup):
    media_links = set()
    gallery = soup.find("div", {"data-widget": "webGallery"})
    if gallery:
        for img in gallery.find_all("img"):
            if src := img.get("src"):
                media_links.add(remove_wc_prefix(src))
        for video in gallery.find_all("video"):
            if src := video.get("src"):
                media_links.add(remove_wc_prefix(src))
    return list(media_links)

def extract_seller_info(soup):
    element = soup.find("div", id=lambda x: x and x.startswith("state-webStickyProducts-"))
    if element and element.get('data-state'):
        data = json.loads(element['data-state'])
        seller = data.get('seller', {})
        # Build the link outside the f-string: backslashes are not allowed
        # inside f-string expressions before Python 3.12
        seller_link = seller.get('link', '').replace('\\', '')
        return f"https://www.ozon.ru{seller_link}", seller.get('name', '')
    return None, None

def extract_product_info(driver, url):
    driver.get(url, wait=Wait.LONG)
    soup = soupify(driver)
    heading = soup.find("h1").text.strip() if soup.find("h1") else "No Title"
    characteristics = extract_characteristics(soup)
    media_links = extract_media_links(soup)
    seller_link, seller_name = extract_seller_info(soup)
    price_element = soup.find("div", id=lambda x: x and x.startswith("state-webPrice-"))
    card_price = re.sub(r"\D", "", json.loads(price_element['data-state']).get('cardPrice', '0')) if price_element and price_element.get('data-state') else None
    json_data = soup.find("script", type="application/ld+json")
    if json_data:
        data = json.loads(json_data.string)
        offers = data.get("offers", {})
        return {
            'sku': data.get("sku"),
            'heading': heading,
            'rating': data.get("aggregateRating", {}).get("ratingValue"),
            'review_count': data.get("aggregateRating", {}).get("reviewCount"),
            'brand': data.get("brand"),
            'description': data.get("description"),
            'price': offers.get("price"),
            'card_price': card_price,
            'currency': offers.get("priceCurrency"),
            'url': offers.get("url"),
            'characteristics': characteristics,
            'media_links': media_links,
            'seller_name': seller_name,
            'seller_link': seller_link
        }
    return {'heading': heading, 'url': url}

@browser(
    proxy=http_proxy_url,
    window_size=WindowSize.window_size_1280_720,
    wait_for_complete_page_load=False,
    headless=True
)
def main(driver: Driver, data=None):
    links_file = os.path.join(base_dir, 'product_links.txt')
    with open(links_file, 'r', encoding='utf-8') as f:
        links = [line.strip() for line in f if line.strip()]
    print(f"Processing {len(links)} links")

    products = []
    for url in tqdm(links, desc="Scraping products"):
        try:
            product_info = extract_product_info(driver, url)
            products.append(product_info)
            print(f"Collected: {product_info.get('heading')}")
        except Exception as e:
            print(f"Error processing {url}: {e}")

    df = pd.DataFrame(products)
    excel_path = os.path.join(base_dir, 'ozon_products.xlsx')
    df.to_excel(excel_path, index=False)
    print(f"Saved to {excel_path}")

if __name__ == "__main__":
    main()
Features: Extracts SKU, price, media, seller info, and more, with progress tracking via tqdm.
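Once the run finishes, the Excel output can be inspected directly with pandas. A minimal sketch (reading .xlsx files requires the openpyxl package, and the column names match the dictionary built in extract_product_info):

import pandas as pd

# Load the scraped products and take a quick look
df = pd.read_excel('ozon_products.xlsx')
print(df[['sku', 'heading', 'price', 'card_price', 'rating']].head())
print(f"{df['price'].isna().sum()} products missing a price")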
Conclusion
Using Botasaurus, we’ve built a robust solution for scraping Ozon: the first script (main.py) gathers product links efficiently, while the second (collect.py) extracts rich product details into an Excel file. Thanks to Botasaurus’s anti-detection and proxy support, this approach is well suited to dynamic e-commerce sites like Ozon.