Scraping Ozon Products with Botasaurus: A Step-by-Step Guide

12.01.2025

Introduction

This guide demonstrates how to scrape product data from Ozon, a leading Russian e-commerce site, using the Botasaurus library. Botasaurus is a good fit for Ozon because its anti-detection features help it get past the site’s anti-bot measures. We’ll create two scripts: one to collect product links and another to extract detailed product information.

Note: Ensure compliance with Ozon’s terms of service and use proxies responsibly.

Setup: .env File

Create a .env file to securely store proxy credentials (optional if not using proxies):

proxy_username=YOUR_DATA
proxy_password=YOUR_DATA
proxy_server_address=YOUR_DATA
proxy_server_port=YOUR_DATA

Load these variables in your scripts using dotenv for secure access.
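The scripts below also depend on a handful of third-party packages. A likely install command (package names as published on PyPI; openpyxl is needed for the Excel export in collect.py):

pip install botasaurus beautifulsoup4 python-dotenv pandas tqdm openpyxl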


Script 1: main.py (Collect Product Links)

This script collects product links from a specified Ozon brand page, saving them to product_links.txt.

import os
import time
from bs4 import BeautifulSoup
from botasaurus.browser import browser, Driver, Wait
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Proxy details (optional)
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}' if proxy_username else None

# Directories for output
base_dir = os.path.dirname(__file__)
html_dir = os.path.join(base_dir, 'html')
json_dir = os.path.join(base_dir, 'json')
os.makedirs(html_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

# Base URL for scraping
base_url = 'https://www.ozon.ru/brand/khadas-100160406/'

@browser(
    block_images=True,  # Speed up loading by blocking images
    window_size=WindowSize.window_size_1280_720,
    proxy=http_proxy_url,
    headless=True  # Run in headless mode for efficiency
)
def scrape_ozon(driver: Driver, data=None):
    page_number = 1
    max_pages = 10  # Adjust as needed
    product_links = set()

    for _ in range(max_pages):
        url = base_url if page_number == 1 else f"{base_url}?page={3 * (page_number - 1) + 1}"
        print(f"Scraping: {url}")

        try:
            driver.get(url, wait=Wait.LONG)
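            # Scroll to the footer (site-specific selector) so lazily loaded product cards render before the HTML is captured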
            driver.select("#layoutPage > div.b6 > footer").scroll_into_view()
            html_content = driver.page_html

            # Save HTML for debugging
            with open(os.path.join(html_dir, f'page_{page_number}.html'), 'w', encoding='utf-8') as f:
                f.write(html_content)

            # Parse links
            soup = BeautifulSoup(html_content, 'html.parser')
            for tag in soup.find_all('a', href=True):
                link = tag['href']
                if link.startswith('/product/') and '/?advert' not in link and '/?avtc' not in link:
                    product_links.add(f"https://www.ozon.ru{link}")

            page_number += 1
            time.sleep(2)  # Rate limiting
        except Exception as e:
            print(f"Error loading {url}: {e}")
            continue

    # Save unique links
    links_file_path = os.path.join(base_dir, 'product_links.txt')
    try:
        with open(links_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(product_links))
        print(f"Saved {len(product_links)} links to {links_file_path}")
    except Exception as e:
        print(f"Error saving links: {e}")

if __name__ == "__main__":
    scrape_ozon()

Features: Blocks images, uses proxies, scrolls to load dynamic content, and filters out ads.
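To make the ad filtering concrete, here is a tiny standalone check of the same condition used above. The sample hrefs are hypothetical and only illustrate which patterns are kept or dropped:

samples = [
    "/product/khadas-tone2-pro-123456/",           # organic product link: kept
    "/product/khadas-tea-789012/?advert=xyz",      # sponsored listing: dropped
    "/product/khadas-vim4-345678/?avtc=1&avtm=2",  # ad-tracked listing: dropped
    "/brand/khadas-100160406/",                    # not a product page: dropped
]
kept = [f"https://www.ozon.ru{h}" for h in samples
        if h.startswith('/product/') and '/?advert' not in h and '/?avtc' not in h]
print(kept)  # only the first sample survives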

main.py screenshot

Script 2: collect.py (Extract Product Details)

This script reads links from product_links.txt and extracts detailed product info, saving it to an Excel file.

import re
import json
import os
import pandas as pd
from tqdm import tqdm
from botasaurus.browser import browser, Driver, Wait
from botasaurus.soupify import soupify
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Proxy details (optional)
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}' if proxy_username else None

# Directories
base_dir = os.path.dirname(__file__)
json_dir = os.path.join(base_dir, 'json')
os.makedirs(json_dir, exist_ok=True)

def remove_wc_prefix(url):
    # Strip size-variant path segments such as /wc1000/ so media URLs point to the original files
    return re.sub(r'/w[wc]\d+/', '/', url)

def extract_characteristics(soup):
    # The short characteristics block embeds its data as JSON in the div's data-state attribute
    block = soup.find("div", id=lambda x: x and x.startswith("state-webShortCharacteristics-"))
    if block and block.get('data-state'):
        data = json.loads(block['data-state'])
        return "\n".join(f"{char['title']['textRs'][0]['content']}: {char['values'][0]['text']}" 
                         for char in data.get("characteristics", []) if char.get("title") and char.get("values"))
    return ""

def extract_media_links(soup):
    media_links = set()
    gallery = soup.find("div", {"data-widget": "webGallery"})
    if gallery:
        for img in gallery.find_all("img"):
            if src := img.get("src"):
                media_links.add(remove_wc_prefix(src))
        for video in gallery.find_all("video"):
            if src := video.get("src"):
                media_links.add(remove_wc_prefix(src))
    return list(media_links)

def extract_seller_info(soup):
    element = soup.find("div", id=lambda x: x and x.startswith("state-webStickyProducts-"))
    if element and element.get('data-state'):
        data = json.loads(element['data-state'])
        seller = data.get('seller', {})
        return f"https://www.ozon.ru{seller.get('link', '').replace('\\', '')}", seller.get('name', '')
    return None, None

def extract_product_info(driver, url):
    driver.get(url, wait=Wait.LONG)
    soup = soupify(driver)

    heading = soup.find("h1").text.strip() if soup.find("h1") else "No Title"
    characteristics = extract_characteristics(soup)
    media_links = extract_media_links(soup)
    seller_link, seller_name = extract_seller_info(soup)

    price_element = soup.find("div", id=lambda x: x and x.startswith("state-webPrice-"))
    card_price = re.sub(r"\D", "", json.loads(price_element['data-state']).get('cardPrice', '0')) if price_element and price_element.get('data-state') else None

    json_data = soup.find("script", type="application/ld+json")
    if json_data:
        data = json.loads(json_data.string)
        offers = data.get("offers", {})
        return {
            'sku': data.get("sku"),
            'heading': heading,
            'rating': data.get("aggregateRating", {}).get("ratingValue"),
            'review_count': data.get("aggregateRating", {}).get("reviewCount"),
            'brand': data.get("brand"),
            'description': data.get("description"),
            'price': offers.get("price"),
            'card_price': card_price,
            'currency': offers.get("priceCurrency"),
            'url': offers.get("url"),
            'characteristics': characteristics,
            'media_links': media_links,
            'seller_name': seller_name,
            'seller_link': seller_link
        }
    return {'heading': heading, 'url': url}

@browser(
    proxy=http_proxy_url,
    window_size=WindowSize.window_size_1280_720,
    wait_for_complete_page_load=False,
    headless=True
)
def main(driver: Driver, data=None):
    links_file = os.path.join(base_dir, 'product_links.txt')
    with open(links_file, 'r', encoding='utf-8') as f:
        links = [line.strip() for line in f if line.strip()]

    print(f"Processing {len(links)} links")
    products = []

    for url in tqdm(links, desc="Scraping products"):
        try:
            product_info = extract_product_info(driver, url)
            products.append(product_info)
            print(f"Collected: {product_info.get('heading')}")
        except Exception as e:
            print(f"Error processing {url}: {e}")

    df = pd.DataFrame(products)
    excel_path = os.path.join(base_dir, 'ozon_products.xlsx')
    df.to_excel(excel_path, index=False)
    print(f"Saved to {excel_path}")

if __name__ == "__main__":
    main()

Features: Extracts SKU, price, media, seller info, and more, with progress tracking via tqdm.
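After a run, a quick sanity check of the spreadsheet confirms the key fields were captured. This is a minimal sketch using pandas; the column names match the keys returned by extract_product_info, though products that fell back to the minimal record will show empty cells:

import pandas as pd

df = pd.read_excel("ozon_products.xlsx")
print(df[["sku", "heading", "price", "card_price", "seller_name"]].head())
print(f"{len(df)} products, {df['price'].isna().sum()} without a price")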

collect.py screenshot

Conclusion

Using Botasaurus, we’ve built a robust Ozon scraping workflow that collects product links and detailed product data. The first script (main.py) gathers links efficiently, while the second (collect.py) extracts rich product details into an Excel file. Thanks to Botasaurus’s anti-detection and proxy support, this approach is well suited to dynamic e-commerce sites like Ozon in 2025.
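To reproduce the workflow end to end with the files named as above, run the scripts in order:

python main.py      # builds product_links.txt
python collect.py   # reads the links and writes ozon_products.xlsx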

Attachments

Demo file of Ozon products scraping (xls, 14 KB)