Scraping Ozon products

12.01.2025

In this project we collect information about products on Ozon, a marketplace popular in Russia and beyond. We will create two separate scripts: the first collects the product links returned for a given request (a brand or catalog page), and the second extracts information from the individual product pages. We use the botasaurus library for scraping, as it helps bypass anti-bot blocking.
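
Before running the scripts you will need the libraries used below. A minimal install sketch, assuming the standard PyPI package names (botasaurus, beautifulsoup4, python-dotenv, pandas, openpyxl, tqdm):

pip install botasaurus beautifulsoup4 python-dotenv pandas openpyxl tqdm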

First of all, let’s prepare the ‘.env’ file. It is needed to load confidential data securely, such as the proxy address, login and password.

Sample code for this file:

proxy_username = 'YOUR_DATA'
proxy_password = 'YOUR_DATA'
proxy_server_address = 'YOUR_DATA'
proxy_server_port = 'YOUR_DATA'

Of course, if you are not using a proxy, you will not need this file at all, since there are no credentials to load.
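
In that case the @browser decorator can simply be used without the proxy argument. A minimal sketch with the same parameters as in ‘main.py’ below, just with the proxy omitted:

@browser(block_images=True, window_size=WindowSize.window_size_1280_720)
def scrape_ozon(driver: Driver, data=None):
    ...  # same function body as shown below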


File main.py

Create the ‘main.py’ file and import the required Python libraries. We also configure the proxy here, if one is used.

import os
import time
import json
from bs4 import BeautifulSoup
from botasaurus.browser import browser, Driver, Wait
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# PROXY details
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}'

Next, specify the folders where the collected files will be stored and create them if they do not exist.

# Folder for HTML and JSON files
base_dir = os.path.dirname(__file__)
html_dir = os.path.join(base_dir, 'html')
json_dir = os.path.join(base_dir, 'json')

# Create folders if they do not exist
os.makedirs(html_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

Now we specify the Ozon URL from which we plan to collect information.

# Basic request URL without page parameter
base_url = 'https://www.ozon.ru/brand/khadas-100160406/'

Next comes the main parser function. In the decorator you can set the browser window size, and in the function body the maximum number of pages to crawl. Unnecessary results, such as advertising links, are filtered out automatically.

# The @browser decorator creates and manages the browser driver
@browser(block_images=True, window_size=WindowSize.window_size_1280_720, proxy=http_proxy_url)
def scrape_ozon(driver: Driver, data=None):
    page_number = 1
    max_page = 10
    product_links = set()

    for i in range(max_page):
        if page_number == 1:
            request = base_url
        else:
            page_param = 3 * (page_number - 1) + 1
            # base_url has no query string yet, so the first parameter is added with '?'
            request = f"{base_url}?page={page_param}"

        print(f"Collecting data from {request}")

        try:
            response = driver.get(request, wait=Wait.LONG)
            # Scroll to the footer so that lazy-loaded items are rendered
            driver.select("#layoutPage > div.b6 > footer").scroll_into_view()
            html_content = driver.page_html
        except Exception as e:
            print(f"Page load error {request}: {e}")
            continue

        # Save the raw HTML of each listing page
        with open(os.path.join(html_dir, f'page_{page_number}.html'), 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Collect product links, skipping advertising links
        soup = BeautifulSoup(html_content, 'html.parser')
        anchor_tags = soup.find_all('a', href=True)
        for tag in anchor_tags:
            link = tag['href']
            if link.startswith('/product/') and '/?advert' not in link and '/?avtc' not in link:
                product_links.add(link)

        page_number += 1
        time.sleep(2)

    # Save the unique product links to a text file
    links_file_path = os.path.join(base_dir, 'product_links.txt')
    try:
        with open(links_file_path, 'w', encoding='utf-8') as links_file:
            for link in product_links:
                links_file.write(f"https://www.ozon.ru{link}\n")
        print(f"Product links are saved in: {links_file_path}")
        print(f"Number of unique links: {len(product_links)}")
    except Exception as e:
        print(f"File write error {links_file_path}: {e}")

Now we simply run the script. When it finishes, the collected product links will be stored in the ‘product_links.txt’ file.


# Function start
scrape_ozon()
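
If you want to quickly check how many links were collected, the file can be read back in a couple of lines. A small sketch, assuming the same ‘product_links.txt’ location next to the script:

with open('product_links.txt', encoding='utf-8') as f:
    links = [line.strip() for line in f if line.strip()]
print(f"{len(links)} links collected")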

File collect.py

The second file collects information from the individual product pages rather than from the listing pages.


import re
import json
import os
import time

import pandas as pd
from tqdm import tqdm
from botasaurus.browser import browser, Driver, Wait
from botasaurus.soupify import soupify
from botasaurus.window_size import WindowSize
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# PROXY details
proxy_username = os.getenv("proxy_username")
proxy_password = os.getenv("proxy_password")
proxy_server_address = os.getenv("proxy_server_address")
proxy_server_port = os.getenv("proxy_server_port")
http_proxy_url = f'http://{proxy_username}:{proxy_password}@{proxy_server_address}:{proxy_server_port}'

base_dir = os.path.dirname(__file__)
json_dir = os.path.join(base_dir, 'json')
os.makedirs(json_dir, exist_ok=True)

def remove_wc_prefix(url):
    # Strip the size prefix (e.g. /wc1000/) from image and video URLs
    return re.sub(r'/w[wc]\d+/', '/', url)

def extract_characteristics(page_soup):
    characteristics = []
    block = page_soup.find("div", id=lambda x: x and x.startswith("state-webShortCharacteristics-"))

    if block and block.get('data-state'):
        data = json.loads(block['data-state'])
        for char in data.get("characteristics", []):
            title = char.get("title", {}).get("textRs", [{}])[0].get("content", "")
            values = char.get("values", [{}])[0].get("text", "")
            if title and values:
                characteristics.append(f"{title}: {values}")
    return "\n".join(characteristics)

def extract_media_links(page_soup):
    media_links = set()
    gallery_block = page_soup.find("div", {"data-widget": "webGallery"})
    if gallery_block:
        for img in gallery_block.find_all("img"):
            media_links.add(remove_wc_prefix(img.get("src", "")))
        for video in gallery_block.find_all("video"):
            media_links.add(remove_wc_prefix(video.get("src", "")))
    return list(media_links)

def extract_seller_info(page_soup):
    element = page_soup.find("div", id=lambda x: x and x.startswith("state-webStickyProducts-"))
    if element and element.get('data-state'):
        data = json.loads(element['data-state'])
        raw_link = data.get('seller', {}).get('link', '').replace('\\', '')
        link = f"https://www.ozon.ru{raw_link}"
        name = data.get('seller', {}).get('name', '')
        return link, name
    return None, None

def extract_product_info(driver, url):
    driver.get(url, wait=Wait.LONG)
    time.sleep(5)
    page_soup = soupify(driver)

    heading = page_soup.find("h1").text.strip() if page_soup.find("h1") else "No title"
    characteristics = extract_characteristics(page_soup)
    media_links = extract_media_links(page_soup)
    seller_link, seller_name = extract_seller_info(page_soup)

    # Price with the Ozon card
    price_element = page_soup.find("div", id=lambda x: x and x.startswith("state-webPrice-"))
    card_price = None
    if price_element and price_element.get('data-state'):
        data = json.loads(price_element['data-state'])
        card_price = re.sub(r"\D", "", data.get('cardPrice', '0'))  # Strip non-digit characters

    # Product JSON-LD data
    json_data = page_soup.find("script", type="application/ld+json")
    if json_data:
        json_content = json.loads(json_data.string)
        offers = json_content.get("offers", {})
        return {
            'sku': json_content.get("sku"),
            'heading': heading,
            'rating': json_content.get("aggregateRating", {}).get("ratingValue"),
            'review_count': json_content.get("aggregateRating", {}).get("reviewCount"),
            'brand': json_content.get("brand"),
            'description': json_content.get("description"),
            'price': offers.get("price"),
            'cardPrice': card_price,
            'currency': offers.get("priceCurrency"),
            'url': offers.get("url"),
            'characteristics': characteristics,
            'media_links': media_links,
            'seller_name': seller_name,
            'seller_link': seller_link
        }
    return {}

@browser(proxy=http_proxy_url, window_size=WindowSize.window_size_1280_720, wait_for_complete_page_load=False)
def main(driver: Driver, data=None):
    links_file_path = os.path.join(base_dir, 'product_links.txt')
    with open(links_file_path, 'r', encoding='utf-8') as file:
        links = [line.strip() for line in file if line.strip()]

    print(f"Processing {len(links)} links.")
    processed_links = []

    for url in tqdm(links, desc="Link processing"):
        try:
            product_info = extract_product_info(driver, url)
        except Exception as e:
            print(f"Processing error {url}: {e}")
            continue
        print(f"Information has been gathered for: {product_info.get('heading')}")
        processed_links.append(product_info)

    # Export the collected data to Excel
    df = pd.DataFrame(processed_links)
    excel_path = os.path.join(base_dir, 'processed_links_ozon.xlsx')
    df.to_excel(excel_path, index=False)
    print(f"Exported to Excel: {excel_path}")

if __name__ == '__main__':
    main()

After this second run finishes, you get a finished Excel file with all the collected product data.
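
To quickly check the result, the exported file can be loaded back with pandas. A minimal sketch, assuming the file name used above (‘processed_links_ozon.xlsx’):

import pandas as pd

df = pd.read_excel('processed_links_ozon.xlsx')
print(df.shape)  # number of products and columns
print(df[['sku', 'heading', 'price', 'cardPrice']].head())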

Attachments

Demo file (xls, 14 KB): demo file of scraping Ozon products.