Python Web Scraping | BeautifulSoup and Selenium Explained

Python Web Scraping | BeautifulSoup and Selenium Explained

Key points of this article

Step-by-step guide to collecting data from the web with Python: HTTP requests, HTML parsing, dynamic pages with Selenium, and responsible scraping habits.

Introduction

“Collect data from the web”

Web scraping is the technique of automatically extracting data from websites.


1. requests basics

Fetching HTML

import requests

# GET request
# Always pass a timeout: without one, requests can block forever
# waiting on an unresponsive server.
response = requests.get('https://example.com', timeout=10)

print(response.status_code)  # 200
print(response.text)  # HTML body
print(response.headers)  # Response headers

# Custom User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get('https://example.com', headers=headers, timeout=10)

2. BeautifulSoup

Parsing HTML

from bs4 import BeautifulSoup
import requests

url = 'https://example.com'
# Timeout guards against an unresponsive server hanging the script.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Single tag
title = soup.find('title')
if title is not None:  # find() returns None when the tag is absent
    print(title.text)

# Multiple tags
links = soup.find_all('a')
for link in links:
    print(link.get('href'))  # .get() returns None for <a> without href

# CSS selectors
articles = soup.select('.article-title')
for article in articles:
    print(article.text)

Example: news headlines

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_news(url):
    """Collect news titles, links, and dates from a listing page.

    Args:
        url: URL of a news listing page containing `.news-item` entries,
            each with `.title`, an `<a>` link, and a `.date` element.

    Returns:
        pandas.DataFrame with columns: title, link, date. Empty if no
        well-formed items were found.

    Raises:
        requests.exceptions.RequestException: on network failure,
            timeout, or an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }

    # Timeout prevents the call from hanging forever on a dead server;
    # raise_for_status surfaces 4xx/5xx instead of parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []

    for item in soup.select('.news-item'):
        title_el = item.select_one('.title')
        link_el = item.select_one('a')
        date_el = item.select_one('.date')

        # Skip malformed entries instead of crashing with AttributeError
        # when a sub-element is missing (select_one returns None).
        if title_el is None or link_el is None or date_el is None:
            continue

        articles.append({
            'title': title_el.text.strip(),
            'link': link_el.get('href', ''),
            'date': date_el.text.strip(),
        })

    return pd.DataFrame(articles)

# Usage
# NOTE: runs at import time and performs a live HTTP request.
df = scrape_news('https://news.example.com')
# utf-8-sig writes a BOM so Excel detects the encoding of Korean text.
df.to_csv('news.csv', index=False, encoding='utf-8-sig')

3. Selenium (dynamic pages)

Install

pip install selenium

Basic usage

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()

try:
    driver.get('https://example.com')

    # Block until the main content container is present in the DOM
    # (gives JavaScript up to 10 seconds to render it).
    wait = WebDriverWait(driver, 10)
    element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'content'))
    )

    # Read the page heading.
    heading = driver.find_element(By.TAG_NAME, 'h1')
    print(heading.text)

    # Trigger the "load more" control, then scroll to the bottom so
    # lazily-loaded content gets fetched.
    driver.find_element(By.ID, 'load-more').click()
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

finally:
    # Always shut the browser down, even when a step above fails.
    driver.quit()

4. Real-world example

Price monitoring

import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

def check_price(url, target_price):
    """Read the current product price from a page and compare to a target.

    Args:
        url: Product page URL (expects a `.price` element whose text looks
            like '1,234원' — selectors vary by site).
        target_price: Threshold price in KRW.

    Returns:
        True if the current price is at or below `target_price`, else False.

    Raises:
        ValueError: if no `.price` element is found (page layout changed).
        requests.exceptions.RequestException: on network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    # Timeout keeps the hourly polling loop from hanging indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    price_el = soup.select_one('.price')
    if price_el is None:
        # Fail loudly: a missing selector usually means the site changed.
        raise ValueError(f"No '.price' element found at {url}")

    # Strip thousands separators and the won suffix, e.g. '1,234원' -> 1234.
    price = int(price_el.text.strip().replace(',', '').replace('원', ''))

    print(f"[{datetime.now()}] Current price: {price:,} KRW")

    if price <= target_price:
        print(f"🎉 Target reached! (≤ {target_price:,} KRW)")
        return True

    return False

# Check every hour
url = 'https://shopping.example.com/product/123'
target = 50000

# Poll once an hour until the target price is reached.
while not check_price(url, target):
    time.sleep(3600)

5. Saving data

CSV export

import pandas as pd

def scrape_and_save(url, output_file):
    """Scrape `url` and persist the collected rows as a CSV file.

    `utf-8-sig` prepends a BOM so spreadsheet apps detect the encoding.
    """
    records = scrape_data(url)
    frame = pd.DataFrame(records)
    frame.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Saved: {output_file}")

Practical tips

Scraping etiquette

# ✅ Check robots.txt
# https://example.com/robots.txt

# ✅ Space out requests
import time
time.sleep(1)  # pause between requests so the server is not hammered

# ✅ Set a descriptive User-Agent
headers = {'User-Agent': '...'}  # identify your client honestly

# ✅ Handle errors
try:
    response = requests.get(url, timeout=10)
    # raise_for_status() turns 4xx/5xx responses into exceptions
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    # RequestException covers timeouts, connection errors, and HTTP errors
    print(f"Request failed: {e}")

Summary

Key takeaways

  1. requests: HTTP calls
  2. BeautifulSoup: HTML parsing
  3. Selenium: JavaScript-heavy pages
  4. Etiquette: robots.txt, pacing
  5. Storage: CSV, JSON, databases

Next steps