Python Web Scraping | BeautifulSoup and Selenium Explained
Key points of this article
Step-by-step guide to collecting data from the web with Python: HTTP requests, HTML parsing, dynamic pages with Selenium, and responsible scraping habits.
Introduction
“Collect data from the web”
Web scraping is the technique of automatically extracting data from websites.
1. requests basics
Fetching HTML
import requests

# Plain GET request.
resp = requests.get('https://example.com')
print(resp.status_code)  # 200 on success
print(resp.text)         # the HTML body as a string
print(resp.headers)      # response headers (case-insensitive dict)

# Identify the client by sending a custom User-Agent header.
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
resp = requests.get('https://example.com', headers=ua_headers)
2. BeautifulSoup
Parsing HTML
from bs4 import BeautifulSoup
import requests

# Fetch a page and build a parse tree from it.
url = 'https://example.com'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# First matching tag: the document <title>.
print(soup.find('title').text)

# All matching tags: print the href of every <a>.
for anchor in soup.find_all('a'):
    print(anchor.get('href'))

# CSS selectors: every element with class "article-title".
for heading in soup.select('.article-title'):
    print(heading.text)
Example: news headlines
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_news(url):
    """Collect news titles, links, and dates from *url*.

    Parameters
    ----------
    url : str
        Page whose news items carry the ``.news-item`` CSS class.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``link``, ``date``. Items missing a title or a
        link are skipped instead of raising; a missing date becomes ''.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Bounded timeout + raise_for_status: fail fast instead of silently
    # parsing an error page (matches the etiquette advice in this article).
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    for item in soup.select('.news-item'):
        title_el = item.select_one('.title')
        link_el = item.select_one('a')
        # select_one returns None when nothing matches; guard before
        # touching .text / ['href'] so one malformed item can't crash the run.
        if title_el is None or link_el is None:
            continue
        date_el = item.select_one('.date')
        articles.append({
            'title': title_el.text.strip(),
            'link': link_el.get('href', ''),
            'date': date_el.text.strip() if date_el is not None else '',
        })
    return pd.DataFrame(articles)
# Usage: scrape one page and persist the result as CSV.
news_df = scrape_news('https://news.example.com')
# utf-8-sig writes a BOM so Excel detects the encoding correctly.
news_df.to_csv('news.csv', index=False, encoding='utf-8-sig')
3. Selenium (dynamic pages)
Install
pip install selenium
Basic usage
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')

    # Block (up to 10 s) until the .content element appears in the DOM —
    # necessary on pages that render their content with JavaScript.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'content'))
    )

    # Read the first <h1> heading on the page.
    print(driver.find_element(By.TAG_NAME, 'h1').text)

    # Click the "load more" button, then scroll to the bottom of the page.
    driver.find_element(By.ID, 'load-more').click()
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
finally:
    # Always release the browser process, even when an error occurred above.
    driver.quit()
4. Real-world example
Price monitoring
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
def parse_price(price_text):
    """Convert a Korean price string such as '50,000원' to an int (KRW)."""
    return int(price_text.replace(',', '').replace('원', ''))


def check_price(url, target_price):
    """Read the product price at *url* and report whether it reached the target.

    The ``.price`` selector varies by site — adjust it for the shop you
    monitor.

    Parameters
    ----------
    url : str
        Product page to check.
    target_price : int
        Threshold in KRW.

    Returns
    -------
    bool
        True when the current price is <= *target_price*.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If no ``.price`` element is found on the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    # Bounded timeout + raise_for_status: fail fast instead of trying to
    # parse an error page (matches the etiquette advice in this article).
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    price_el = soup.select_one('.price')
    # select_one returns None on no match; a clear error beats an
    # AttributeError deep inside string handling.
    if price_el is None:
        raise ValueError(f"No '.price' element found at {url}")
    price = parse_price(price_el.text)
    print(f"[{datetime.now()}] Current price: {price:,} KRW")
    if price <= target_price:
        print(f"🎉 Target reached! (≤ {target_price:,} KRW)")
        return True
    return False
# Re-check the price once an hour until the target is reached.
url = 'https://shopping.example.com/product/123'
target = 50000
while not check_price(url, target):
    time.sleep(3600)
5. Saving data
CSV export
import pandas as pd
def scrape_and_save(url, output_file):
    """Scrape *url* (via scrape_data) and write the rows to *output_file* as CSV.

    utf-8-sig prepends a BOM so Excel detects the encoding correctly.
    """
    rows = scrape_data(url)
    pd.DataFrame(rows).to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Saved: {output_file}")
Practical tips
Scraping etiquette
# ✅ Check robots.txt before scraping — it states what the site allows:
# https://example.com/robots.txt
# ✅ Space out requests so you don't overload the server
import time
time.sleep(1)
# ✅ Set a descriptive User-Agent so the site operator can identify you
headers = {'User-Agent': '...'}
# ✅ Handle errors explicitly
try:
    # timeout stops the call from hanging forever;
    # raise_for_status turns 4xx/5xx responses into exceptions
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")
Summary
Key takeaways
- requests: HTTP calls
- BeautifulSoup: HTML parsing
- Selenium: JavaScript-heavy pages
- Etiquette: robots.txt, pacing
- Storage: CSV, JSON, databases