{"id":239,"date":"2026-05-15T06:01:11","date_gmt":"2026-05-14T22:01:11","guid":{"rendered":"https:\/\/junai.ai\/blog\/python-crawling-23\/"},"modified":"2026-05-17T07:31:30","modified_gmt":"2026-05-16T23:31:30","slug":"python-crawling-23","status":"publish","type":"post","link":"https:\/\/junai.ai\/blog\/python-crawling-23\/","title":{"rendered":"\ud30c\uc774\uc36c \uc6f9 \ud06c\ub864\ub9c1 \u2014 BeautifulSoup (23\ud3b8)"},"content":{"rendered":"\n<!-- WordPress REST API \ubc1c\ud589\uc6a9 HTML (\uc790\ub3d9 \uc0dd\uc131) -->\n<!-- WP-FEATURED-MEDIA-ID: 602 -->\n<div style=\"max-width:800px;margin:0 auto;\">\n<style>\n\/* js-textbook \u2014 PostgreSQL \uad50\uc7ac \ub3d9\uc77c \ud314\ub808\ud2b8 *\/\n:root {\n  --color-primary: #336791;\n  --color-accent: #60a5fa;\n  --color-bg: #fafafa;\n  --color-bg-card: #ffffff;\n  --color-text: #1e293b;\n  --color-text-muted: #64748b;\n  --hero-start: #0f172a;\n  --hero-end: #336791;\n  --font-body: -apple-system, BlinkMacSystemFont, 'Apple SD Gothic Neo', 'Noto Sans KR', sans-serif;\n  --size-body: 17px;\n  --line-height: 1.75;\n}\n* { box-sizing: border-box; }\n.container { max-width: 760px; margin: 0 auto; padding: 0 22px 80px; }\n.hero { background: linear-gradient(135deg, var(--hero-start) 0%, var(--hero-end) 100%); color: #fff; padding: 72px 22px 56px; text-align: center; }\n.hero .badge { display: inline-block; background: rgba(96,165,250,0.18); color: var(--color-accent); padding: 6px 14px; border-radius: 999px; font-size: 13px; font-weight: 600; letter-spacing: 0.5px; margin-bottom: 18px; }\n.hero h1 { margin: 0 0 18px; font-size: 36px; line-height: 1.3; letter-spacing: -0.3px; }\n.hero p.sub { margin: 0 auto; max-width: 580px; font-size: 17px; color: #bfdbfe; }\n.hero img { width: 100%; max-width: 720px; height: auto; margin: 36px auto 0; display: block; border-radius: 10px; box-shadow: 0 8px 32px rgba(0,0,0,0.3); }\n.meta { display: flex; gap: 14px; justify-content: center; margin-top: 20px; font-size: 13px; color: 
#93c5fd; flex-wrap: wrap; }\n.meta span::before { content: \"\u00b7\"; margin-right: 14px; color: #1e3a8a; }\n.meta span:first-child::before { content: \"\"; margin: 0; }\narticle { background: var(--color-bg-card); margin-top: -36px; padding: 44px 28px; border-radius: 14px; box-shadow: 0 2px 18px rgba(0,0,0,0.06); }\narticle p { margin: 0 0 18px; }\nh2 { font-size: 28px; line-height: 1.35; letter-spacing: -0.3px; margin: 48px 0 18px; padding-bottom: 10px; border-bottom: 2px solid var(--color-primary); }\nh2:first-of-type { margin-top: 8px; }\nh3 { font-size: 21px; line-height: 1.4; margin: 30px 0 12px; color: var(--color-primary); }\ncode { background: #f1f5f9; color: #0f172a; padding: 2px 6px; border-radius: 4px; font-family: 'SFMono-Regular', Menlo, Consolas, monospace; font-size: 0.92em; }\npre { background: #0f172a; color: #e2e8f0; padding: 18px 20px; border-radius: 10px; overflow-x: auto; font-size: 14.5px; line-height: 1.65; margin: 18px 0; }\npre code { background: transparent; color: inherit; padding: 0; }\nul, ol { margin: 0 0 18px; padding-left: 24px; }\nli { margin-bottom: 8px; }\nstrong { color: #0f172a; }\n.databox { background: #eff6ff; border-left: 4px solid var(--color-primary); padding: 18px 20px; border-radius: 6px; margin: 22px 0; }\n.databox p { margin: 0 0 8px; }\n.databox p:last-child { margin: 0; }\n.databox strong { color: var(--color-primary); }\n.warnbox { background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%); border-left: 4px solid #d97706; padding: 18px 20px; border-radius: 6px; margin: 22px 0; }\n.warnbox strong { color: #92400e; }\n.tablewrap { overflow-x: auto; -webkit-overflow-scrolling: touch; margin: 22px 0; }\ntable { width: 100%; border-collapse: collapse; font-size: 15px; background: var(--color-bg-card); }\nth, td { padding: 11px 12px; text-align: left; border-bottom: 1px solid #e2e8f0; vertical-align: top; }\nth { background: #f1f5f9; font-weight: 700; color: #0f172a; }\ntd:first-child, th:first-child { font-weight: 
700; }\n.cta { background: linear-gradient(135deg, #336791 0%, #60a5fa 100%); color: #fff; padding: 30px 24px; border-radius: 12px; margin-top: 44px; text-align: center; }\n.cta h3 { color: #fff; margin: 0 0 10px; }\n.cta p { margin: 0; color: #dbeafe; }\n.series-nav { background: #eff6ff; padding: 18px 22px; border-radius: 10px; margin-top: 24px; font-size: 14.5px; color: var(--color-text-muted); }\n.series-nav strong { color: var(--color-primary); }\n@media (max-width: 480px) {\n  .hero { padding: 52px 18px 44px; }\n  .hero h1 { font-size: 26px; }\n  .hero p.sub { font-size: 15px; }\n  article { padding: 28px 18px; border-radius: 10px; }\n  h2 { font-size: 22px; }\n  h3 { font-size: 18px; }\npre { font-size: 13px; padding: 14px 16px; }\n}\n@media (max-width: 560px) {\n  .tablewrap table, .tablewrap thead, .tablewrap tbody, .tablewrap tr, .tablewrap th, .tablewrap td { display: block; width: auto; }\n  .tablewrap thead { display: none; }\n  .tablewrap tr { margin: 0 0 14px; border: 1px solid #e2e8f0; border-radius: 10px; overflow: hidden; }\n  .tablewrap td { border: none; border-bottom: 1px solid #f1f5f9; padding: 9px 14px; }\n  .tablewrap td:first-child { background: #eff6ff; font-weight: 800; font-size: 15.5px; }\n  .tablewrap td:last-child { border-bottom: none; }\n  .tablewrap td[data-label]::before { content: attr(data-label) \" \u2014 \"; font-weight: 700; color: var(--color-primary); }\n}\n<\/style>\n<header class=\"hero\">\n  <span class=\"badge\">\ud30c\uc774\uc36c \uad50\uc7ac \u00b7 23\ud3b8 \/ 27\ud3b8<\/span>\n  <h1>\ud30c\uc774\uc36c \uc6f9 \ud06c\ub864\ub9c1<\/h1>\n  <p class=\"sub\">\uc815\uc801 \ud398\uc774\uc9c0\ub294 BeautifulSoup, JS \ub3d9\uc801 \ud398\uc774\uc9c0\ub294 Playwright.<\/p>\n  <div class=\"meta\"><span>\uc2e4\uc804<\/span><span>\uc77d\ub294 \uc2dc\uac04 7\ubd84<\/span><span>2026-05-13<\/span><\/div>\n  <img decoding=\"async\" src=\"https:\/\/junai.ai\/blog\/wp-content\/uploads\/2026\/05\/hero-221.jpg\" alt=\"requests \uc640 
BeautifulSoup \ub85c HTML \uc5d0\uc11c \uc81c\ubaa9\uc744 \ucd94\ucd9c\ud558\ub294 \ucf54\ub4dc \ud654\uba74\">\n<\/header>\n\n<div class=\"container\">\n<article>\n\n<p>22\ud3b8\uc758 \uc790\ub3d9\ud654\uc5d0 \ud55c \ub3c4\uad6c\ub97c \ub354\ud558\uba74 \uc9c4\uc9dc \uac15\ub825\ud574\uc9d1\ub2c8\ub2e4 \u2014 <strong>\uc6f9\uc5d0\uc11c \ub370\uc774\ud130 \uac00\uc838\uc624\uae30<\/strong>. \uac00\uaca9 \ube44\uad50\u00b7\ub274\uc2a4 \ubaa8\ub2c8\ud130\ub9c1\u00b7\ud658\uc728 \ucd94\uc801\u00b7\uacbd\uc7c1\uc0ac \uc2e0\uc81c\ud488 \uac10\uc2dc. \uacf5\uc2dd API \uac00 \uc5c6\ub294 \uc815\ubcf4\uc758 95% \ub294 \uc6f9 \ud398\uc774\uc9c0\uc5d0 HTML \ub85c \ub178\ucd9c\ub3fc \uc788\uc5b4\uc694. \uadf8\uac78 \uc790\ub3d9 \ucd94\ucd9c\ud558\ub294 \uac8c \ud06c\ub864\ub9c1(\uc2a4\ud06c\ub808\uc774\ud551).<\/p>\n\n<p>23\ud3b8\uc744 \ub9c8\uce58\uba74 \u2460 \uc815\uc801\/\ub3d9\uc801 \ud398\uc774\uc9c0\uc758 \ucc28\uc774 \u2461 <code>requests + BeautifulSoup<\/code> \uae30\ubcf8 \u2462 CSS \uc120\ud0dd\uc790 \u2463 Playwright \ub85c JS \ud398\uc774\uc9c0 \u2464 \ub9e4\ub108 \u2014 robots.txt\u00b7rate-limit\u00b7\ubc95\uc801 \uacbd\uacc4 \u2014 5\uac00\uc9c0\ub97c \uc190\uc5d0 \uc775\ud799\ub2c8\ub2e4.<\/p>\n\n<h2>\uc815\uc801 vs \ub3d9\uc801 \ud398\uc774\uc9c0 \u2014 \uccab \ubd84\uae30<\/h2>\n\n<p>\uac19\uc740 \ud398\uc774\uc9c0 URL \uc774\ub77c\ub3c4 \ub450 \uc885\ub958\ub85c \ub098\ub269\ub2c8\ub2e4.<\/p>\n\n<ul>\n  <li><strong>\uc815\uc801<\/strong> \u2014 \uc11c\ubc84\uac00 \uc644\uc131\ub41c HTML \uc744 \ud55c \ubc88\uc5d0 \ubcf4\ub0c4. \ud398\uc774\uc9c0 \uc18c\uc2a4\uc5d0 \ub0b4\uac00 \ubcf4\ub294 \uae00\uc790\uac00 \ub2e4 \uc788\uc74c. <code>requests + BeautifulSoup<\/code> \uc73c\ub85c \ucda9\ubd84<\/li>\n  <li><strong>\ub3d9\uc801<\/strong> \u2014 JS \uac00 \ube0c\ub77c\uc6b0\uc800\uc5d0\uc11c \ub370\uc774\ud130\ub97c \ucd94\uac00\ub85c \ubc1b\uc544 \ud654\uba74\uc744 \uadf8\ub9bc. \ud398\uc774\uc9c0 \uc18c\uc2a4\uc5d0\ub294 \ube48 \uaecd\ub370\uae30\ub9cc. 
<strong>\uc2e4\uc81c \ube0c\ub77c\uc6b0\uc800<\/strong>\uac00 \ud544\uc694 \u2192 Playwright<\/li>\n<\/ul>\n\n<p>\ud310\ubcc4: <code>requests.get(URL).text<\/code> \uc548\uc5d0 \ucc3e\ub294 \ub0b4\uc6a9\uc774 \uc788\uc73c\uba74 \uc815\uc801, \uc5c6\uc73c\uba74 \ub3d9\uc801. \ub3d9\uc801\uc774\ub77c\ub3c4 \ubcf4\ud1b5\uc740 <code>view-source:<\/code> \ub610\ub294 \uac1c\ubc1c\uc790 \ub3c4\uad6c Network \ud0ed\uc5d0\uc11c \uc9c4\uc9dc API URL \uc744 \ucc3e\uc544 \uc9c1\uc811 \ud638\ucd9c\ud558\ub294 \uac8c \uac00\uc7a5 \ube60\ub985\ub2c8\ub2e4.<\/p>\n\n<h2>requests + BeautifulSoup \u2014 \uc815\uc801 \uc815\uacf5\ubc95<\/h2>\n\n<pre><code>(.venv) $ pip install requests beautifulsoup4 lxml\n\nimport requests\nfrom bs4 import BeautifulSoup\n\nUA = {\"User-Agent\": \"Mozilla\/5.0 (study-bot\/1.0)\"}\n\nr = requests.get(\"https:\/\/example.com\/blog\", headers=UA, timeout=10)\nr.raise_for_status()\nsoup = BeautifulSoup(r.text, \"lxml\")     # lxml \uc774 \uac00\uc7a5 \ube60\ub978 \ud30c\uc11c\n\n# \ud55c \uc694\uc18c\ntitle = soup.select_one(\"h1.post-title\")\nprint(title.get_text(strip=True))\n\n# \uc5ec\ub7ec \uc694\uc18c\nfor art in soup.select(\"article.post\"):\n    t = art.select_one(\"h2\").get_text(strip=True)\n    link = art.select_one(\"a\")[\"href\"]\n    print(t, \"\u2192\", link)<\/code><\/pre>\n\n<p><code>select<\/code> \/ <code>select_one<\/code> \uc774 \ud575\uc2ec \u2014 <strong>CSS \uc120\ud0dd\uc790<\/strong>\ub85c \ucc3e\uc2b5\ub2c8\ub2e4. \ube0c\ub77c\uc6b0\uc800 \uac1c\ubc1c\uc790 \ub3c4\uad6c\uc5d0\uc11c \uc694\uc18c \uc6b0\ud074\ub9ad \u2192 &#8220;Copy \u2192 Copy selector&#8221; \uac00 \uc790\uc8fc \uc4f0\ub294 \ud2b8\ub9ad. 
\ub2e4\uc74c \ubaa9\ub85d\uc774 \uc790\uc8fc \uc4f0\ub294 \ud328\ud134.<\/p>\n\n<div class=\"databox\">\n  <p><strong>\ud83d\udccc CSS \uc120\ud0dd\uc790 7\uac00\uc9c0<\/strong><\/p>\n  <p>\u00b7 <code>h1<\/code> \u2014 \ud0dc\uadf8\uba85<\/p>\n  <p>\u00b7 <code>.title<\/code> \u2014 \ud074\ub798\uc2a4<\/p>\n  <p>\u00b7 <code>#main<\/code> \u2014 id<\/p>\n  <p>\u00b7 <code>div.post<\/code> \u2014 \ud0dc\uadf8 + \ud074\ub798\uc2a4<\/p>\n  <p>\u00b7 <code>article a<\/code> \u2014 \uc790\uc190<\/p>\n  <p>\u00b7 <code>li &gt; a<\/code> \u2014 \uc9c1\uacc4 \uc790\uc2dd<\/p>\n  <p>\u00b7 <code>a[href^=\"https\"]<\/code> \u2014 \uc18d\uc131 \uc870\uac74 (\uc2dc\uc791\/\ub05d\/\ud3ec\ud568)<\/p>\n<\/div>\n\n<h2>Playwright \u2014 JS \ub3d9\uc801 \ud398\uc774\uc9c0<\/h2>\n\n<pre><code>(.venv) $ pip install playwright\n(.venv) $ playwright install chromium     # \ube0c\ub77c\uc6b0\uc800 \ub2e4\uc6b4\ub85c\ub4dc\n\nfrom playwright.sync_api import sync_playwright\n\nwith sync_playwright() as p:\n    browser = p.chromium.launch(headless=True)\n    page = browser.new_page()\n    page.goto(\"https:\/\/example-spa.com\/products\", timeout=30000)\n    page.wait_for_selector(\"div.product-card\")\n\n    titles = page.eval_on_selector_all(\n        \"div.product-card h3\",\n        \"els =&gt; els.map(e =&gt; e.textContent.trim())\"\n    )\n    print(titles)\n    browser.close()<\/code><\/pre>\n\n<p>\uc2e4\uc81c \ud06c\ub86c\uc744 \ub744\uc6cc JS \uac00 \uadf8\ub9b0 \uacb0\uacfc\ub97c \ubc1b\uc2b5\ub2c8\ub2e4. 
\ubb34\uac81\uc9c0\ub9cc React\u00b7Vue\u00b7Angular SPA \uac00 \uadf8\ub9b0 \ud654\uba74\ub3c4 \uadf8\ub300\ub85c \ubd05\ub2c8\ub2e4.<\/p>\n\n<h3>\ub300\uc548: \ube0c\ub77c\uc6b0\uc800 Network \ud0ed \ud2b8\ub9ad<\/h3>\n\n<pre><code># JS \uac00 \ud638\ucd9c\ud558\ub294 API \ub97c \ucc3e\uc544 \uc9c1\uc811 \ubd80\ub978\ub2e4 (\uac00\ub2a5\ud558\uba74 \ud56d\uc0c1 \uc774\uac8c \uc815\uacf5\ubc95)\nr = requests.get(\n    \"https:\/\/api.example.com\/v2\/products\",\n    headers={\"User-Agent\": \"...\", \"Accept\": \"application\/json\"},\n    params={\"page\": 1},\n)\ndata = r.json()\n# Playwright \ubcf4\ub2e4 100\ubc30 \ube60\ub974\uace0 \uc548\uc815\uc801<\/code><\/pre>\n\n<p>Playwright \ub294 <strong>\ubaa8\ub4e0 \ub2e4\ub978 \ubc29\ubc95\uc774 \uc2e4\ud328\ud588\uc744 \ub54c<\/strong>\uc758 \ub9c8\uc9c0\ub9c9 \uce74\ub4dc. JS \uac00 \uadf8\ub9b0 \ud398\uc774\uc9c0\ub77c\ub3c4 \uadf8 \ub370\uc774\ud130\ub97c \ubc1b\uc544\uc624\ub294 API \uac00 \uc788\uace0, \uac1c\ubc1c\uc790 \ub3c4\uad6c\uc5d0\uc11c \ucc3e\uc744 \uc218 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n<h2>\ub9e4\ub108\uc640 \ud568\uc815 + \ub9c8\ubb34\ub9ac<\/h2>\n\n<div class=\"warnbox\">\n  <strong>\u26a0\ufe0f \ud06c\ub864\ub9c1 \ub9e4\ub108 5\uac00\uc9c0.<\/strong> \u2460 <code>robots.txt<\/code> \ud655\uc778 (<code>example.com\/robots.txt<\/code>) \u2014 Disallow \uacbd\ub85c\ub294 \uc548 \uac00\uc838\uac04\ub2e4. \u2461 <strong>rate-limit<\/strong> \u2014 \ud55c \uc0ac\uc774\ud2b8\ub2f9 \ucd08\ub2f9 1\ud68c \uc774\ud558 \uad8c\uc7a5, <code>time.sleep(1)<\/code>. \u2462 <strong>User-Agent \uc640 \uc5f0\ub77d\ucc98<\/strong> \u2014 \ubd07\uc774\ub77c\uace0 \uc194\uc9c1\ud558\uac8c. \u2463 <strong>API \uac00 \uc788\uc73c\uba74 API \uc6b0\uc120<\/strong>. 
\u2464 \uc800\uc791\uad8c\u00b7\uc57d\uad00 \u2014 \ub370\uc774\ud130 \uc7ac\ubc30\ud3ec\ub294 \ubcc4\ub3c4 \ubb38\uc81c.\n<\/div>\n\n<pre><code># \uc548\uc804\ud55c \ud06c\ub864\ub7ec \ubf08\ub300\nimport time, requests\nfrom bs4 import BeautifulSoup\n\nUA = {\"User-Agent\": \"research-bot\/0.1 (+mailto:me@junai.ai)\"}\n\ndef fetch(url: str) -&gt; BeautifulSoup:\n    r = requests.get(url, headers=UA, timeout=10)\n    r.raise_for_status()\n    return BeautifulSoup(r.text, \"lxml\")\n\nfor url in urls:\n    try:\n        soup = fetch(url)\n        # ... \ucd94\ucd9c ...\n    except requests.RequestException as e:\n        log.error(\"\uc2e4\ud328 %s: %s\", url, e)\n    time.sleep(1.0)   # rate-limit \ub9e4\ub108<\/code><\/pre>\n\n<p>\ub2e4\uc74c \ubbf8\uc158: \u2460 \ubcf8\uc778 \ube14\ub85c\uadf8(\ub610\ub294 \uacf5\uac1c \ub274\uc2a4 RSS) \uc758 \uae00 \uc81c\ubaa9 10\uac1c \ucd94\ucd9c \u2461 <code>requests + BeautifulSoup<\/code> \ub85c \ud658\uc728 \ud55c \ud398\uc774\uc9c0 \ud06c\ub864\ub9c1 \ud6c4 CSV \uc800\uc7a5 \u2462 \uac1c\ubc1c\uc790 \ub3c4\uad6c\ub85c \uc5b4\ub5a4 \uc0ac\uc774\ud2b8\uc758 \uc228\uc740 API URL \ucc3e\uc544\ubcf4\uae30.<\/p>\n\n<div class=\"cta\">\n  <h3>\ub2e4\uc74c \ud3b8 \ubbf8\ub9ac\ubcf4\uae30<\/h3>\n  <p>24\ud3b8 \u2014 &#8220;\ud30c\uc774\uc36c async\u00b7await&#8221;: \ub3d9\uc2dc\uc131 \ucc98\ub9ac. 100\uac1c URL \ud638\ucd9c\uc774 100\ubc30 \ube68\ub77c\uc9c0\ub294 \ube44\ubc00.<\/p>\n<\/div>\n\n<div class=\"series-nav\">\n  <strong>\ud83d\udcda 27\ud3b8 \ud30c\uc774\uc36c \uad50\uc7ac \uc2dc\ub9ac\uc988 \u2014 23\/27\ud3b8<\/strong><br>\n  \u2190 22\ud3b8 &#8220;\uc790\ub3d9\ud654&#8221; \u00b7 \ub2e4\uc74c: 24\ud3b8 &#8220;async\u00b7await&#8221;\n<\/div>\n\n<\/article>\n<\/div>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>\uc815\uc801\/\ub3d9\uc801 \ud06c\ub864\ub9c1\u00b7CSS \uc120\ud0dd\uc790\u00b7robots\/rate-limit \ub9e4\ub108. 
27\ud3b8 \uad50\uc7ac 23\ud3b8.<\/p>\n","protected":false},"author":1,"featured_media":602,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[16],"tags":[],"class_list":["post-239","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-python"],"aioseo_notices":[],"_links":{"self":[{"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/posts\/239","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/comments?post=239"}],"version-history":[{"count":1,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/posts\/239\/revisions"}],"predecessor-version":[{"id":629,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/posts\/239\/revisions\/629"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/media\/602"}],"wp:attachment":[{"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/media?parent=239"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/categories?post=239"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/junai.ai\/blog\/wp-json\/wp\/v2\/tags?post=239"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}