Crawling Naver Webtoon Data with Selenium and Creating a Word Cloud
Summary
Naver Login: Automated using Selenium.
Webtoon Data: Scraped titles, ratings, authors, likes (hearts), and comments.
Word Cloud: Created from webtoon comments to visualize frequently mentioned words.
1. Automating Naver Login
1.1 Using Selenium for Naver Login
Goal: Automate the login process to Naver using Selenium.
Method: Use XPATH to click the login button and execute_script to input the login credentials.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Navigate to Naver URL
naver_url = 'https://www.naver.com/'
driver.get(naver_url)
# Click login button
element = driver.find_element(By.XPATH, '//*[@id="account"]/div/a')
element.click()
# Input login credentials and click login
naver_id = "your_naver_id"
naver_pw = "your_naver_password"
driver.execute_script("document.getElementsByName('id')[0].value=\'"+naver_id+"\'")
driver.execute_script("document.getElementsByName('pw')[0].value=\'"+naver_pw+"\'")
driver.find_element(By.XPATH, '//*[@id="log.login"]').click()
# Close the driver
driver.quit()
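The script above clicks the login button as soon as the page is requested, which can fail if the landing page is slow to render. A minimal sketch of a more robust variant using Selenium's explicit waits (same XPath as above, reusing the driver and By import from the script):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the login button to become clickable
# instead of assuming the page has finished loading
wait = WebDriverWait(driver, 10)
login_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="account"]/div/a')))
login_button.click()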
2. Naver Webtoon Data Scraping
2.1 Scraping Webtoon Titles, Ratings, and Authors
Goal: Scrape webtoon data such as titles, ratings, and author names for each day of the week.
Method: Use Selenium to navigate each day's webtoon page and extract data with CLASS_NAME and CSS_SELECTOR locators.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Days of the week
days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
titles, rate, authors = [], [], []
for day in days:
    url = f"https://comic.naver.com/webtoon?tab={day}"
    driver.get(url)
    time.sleep(1)
    # Scrape titles, ratings, and authors
    contents = driver.find_elements(By.CLASS_NAME, 'ContentTitle__title_area--x24vt')
    contents_rate = driver.find_elements(By.CSS_SELECTOR, '.Rating__star_area--dFzsb > .text')
    contents_auth = driver.find_elements(By.CLASS_NAME, 'ContentAuthor__author--CTAAP')
    # The slice offsets skip promotional entries at the top and bottom
    # of each page (offsets chosen empirically for this layout)
    for title in contents[3:-10]:
        titles.append(title.text)
    for rate_value in contents_rate[3:]:
        rate.append(rate_value.text)
    for author in contents_auth[3:-10]:
        authors.append(author.text)
# Close the driver
driver.quit()
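The three lists are only held in memory above. A minimal sketch for persisting them, assuming the slice offsets left titles, rate, and authors equally long (pd.DataFrame raises an error otherwise, which doubles as a sanity check; the webtoons.csv name is arbitrary):

import pandas as pd

# Combine the scraped fields into one table and save it
webtoon_df = pd.DataFrame({'title': titles, 'rating': rate, 'author': authors})
webtoon_df.to_csv('webtoons.csv', index=False, encoding='utf-8-sig')
print(webtoon_df.head())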
2.2 Scraping Webtoon Likes (Hearts)
Goal: Collect the number of likes (hearts) for each webtoon.
Method: Iterate through each day’s webtoons, navigate to each webtoon’s page, and scrape the likes count.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import pandas as pd
# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
titles, hearts = [], []
for day in days:
    url = f"https://comic.naver.com/webtoon?tab={day}"
    driver.get(url)
    time.sleep(1)
    # Collect the link to each webtoon's episode list page
    selector = '.ContentList__content_list--q5KXY > .item > .Poster__link--sopnC'
    data = driver.find_elements(By.CSS_SELECTOR, selector)
    sub_urls = [elem.get_attribute('href') for elem in data]
    # Visit each webtoon's page and scrape its title and likes count
    for sub_url in sub_urls:
        driver.get(sub_url)
        time.sleep(0.5)
        titles_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListInfo__title--mYLjC')
        hearts_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListUser__count--fNEWK')
        for t in titles_data:
            titles.append(t.text)
        for h in hearts_data:
            hearts.append(h.text)
# Save the result to CSV
result_df = pd.DataFrame({'title': titles, 'hearts': hearts})
result_df.to_csv('hearts.csv', index=False, encoding='utf-8-sig')
# Close the driver
driver.quit()
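hearts.csv stores the likes as display strings (e.g. '1,234', sometimes with a trailing '+'), so they do not sort numerically. A hedged cleanup sketch; the exact formats Naver renders are an assumption here:

import pandas as pd

def parse_count(text):
    # Turn a display string like '1,234' or '9,999+' into an int;
    # the '+' handling assumes Naver truncates large counts this way
    digits = str(text).replace(',', '').rstrip('+')
    return int(digits) if digits.isdigit() else 0

hearts_df = pd.read_csv('hearts.csv')
hearts_df['hearts_num'] = hearts_df['hearts'].map(parse_count)
print(hearts_df.sort_values('hearts_num', ascending=False).head(10))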
3. Scraping Webtoon Comments
3.1 Scraping Comments for Selected Webtoons
Goal: Scrape comments from selected webtoons and save them to CSV files.
Method: Navigate through webtoon episodes, collect comments, and save them.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd
title_ids = [648419, 758037, 733074] # List of webtoon IDs to scrape
# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
for title_id in title_ids:
    url = f'https://comic.naver.com/webtoon/list?titleId={title_id}&page=1&sort=DESC&tab=tue'
    driver.get(url)
    time.sleep(1)
    # Read the webtoon's title from its episode list page
    titles_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListInfo__title--mYLjC')
    title = titles_data[0].text if titles_data else "Unknown"
    comments = []
    # Walk the first five pages of the episode list
    for page_num in range(1, 6):
        url = f'https://comic.naver.com/webtoon/list?titleId={title_id}&page={page_num}&sort=DESC&tab=tue'
        driver.get(url)
        time.sleep(1)
        data = driver.find_elements(By.CLASS_NAME, 'EpisodeListList__link--DdClU')
        epi_urls = [epi_url.get_attribute("href") for epi_url in data]
        # Visit each episode and collect its comments
        for epi_url in epi_urls:
            driver.get(epi_url)
            time.sleep(1)
            epi_tmp_data = driver.find_elements(By.CLASS_NAME, 'u_cbox_contents')
            for epi_com in epi_tmp_data:
                comments.append(epi_com.text)
    # Save this webtoon's comments to its own CSV file
    test_df = pd.DataFrame({'comment': comments})
    filename = f'{title}.csv'
    test_df.to_csv(filename, index=False, encoding='utf-8-sig')
# Close the driver
driver.quit()
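Section 4 below reads a single comments.csv, while this step writes one file per webtoon. A minimal bridging sketch that merges the per-title files into it (this assumes the working directory contains only the CSVs produced in this guide):

import glob
import pandas as pd

# Concatenate every per-webtoon CSV into the comments.csv that the
# word-cloud step expects, skipping the other files saved earlier
paths = [p for p in glob.glob('*.csv')
         if p not in ('hearts.csv', 'webtoons.csv', 'comments.csv')]
comments_df = pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)
comments_df.to_csv('comments.csv', index=False, encoding='utf-8-sig')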
4. Creating a Word Cloud
Goal: Generate a word cloud from the scraped comments.
Method: Use Okt for Korean text processing and WordCloud for visualization.
from konlpy.tag import Okt
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the comments data
comments_df = pd.read_csv('comments.csv')
# Text processing
okt = Okt()
comments = ' '.join(comments_df['comment'].dropna().astype(str))
comments = re.sub(r'[^가-힣\s]', '', comments)  # keep only Korean characters and whitespace
tokens = okt.nouns(comments)
# Generate the word cloud (malgun.ttf is Windows' Malgun Gothic;
# point font_path at any Korean-capable font on other systems)
wordcloud = WordCloud(font_path='malgun.ttf', background_color='white', width=800, height=600).generate(' '.join(tokens))
# Visualize the word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
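generate() counts whitespace-separated tokens on its own, but single-character nouns tend to dominate Korean comment text. A hedged variant that filters them out and feeds explicit counts through generate_from_frequencies (reusing tokens, WordCloud, and plt from above):

from collections import Counter

# Keep only nouns of two or more characters, then build the
# cloud from explicit counts rather than the raw joined string
freq = Counter(t for t in tokens if len(t) > 1)
wordcloud = WordCloud(font_path='malgun.ttf', background_color='white',
                      width=800, height=600).generate_from_frequencies(freq)
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()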
1. Library Imports
Okt: Korean morphological analyzer used for noun extraction.
re: Regular expressions for text cleaning.
matplotlib.pyplot: Visualization tool for displaying the word cloud.
2. Load Comments Data
pd.read_csv('comments.csv'): Load the comments from a CSV file into a DataFrame.
3. Text Processing
Okt(): Initialize the Korean text processor.
comments = ' '.join(comments_df['comment'].dropna().astype(str)): Merge all comments into a single string.
re.sub(r'[^가-힣\s]', '', comments): Remove everything except Korean characters and whitespace.
okt.nouns(comments): Extract nouns from the text.