Crawling Naver Webtoon Data with Selenium and Creating a Word Cloud


Summary

  • Naver Login: Automated using Selenium.

  • Webtoon Data: Scraped titles, ratings, authors, likes (hearts), and comments.

  • Word Cloud: Created from webtoon comments to visualize frequently mentioned words.

1. Automating Naver Login

1.1 Using Selenium for Naver Login

  • Goal: Automate the login process to Naver using Selenium.

  • Method: Use XPATH to click the login button and execute_script to input login credentials.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Navigate to Naver URL
naver_url = 'https://www.naver.com/'
driver.get(naver_url)

# Click login button
element = driver.find_element(By.XPATH, '//*[@id="account"]/div/a')
element.click()

# Input login credentials and click login
naver_id = "your_naver_id"
naver_pw = "your_naver_password"
driver.execute_script("document.getElementsByName('id')[0].value=\'"+naver_id+"\'")
driver.execute_script("document.getElementsByName('pw')[0].value=\'"+naver_pw+"\'")
driver.find_element(By.XPATH, '//*[@id="log.login"]').click()

# Close the driver
driver.quit()
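Note: if the page has not finished rendering, the find_element call above can raise NoSuchElementException. Below is a minimal sketch of an explicit wait (standard selenium.webdriver.support APIs, reusing the same XPath as above) that can replace the plain find_element step instead of relying on fixed delays:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds until the login button is clickable, then click it
wait = WebDriverWait(driver, 10)
login_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="account"]/div/a')))
login_button.click()

The same wait pattern can be applied before filling the credential fields and clicking the login button.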

2. Naver Webtoon Data Scraping

2.1 Scraping Webtoon Titles, Ratings, and Authors

  • Goal: Scrape webtoon data such as titles, ratings, and author names for each day of the week.

  • Method: Use Selenium to navigate and extract data using CLASS_NAME and CSS_SELECTOR.

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Days of the week
days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
titles, rate, authors = [], [], []

for day in days:
    url = f"https://comic.naver.com/webtoon?tab={day}"
    driver.get(url)
    time.sleep(1)

    # Scrape titles, ratings, and authors
    contents = driver.find_elements(By.CLASS_NAME, 'ContentTitle__title_area--x24vt')
    contents_rate = driver.find_elements(By.CSS_SELECTOR, '.Rating__star_area--dFzsb > .text')
    contents_auth = driver.find_elements(By.CLASS_NAME, 'ContentAuthor__author--CTAAP')

    # The slice below presumably skips list entries that are not weekday webtoons (banners and recommendations); the offsets depend on the page layout
    for title in contents[3:-10]:
        titles.append(title.text)
    for rate_value in contents_rate[3:]:
        rate.append(rate_value.text)
    for author in contents_auth[3:-10]:
        authors.append(author.text)

# Close the driver
driver.quit()
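The three lists can then be combined into a table and saved, in the same way the likes are saved in the next section. A minimal sketch, assuming the slicing above leaves titles, rate, and authors with the same length (the output filename is illustrative):

import pandas as pd

# Combine the scraped fields into one table and save it as CSV
webtoon_df = pd.DataFrame({'title': titles, 'rating': rate, 'author': authors})
webtoon_df.to_csv('webtoons.csv', index=False, encoding='utf-8-sig')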

2.2 Scraping Webtoon Likes (Hearts)

  • Goal: Collect the number of likes (hearts) for each webtoon.

  • Method: Iterate through each day’s webtoons, navigate to each webtoon’s page, and scrape the likes count.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
import pandas as pd

# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
titles, hearts = [], []

for day in days:
    url = f"https://comic.naver.com/webtoon?tab={day}"
    driver.get(url)
    time.sleep(1)

    # Collect the link to each webtoon's episode list page
    selector = '.ContentList__content_list--q5KXY > .item > .Poster__link--sopnC'
    data = driver.find_elements(By.CSS_SELECTOR, selector)
    sub_urls = [link.get_attribute('href') for link in data]

    for sub_url in sub_urls:
        driver.get(sub_url)
        time.sleep(0.5)

        titles_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListInfo__title--mYLjC')
        hearts_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListUser__count--fNEWK')

        for t in titles_data:
            titles.append(t.text)
        for h in hearts_data:
            hearts.append(h.text)

# Save the result to CSV
result_df = pd.DataFrame({'title': titles, 'hearts': hearts})
result_df.to_csv('hearts.csv', index=False, encoding='utf-8-sig')

# Close the driver
driver.quit()
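To rank webtoons by likes later, the saved strings need to be converted to numbers. A minimal sketch, assuming the scraped values are plain digits with optional comma separators (abbreviated counts would need extra parsing):

import pandas as pd

# Load the saved likes and convert the count strings to numbers
hearts_df = pd.read_csv('hearts.csv')
hearts_df['hearts'] = pd.to_numeric(
    hearts_df['hearts'].astype(str).str.replace(',', '', regex=False),
    errors='coerce'
)

# Show the ten most-liked webtoons
print(hearts_df.sort_values('hearts', ascending=False).head(10))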

3. Scraping Webtoon Comments

3.1 Scraping Comments for Selected Webtoons

  • Goal: Scrape comments from selected webtoons and save them to CSV files.

  • Method: Navigate through webtoon episodes, collect comments, and save them.

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd

title_ids = [648419, 758037, 733074]  # List of webtoon IDs to scrape

# Set up Chrome WebDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

for title_id in title_ids:
    url = f'https://comic.naver.com/webtoon/list?titleId={title_id}&page=1&sort=DESC&tab=tue'
    driver.get(url)
    time.sleep(1)

    titles_data = driver.find_elements(By.CLASS_NAME, 'EpisodeListInfo__title--mYLjC')
    title = titles_data[0].text if titles_data else "Unknown"
    comments = []

    for page_num in range(1, 6):
        url = f'https://comic.naver.com/webtoon/list?titleId={title_id}&page={page_num}&sort=DESC&tab=tue'
        driver.get(url)
        time.sleep(1)

        data = driver.find_elements(By.CLASS_NAME, 'EpisodeListList__link--DdClU')
        epi_urls = [epi_url.get_attribute("href") for epi_url in data]

        for epi_url in epi_urls:
            driver.get(epi_url)
            time.sleep(1)

            comment_elements = driver.find_elements(By.CLASS_NAME, 'u_cbox_contents')
            for comment in comment_elements:
                comments.append(comment.text)

    # Save the comments to CSV
    test_df = pd.DataFrame({'comment': comments})
    filename = f'{title}.csv'
    test_df.to_csv(filename, index=False, encoding='utf-8-sig')

# Close the driver
driver.quit()
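Section 4 below reads a single comments.csv, while the loop above writes one CSV per webtoon title. A minimal sketch for merging those files (assuming they sit in the working directory alongside the other CSVs saved above, and no unrelated CSVs are present):

import glob
import pandas as pd

# Concatenate the per-title comment files into a single comments.csv
files = [f for f in glob.glob('*.csv') if f not in ('hearts.csv', 'webtoons.csv', 'comments.csv')]
merged_df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
merged_df.to_csv('comments.csv', index=False, encoding='utf-8-sig')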

4. Creating a Word Cloud

  • Goal: Generate a word cloud from the scraped comments.

  • Method: Use Okt for Korean text processing and WordCloud for visualization.

from konlpy.tag import Okt
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the comments data
comments_df = pd.read_csv('comments.csv')

# Text processing
okt = Okt()
comments = ' '.join(comments_df['comment'].dropna().astype(str))
comments = re.sub(r'[^가-힣\s]', '', comments)  # keep only Korean characters and whitespace
tokens = okt.nouns(comments)

# Generate the word cloud
# font_path assumes Windows with the Malgun Gothic font installed; use a full font file path on other systems
wordcloud = WordCloud(font_path='malgun.ttf', background_color='white',
                      width=800, height=600).generate(' '.join(tokens))

# Visualize the word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

1. Library Imports

  • Okt: Korean text processor for noun extraction.

  • re: Regular expressions for text cleaning.

  • WordCloud: Generates the word cloud image from the extracted nouns.

  • matplotlib.pyplot: Visualization tool for displaying the word cloud.

2. Load Comments Data

  • pd.read_csv('comments.csv'): Load comments from a CSV file into a DataFrame.

3. Text Processing

  • Okt(): Initialize the Korean text processor.

  • comments = ' '.join(comments_df['comment']): Merge all comments into a single string.

  • re.sub(r'[^가-힣\s]', '', comments): Remove everything except Korean characters and whitespace.

  • okt.nouns(comments): Extract nouns from the text (refined further in the sketch below).
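One-character nouns tend to dominate Korean comment text, so the cloud is often sharpened by filtering the tokens and feeding word frequencies directly. A minimal sketch building on the tokens list above (the length threshold and top-100 cutoff are arbitrary choices, not part of the original workflow):

from collections import Counter
from wordcloud import WordCloud

# Keep nouns with two or more characters and count how often each appears
filtered = [t for t in tokens if len(t) >= 2]
freq = dict(Counter(filtered).most_common(100))

# Build the cloud from explicit frequencies instead of raw text
wordcloud = WordCloud(font_path='malgun.ttf', background_color='white',
                      width=800, height=600).generate_from_frequencies(freq)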