Konlpy

# 한글 기반 형태소 분석
# -> Okt 클래스를 사용해 한글 명사 단어 빈도 계산

from konlpy.tag import Okt
import numpy as np
from collections import Counter  # 빈도수 계산

# Read the whole source document into memory; the analysis below works on
# this single string.
with open("./section1015/대한민국헌법.txt", "r", encoding="utf-8") as src:
    data = src.read()

# Okt is the KoNLPy tagger used for the morphological analysis of `data`.
# Its methods, as summarised in the course notes:
#   morphs()  : split the text into morphemes
#   nouns()   : extract nouns only
#   phrases() : extract phrases only
#   pos()     : split into morphemes and return them tagged with their
#               part of speech, as a list
nlp = Okt()

# Only the nouns are needed for the word-frequency count.
nouns = nlp.nouns(data)

# Keep only nouns longer than one character; single-character nouns are
# dropped (where to draw that line is a judgement call for the analyst).
words = [noun for noun in nouns if len(noun) > 1]

# Counter tallies how often each remaining word occurs (word -> count).
count = Counter(words)

# The 100 most frequent words, as a list of (word, count) pairs.
most = count.most_common(100)

# The word cloud step consumes a word -> frequency dictionary, so convert
# the pair list into a plain dict.
tags = dict(most)

print(tags)

# 수집 결과를 활용해서 워드클라우드 생성

from matplotlib import pyplot
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Renderer settings for the word cloud.
# NOTE(review): font_path="gulim" assumes that font can be resolved on this
# machine; a font with Korean glyphs is needed for the words to render.
wc = WordCloud(
    font_path="gulim",
    width=1200,
    height=800,
    scale=2.0,
    max_font_size=250,
)

# Build the cloud from the word -> count mapping prepared earlier.
gen = wc.generate_from_frequencies(tags)

# Display the rendered cloud in a matplotlib figure, then release it.
pyplot.figure()
pyplot.imshow(gen, interpolation="bilinear")
pyplot.show()
pyplot.close()

Crawling

HTML 기초

<!-- Reference: https://www.w3schools.com/html/default.asp -->
<!-- Reference: https://www.w3schools.com/css/default.asp -->

<!DOCTYPE html>
<!-- lang declares the page language so browsers, search engines and screen
     readers treat the Korean text correctly. -->
<html lang="ko">
<head>
    <meta charset='utf-8'>
    <meta http-equiv='X-UA-Compatible' content='IE=edge'>
    <title>환영합니다</title>
    <style>
        /* id selectors (#name): each rule targets the one element with that id */
        #kim {font-size: 20px; color: red}
        #lee {font-size: 30px; color: blue}
        /* class selector (.name): the rule applies to every element with that class */
        .txt {text-decoration: underline; text-shadow: 2px 2px red;}
    </style>
    <meta name='viewport' content='width=device-width, initial-scale=1'>
    <!-- External stylesheet and script, resolved relative to this page -->
    <link rel='stylesheet' type='text/css' media='screen' href='main.css'>
    <script src='main.js'></script>
</head>
<body>
    무궁화 꽃이 피었습니다
    <!-- Styled through the #kim rule -->
    <div id="kim">
        파이썬
    </div>
    <!-- Styled through the #lee rule -->
    <div id="lee">
        자바
    </div>
    <!-- The three blocks below share the .txt rule -->
    <div class="txt">
        국어
    </div>
    <div class="txt">
        영어
    </div>
    <div class="txt">
        수학
    </div>
</body>
</html>

Crawling

# 크롤링(Crawling)
# ->웹 페이지를 그대로 가져와서 거기서 데이터를 추출해 내는 행위다.
# ->크롤링하는 소프트웨어는 크롤러(crawler)라고 부른다.
# ->동적으로 웹페이지를 돌아다니면서 수집하는 것을 말한다.
# ->Web Scraping

"""----------------------------------------------
   없으면 모듈설치
   >pip install requests
   >pip install bs4
-------------------------------------------------"""

import requests
from bs4 import BeautifulSoup

# Page to download.
url = "https://www.itwill.co.kr/"

# Fetch the page. requests applies no timeout by default, so an explicit one
# is passed to keep a stalled server from hanging the script indefinitely.
r = requests.get(url, timeout=10)

# Decode the response body as UTF-8 when reading r.text.
r.encoding = "utf-8"

print(r.text)

네이버 뉴스 기사의 상세 설명 부분 가져오기

import requests
from bs4 import BeautifulSoup

# Request settings used for the crawl.
# The browser version string can be looked up in the browser itself:
# F12 -> Network -> Headers, or via "view page source".

# Browser identification (User-Agent) string sent with the request.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"

# Naver news article to fetch.
naver_news_url = "https://n.news.naver.com/article/422/0000790979?ntype=RANKING"

# Data collection: download the HTML source of the page.
# A session is the unit of connection between the client (browser) and the
# server (website); the basic connection information is configured on it.
session = requests.Session()

# referer is the address of the page visited just before this one. It is set
# to None so that the server treats the request as a URL typed directly into
# the browser. User-agent makes the request identify itself as Chrome.
# NOTE(review): relies on requests dropping None-valued headers when it
# prepares the request — confirm against the installed requests version.
session.headers.update({"referer": None, "User-agent": user_agent})

# Connect to the article page. requests applies no timeout by default, so an
# explicit one is passed to keep a stalled server from hanging the script.
r = session.get(naver_news_url, timeout=10)

# Decode the downloaded HTML as UTF-8 when reading r.text.
r.encoding = "utf-8"

## 1) Parse the HTML document and pick out the wanted region

# Build a parser object from the downloaded page source.
soup = BeautifulSoup(r.text, "html.parser")

# CSS selector for the part of the page to keep (the element with
# id="dic_area"); select() returns a list of all matches.
selector = soup.select("#dic_area")

## 2) Pre-processing: strip markup that should not end up in the text
for item in selector:
    # Same tags, removed in the same order, as the original per-tag loops.
    for tag_name in ("br", "div", "img", "em", "span"):
        for target in item.find_all(tag_name):
            target.extract()

# Final result.
# The text is taken from the last matched element. When nothing matched,
# there is no element to read, so report the failure instead of crashing
# with a NameError on the never-assigned loop variable.
if selector:
    result_str = selector[-1].text.strip()
    print(result_str)
else:
    result_str = ""
    print("뉴스기사 크롤링 실패")

네이버 뉴스 중에서 [세계] 카테고리의 본문

# 네이버 뉴스 중에서 [세계] 카테고리의 본문

from Crawler import crawler
from bs4 import BeautifulSoup

# 1. Section page whose article links will be collected.
URL = "https://news.naver.com/section/104"

# 2. Find the elements that link to the individual news articles.
# NOTE(review): crawler.select is a project helper; it appears to fetch the
# page and return the elements matching the CSS selector — confirm.
link_list = crawler.select(URL, encoding="utf-8", selector=".sa_text > a")

# 3. From each matched <a> element keep only the value of its href attribute.
url_list = [anchor.get("href") for anchor in link_list]

# 4. Visit each collected article and crawl its body text.
## The body lives in <article id="dic_area"> ... </article>
article_texts = []  # cleaned body text of each article, in crawl order
for i, url_world in enumerate(url_list):  # url_world: URL of the page to crawl
    print("%d번째 뉴스기사 수집중...>> %s" % (i + 1, url_world))
    # Fetch the article page and select its body element(s).
    news_html = crawler.select(url_world, encoding="utf-8", selector="#dic_area")

    if not news_html:
        print("%d번째 뉴스기사 크롤링 실패" % (i + 1))
        continue

    print("%d번째 뉴스기사 크롤링 성공" % (i + 1))
    for item in news_html:
        # NOTE(review): crawler.remove is a project helper; presumably it
        # strips every descendant with the given tag name — confirm.
        for tag_name in ("br", "div", "img", "em", "strong", "span", "script"):
            crawler.remove(item, tag_name)
        article_texts.append(item.text.strip())

# Join the articles with a newline. Concatenating them back to back would
# fuse the last word of one article with the first word of the next and
# distort the noun extraction that follows.
news_content = "\n".join(article_texts)

print(news_content)

# 텍스트 마이닝
from konlpy.tag import Okt
from collections import Counter

# Extract the nouns from the collected article text with the Okt tagger.
okt = Okt()
nouns = okt.nouns(news_content)

# Drop single-character nouns.
words = [noun for noun in nouns if len(noun) > 1]

# Tally the occurrences of each word and keep the 100 most frequent, as
# (word, count) pairs.
count = Counter(words)
most = count.most_common(100)

# Word -> count dictionary consumed by the word cloud step.
tags = dict(most)

from wordcloud import WordCloud
from matplotlib import pyplot

# Renderer settings for the word cloud (white background).
# NOTE(review): font_path="gulim" assumes that font can be resolved on this
# machine; a font with Korean glyphs is needed for the words to render.
wc = WordCloud(
    font_path="gulim", width=1200, height=800, scale=2.0,
    max_font_size=200, background_color="#ffffff",
)

# Build the cloud from the word -> count mapping and save it as a PNG.
gen = wc.generate_from_frequencies(tags)
wc.to_file("naver_worldnews_20251015.png")

# Show the same image in a matplotlib figure without axes, then release it.
pyplot.figure()
pyplot.imshow(gen, interpolation="bilinear")
pyplot.axis("off")
pyplot.show()
pyplot.close()

# 정제한 데이터를 엑셀로 저장
from pandas import DataFrame
from pandas import ExcelFile

# One [word, count] row per frequent word; words of a single character are
# skipped.
lists = [[word, freq] for word, freq in most if len(word) > 1]

# Tabulate the rows and write them to an Excel sheet without the index column.
df = DataFrame(lists, columns=["단어", "빈도수"])
df.to_excel("네이버세계뉴스_20251015.xlsx", sheet_name="단어빈도수", index=False)

print(df)

저작자표시 비영리 변경금지 (새창열림)

'Courses > 아이티윌 오라클 DBA 과정' 카테고리의 다른 글

251017 TIL (0)	2025.10.17
251016 TIL (0)	2025.10.16
251014 TIL (0)	2025.10.14
251013 TIL (0)	2025.10.13
251010 TIL (0)	2025.10.10

Joy's Devlog

Joy's Devlog

태그

최근글

댓글

공지사항

아카이브

Konlpy

Crawling

html 기초

Crawling

네이버 뉴스 기사의 상세 설명 부분 가져오기

네이버 뉴스 중에서 [세계] 카테고리의 본문

'Courses > 아이티윌 오라클 DBA 과정' 카테고리의 다른 글

관련글

티스토리툴바