# requests library: fetches the contents of the page we want
import requests as req
res = req.get("http://www.naver.com")
# <Response [200]> : success / <Response [406]> : failure
res
# expresses the content as text; the whole page comes back as one string
res.text
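# A small check sketch (standard requests behavior, my addition): the numeric code sits
# on res.status_code, so success can be tested before touching res.text
if res.status_code == 200:  # 200 = OK; the 406 above means the server rejected the request
    print(len(res.text))    # size of the html string
else:
    print("request failed:", res.status_code)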
# Fetch the Melon site
req.get("https://www.melon.com/")
# If access to the site fails, add headers so the request looks like it came from a person
# F12 > Network > Document tab: find the "user-agent" entry there
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}
res = req.get("https://www.melon.com/", headers=head)
res.text
# Fetch a page we want from Naver
res = req.get("http://www.naver.com")
naver = res.text
# Using the BeautifulSoup library: splits a single html string into a tree of tags
from bs4 import BeautifulSoup as bs
soup = bs(naver, "lxml")
# find_all("tag name", "attribute filter")
# pulls the tags we need out of the html data
# for class, write class_ (class alone is a reserved word in Python)
data = soup.find_all("a", class_="nav")
# find_all returns the matches as a list
data[0].text
# Print the whole list
for content in data:
    print(content.text)
# Grab just the news titles from Naver: fetch url (check success) -> text -> bs -> find_all
res = req.get("https://search.naver.com/search.naver?where=nexearch&sm=top_sug.pre&fbm=1&acr=1&acq=%EC%BD%94%EB%A1%9C%EB%82%98&qdt=0&ie=utf8&query=%EC%BD%94%EB%A1%9C%EB%82%98")
data = res.text
soup = bs(data, "lxml")
news = soup.find_all("a", class_="news_tit")
for i in news:
    print(i.text)
# Collecting the Melon Top 100
import requests as req  # import under the short alias req
from bs4 import BeautifulSoup as bs  # note: B and S are uppercase
url = "https://www.melon.com/chart/index.htm"
req.get(url)
# when direct access to the server fails, attach a User-Agent header:
head = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
res = req.get("https://www.melon.com/chart/index.htm", headers=head)  # keep the response in res to check success
melon = res.text
soup = bs(melon, "lxml")
# Fetch elements using a CSS select statement
singer = soup.select("div.ellipsis.rank02 > span.checkEllipsis")
print(len(singer))
# check how many items came back
song = soup.select("div.ellipsis.rank01 > span")
print(len(song))
# Print them, then build new lists that hold only the pure text
# we don't need the whole tag, just the plain text, plus a rank we number ourselves
song_list = []
singer_list = []
rank_list = []
for i in range(len(song)):
    song_list.append(song[i].text.strip())
    singer_list.append(singer[i].text.strip())
    rank_list.append(i + 1)
# Look at the data as a table, then save it to a file
import pandas as pd
# build a dictionary of column names and data
info = {"가수": singer_list, "노래": song_list, "순위": rank_list}  # singer / song / rank
music_pd = pd.DataFrame(info)
# dictionary > DataFrame > csv
# encoding options: encoding="" / encoding="euc-kr" / encoding="utf-8-sig"
music_pd.to_csv("melon.csv", encoding="utf-8-sig")
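# A minimal round-trip check (my addition, standard pandas behavior): utf-8-sig writes a BOM
# so Excel on Windows shows the Hangul correctly; reading it back must use a matching encoding
check = pd.read_csv("melon.csv", encoding="utf-8-sig", index_col=0)
print(check.head())  # a mismatched encoding raises UnicodeDecodeError or shows broken Hangul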
# @ Collect titles and ratings from the movie-ranking page
res = req.get("https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=cur&date=20210908")
movie = res.text
soup = bs(movie, "lxml")
rate = soup.select("td.point")  # reach it by tag + class
print(len(rate))
# collect the movie titles and the ratings
title = soup.select("div.tit5 > a")
print(len(title))
title_list = []
rate_list = []
rank_list = []
# fill the lists, iterating over the total number of items found
for i in range(len(title)):
    title_list.append(title[i].text.strip())
    rate_list.append(rate[i].text.strip())
    rank_list.append(i + 1)
dic = {"순위": rank_list, "영화제목": title_list, "평점": rate_list}  # rank / movie title / rating
movie_pd = pd.DataFrame(dic)
movie_pd.set_index("순위", inplace=True)
# it can also be written out as html
movie_pd.to_html("movie.html")
# @ Collect audience reviews for a movie
import requests as req
from bs4 import BeautifulSoup as bs
res = req.get("https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code=192150&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false")
movie = res.text
soup = bs(movie, "lxml")
# Collect the review data; during preprocessing, delete or separate out the unnecessary parts
review = soup.select("div.score_reple > p")
print(len(review))
for i in review:
    print(i.text.strip())
# how to delete: strip only the unnecessary '관람객' (viewer) icon part
viewer = soup.select("span.ico_viewer")
for i in viewer:
    i.extract()
for i in review:
    print(i.text.strip())
# Library that shows the progress of a loop ★★
from tqdm import tqdm_notebook as tq
# wrapping the range with tq in a for loop shows the progress; fetch every page of data
# method: append the page number i to the url
for i in tq(range(1, 11)):
    res = req.get("https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code=192150&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page=" + str(i))
    movie = res.text
    soup = bs(movie, "lxml")
    viewer = soup.select("span.ico_viewer")
    review = soup.select("div.score_reple > p")
    for v in viewer:
        v.extract()
    for r in review:
        print(r.text.strip())
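# A variation sketch (my addition): collect the reviews into a list instead of printing,
# so they can feed a DataFrame like the earlier examples. The shortened url is an
# assumption; it is expected to serve the same review list as the long one above
review_list = []
base_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code=192150&type=after"
for page in tq(range(1, 11)):
    page_soup = bs(req.get(base_url + "&page=" + str(page)).text, "lxml")
    for icon in page_soup.select("span.ico_viewer"):
        icon.extract()
    for p in page_soup.select("div.score_reple > p"):
        review_list.append(p.text.strip())
print(len(review_list))  # 10 reviews per page -> roughly 100 rows expected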
# @@ Web-crawling technique: the Selenium library (a computer driving a computer). Bring it in first
# it is not a built-in library, so it has to be installed!
!pip install selenium
# webdriver = the library that controls the browser
# Keys = the library that acts as the keyboard for the computer
from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
driver = wb.Chrome()
driver.get("http://www.naver.com")
# give the computer short breaks in between steps
# to keep from overloading the server
import time
driver = wb.Chrome()
time.sleep(0.5)
driver.get("http://www.naver.com")
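# An alternative sketch (standard Selenium, my addition): instead of a fixed time.sleep,
# wait until a specific element actually exists. Assumes the Naver search box id "query"
# that is used just below
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "query")))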
# Things the driver can do, pulling from the current screen:
# driver.page_source : the html of the screen as it is now
# driver.find_element_by_id("~~~") : find an element by its id
# driver.find_element_by_css_selector : find via a CSS selector (a button, for example)
# driver.find_element_by_tag_name : find by tag name
inputArea = driver.find_element_by_id("query")
inputArea.send_keys("추석")  # type the search word (추석 = Chuseok)
# one way to submit: press Enter
inputArea.send_keys(Keys.ENTER)
# another way: click the search button
driver.find_element_by_css_selector("button.search_btn > a")  # locating it by CSS selector
btn = driver.find_element_by_id("search_btn")  # or by id
btn.click()
# Two ways to get data out of selenium: hand driver.page_source to bs, or query the driver directly
soup = bs(driver.page_source, 'lxml')
result = soup.select('.holi_txt')
result[0].text
driver.quit()  # close the browser afterwards; once the driver quits, page_source is gone
# @ Grab video titles and view counts from YouTube
from selenium import webdriver as wb
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
url = 'https://www.youtube.com/user/rladndgussla/videos'
driver = wb.Chrome()
driver.get(url)
body = driver.find_element_by_tag_name('body')
# scroll the page down to load more videos
for i in range(30):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.5)
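# An open-ended alternative sketch (my addition): keep scrolling until the page height
# stops growing, instead of a fixed 30 presses. Uses execute_script; reading
# document.documentElement.scrollHeight is an assumption about YouTube's page structure
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    body.send_keys(Keys.END)  # jump to the current bottom
    time.sleep(1)             # give newly loaded videos time to render
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:  # nothing new appeared -> done
        break
    last_height = new_height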
soup = bs(driver.page_source, 'lxml')
title = soup.select('#video-title')  # select by id
search = soup.select('#metadata-line > span:nth-child(1)')
title_list = []
search_list = []
rank_list = []
for i in range(len(title)):
    title_list.append(title[i].text)
    search_list.append(search[i].text)
    rank_list.append(i + 1)
print(len(title_list))
print(len(search_list))
print(len(rank_list))
dic = {'제목': title_list, '조회수': search_list, '순위': rank_list}  # title / views / rank
df = pd.DataFrame(dic)
df.set_index('순위', inplace=True)
# @ Image crawling
import os  # file-system library, e.g. create/delete files and folders, check whether they exist
from urllib.request import urlretrieve
# create a folder; in Python paths, don't use \ as the separator, use / instead
if not os.path.isdir('C:/Users/smhrd/test_study/img'):
    # if the folder does not exist at that location, create it
    os.mkdir('C:/Users/smhrd/test_study/img')
    print("folder created")
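# Aside (standard library behavior, my addition): os.makedirs with exist_ok=True creates
# any missing parent folders too and skips the isdir check entirely
os.makedirs('C:/Users/smhrd/test_study/img', exist_ok=True)  # no error if it already exists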
url = 'https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EC%95%84%EC%9D%B4%EC%9C%A0'
driver = wb.Chrome()
driver.get(url)
body = driver.find_element_by_tag_name('body')
for i in range(30):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.5)
soup = bs(driver.page_source, 'lxml')
img = soup.select('img._image._listImage')
# read the source location (src) of one image
img[25]['src']
# Grab every image (★★★)
img_list = []
for i in img:
    try:
        img_list.append(i['data-lazy-src'])  # lazy-loaded images keep the real url here
    except:
        img_list.append(i['src'])  # otherwise fall back to src
img_list
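# The same fallback without try/except: a sketch using Tag.get(), which returns None
# instead of raising when an attribute is missing
img_list = [i.get('data-lazy-src') or i.get('src') for i in img]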
cnt = 1
# each image is just a url; urlretrieve downloads it and writes it into the folder as a jpg
for i in img_list:
    urlretrieve(i, 'C:/Users/SM210715/크롤링/img/' + str(cnt) + '.jpg')
    cnt += 1
img_list = []
# Collect Hansot dosirak (한솥도시락) menu names and prices
driver = wb.Chrome()
driver.get("https://www.hsd.co.kr/menu/menu_list#none")
soup = bs(driver.page_source,"lxml")
title = soup.select("h4.h.fz_03")
# only the product prices!
price = soup.select("div.item-price > strong")
# click the '더보기' (more) button
btn_more = driver.find_element_by_css_selector("#btn_more > span > a")
try:
    for i in range(10):
        btn_more.click()
        time.sleep(2)
except:
    print("the 더보기 button no longer exists!")
# Collecting the Gmarket Top 10
driver = wb.Chrome()
driver.get("http://corners.gmarket.co.kr/Bestsellers")
div = driver.find_element_by_css_selector("p#no1 + span + div")
div.click()
# product name
driver.find_element_by_css_selector("h1.itemtit").text
# price info!
driver.find_element_by_css_selector("strong.price_real").text
# category!
driver.find_element_by_css_selector("li.on > a").text
# go back to the list
driver.back()
title_list = []
price_list = []
cate_list = []
driver = wb.Chrome()
driver.get("http://corners.gmarket.co.kr/Bestsellers")
for i in range(1, 6):
    div = driver.find_element_by_css_selector("p#no" + str(i) + " + span + div")
    div.click()
    time.sleep(1)
    title_list.append(driver.find_element_by_css_selector("h1.itemtit").text)
    price_list.append(driver.find_element_by_css_selector("strong.price_real").text)
    cate_list.append(driver.find_element_by_css_selector("li.on > a").text)
    time.sleep(0.5)
    driver.back()
    time.sleep(1)
driver.quit()
# final tidy-up with pandas
dic = {"상품명": title_list, "가격": price_list, "카테고리": cate_list}  # product name / price / category
pd.DataFrame(dic)
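# To keep the result, the same save step as the Melon example applies; a sketch,
# with gmarket.csv as an assumed file name
gmarket_pd = pd.DataFrame(dic)
gmarket_pd.to_csv("gmarket.csv", encoding="utf-8-sig")  # utf-8-sig so the Hangul opens cleanly in Excel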