웹프로그래밍 - 파이썬 API다루기, 크롤링, DB처리 3주차 정리

스파르타코딩클럽 웹프로그래밍 공부 3주차 개발일지 기록

파이썬 - API가져오기 기본 (requests)

!pip install requests # requests 라이브러리 설치

# Basic pattern for calling a JSON API with the requests library.

import requests

# Fine-dust (air quality) API. The timeout keeps the script from hanging
# indefinitely, and raise_for_status() surfaces HTTP errors here instead of
# failing later while decoding an error page as JSON.
r = requests.get('http://spartacodingclub.shop/sparta_api/seoulair', timeout=10)
r.raise_for_status()

rjson = r.json()  # decoded JSON body of the response

# Each row carries a district name (MSRSTE_NM) and an index value (IDEX_MVL).
rows = rjson['RealtimeCityAir']['row']

# Print only the districts whose index value is below 60.
for row in rows:
    gu_name = row['MSRSTE_NM']
    gu_mise = row['IDEX_MVL']
    if gu_mise < 60:
        print(gu_name, gu_mise)

파이썬 - 크롤링 기초 (BeautifulSoup (bs4))

!pip install bs4 requests # BeautifulSoup 라이브러리와 requests 라이브러리 설치

# BeautifulSoup library: turns HTML text data into a Soup object so values are easy to extract

# Basic crawling setup using requests + bs4

import requests

from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

# The headers make the request look as if a person loaded the page in a browser, rather than code fetching it

data = requests.get('https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20210829',headers=headers)

# Pass the URL and receive the HTML

soup = BeautifulSoup(data.text, 'html.parser') # the soup variable now holds the HTML in an easy-to-parse form

# From here on this is BeautifulSoup syntax rather than plain Python syntax

title = soup.select_one('#old_content > table > tbody > tr:nth-child(2) > td.title > div > a')

# The selector passed to select_one() was copied in Chrome: right-click the wanted part (movie title) > Inspect > right-click the selected tag > Copy > Copy selector, then paste

print(title)

# This raised an error (NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.)

# It works in PyCharm, but nth-child is not supported in this environment, so it has to be changed to nth-of-type

# Changed nth-child to nth-of-type

title = soup.select_one('#old_content > table > tbody > tr:nth-of-type(2) > td.title > div > a')

print(title['href']) # reads only the href attribute value

# Selectors for the first three movie titles, pasted via Copy selector:

#old_content > table > tbody > tr:nth-child(2) > td.title > div > a

#old_content > table > tbody > tr:nth-child(3) > td.title > div > a

#old_content > table > tbody > tr:nth-child(4) > td.title > div > a

# Common pattern: only the tr index differs, so select every tr and then
# look up the remaining part of the selector inside each row.

# select() rather than select_one(), because a list of rows is wanted.
movies = soup.select('#old_content > table > tbody > tr')

for movie in movies:
    # Title link of this row, looked up relative to the row element.
    a = movie.select_one('td.title > div > a')

    # Skip rows where the title link is missing.
    if a is not None:
        title = a.text  # .text gives the text content without the tags

        # nth-of-type is used instead of nth-child (only needed where
        # nth-child is not supported).
        rank = movie.select_one('td:nth-of-type(1) > img')['alt']

        star = movie.select_one('td.point').text

        print(rank, title, star)

# 선택자를 사용하는 방법 (copy selector)

# soup.select('태그명')

# soup.select('.클래스명')

# soup.select('#아이디명')

# soup.select('상위태그명 > 하위태그명 > 하위태그명')

# soup.select('상위태그명.클래스명 > 하위태그명.클래스명')

# 태그와 속성값으로 찾는 방법

# soup.select('태그명[속성="값"]')

# 한 개만 가져오고 싶은 경우

# soup.select_one('위와 동일')

파이썬 - mongoDB

!pip install pymongo dnspython #mongoDB 관련 라이브러리 설치

# Basic pymongo code: connect, pick a database, insert a few documents

from pymongo import MongoClient

client = MongoClient('...')

# In MongoDB choose Connect > Connect your application > Driver: Python, Version: 3.6 or later, then copy and paste the code shown there (presumably the connection string passed to MongoClient above)

# Replace <password> with the password you set, and put Cluster0 in front of ?retry

db = client.dbsparta # work with the dbsparta database

# Insert / save a value

doc = {

'name':'bob',

'age':27

}

db.users.insert_one(doc) # inserts the document above into users (the author calls it a "folder") of db

# The data is stored under <db + ID name>/users/

# The inserted data can be checked in MongoDB under Browse Collections

# Insert / save a value, variant 2

db.users.insert_one({'name':'kane','age':32})

db.users.insert_one({'name':'john','age':27})

db.users.insert_one({'name':'ann','age':24}) # documents can also be added inline like this (the approach above is the usual one)

# Fetch all documents.
# The projection {'_id': False} leaves out the _id field, which MongoDB
# generates automatically and which is not needed here.
all_users = list(db.users.find({}, {'_id': False}))

# Look at a single document and one of its fields. The guard avoids an
# IndexError when the collection is empty.
if all_users:
    print(all_users[0])
    print(all_users[0]['name'])

# Look at every document.
for user in all_users:
    print(user)

# Find everyone aged 27.
# To get several matches use find(), not find_one().
all_users = list(db.users.find({'age': 27}, {'_id': False}))

for user in all_users:
    print(user)

# Find a specific document (the one whose name is 'kane')

user = db.users.find_one({'name':'kane'})

print(user)

# Update a specific value: set age to 19 on one document named 'kane'

db.users.update_one({'name':'kane'},{'$set':{'age':19}})

# Fetch it again to see the result of the update

user = db.users.find_one({'name':'kane'})

print(user)

# Delete a specific document: removes one document named 'kane'

db.users.delete_one({'name':'kane'})

# Fetch it again to see the result of the delete (expected to print None if no other 'kane' document remains)

user = db.users.find_one({'name':'kane'})

print(user)

3-14 지니순위 스크래핑해서 db에 넣기

# Scrape the Genie chart page and store each entry (rank, artist, title)
# in the `music` collection of the dbsparta database.

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Browser-like User-Agent so the request looks like it came from a person
# using a browser rather than from code.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

# The timeout prevents hanging forever; raise_for_status() stops the script
# on an HTTP error instead of parsing an error page.
data = requests.get('https://www.genie.co.kr/chart/top200?ditc=M&rtm=N&ymd=20210701', headers=headers, timeout=10)
data.raise_for_status()

soup = BeautifulSoup(data.text, 'html.parser')

client = MongoClient('...')
db = client.dbsparta

# Selectors copied from the page for the first two rows (rank, title, artist).
# Only the tr index differs, so every tr is selected and the rest of the
# selector is applied per row.

#body-content > div.newest-list > div > table > tbody > tr:nth-child(1) > td.number

#body-content > div.newest-list > div > table > tbody > tr:nth-child(2) > td.number

#body-content > div.newest-list > div > table > tbody > tr:nth-child(1) > td.info > a.title.ellipsis

#body-content > div.newest-list > div > table > tbody > tr:nth-child(2) > td.info > a.title.ellipsis

#body-content > div.newest-list > div > table > tbody > tr:nth-child(1) > td.info > a.artist.ellipsis

#body-content > div.newest-list > div > table > tbody > tr:nth-child(2) > td.info > a.artist.ellipsis

musicranks = soup.select('#body-content > div.newest-list > div > table > tbody > tr')

for musicrank in musicranks:
    a = musicrank.select_one('td.info > a.title.ellipsis')

    # Skip rows that have no title link.
    if a is not None:
        # Reuse the element already selected above; strip() removes the
        # surrounding whitespace.
        music = a.text.strip()

        # Only the first two characters of the cell text are kept as the rank.
        # NOTE(review): this would truncate ranks of three digits — confirm
        # the page only lists ranks below 100.
        rank = musicrank.select_one('td.number').text[0:2].strip()

        artist = musicrank.select_one('td.info > a.artist.ellipsis').text.strip()

        print(rank, music, artist)

        # Store this chart entry in the music collection.
        doc = {
            'rank': rank,
            'artist': artist,
            'title': music,
        }
        db.music.insert_one(doc)

저작자표시 비영리 변경금지 (새창열림)

'개발일지' 카테고리의 다른 글

웹프로그래밍 - 서버(Flask)만들기, API, DB 관련 4주차 정리 (0)	2022.12.22
파이썬 데이터분석 4주차 정리 (주식 그래프 백테스팅) (0)	2022.12.19
파이썬 데이터분석 3주차 정리 (OpenDart의 API다루기) (0)	2022.12.18
웹프로그래밍 - jQuery, Ajax 기초 2주차 정리 (0)	2022.12.17
웹프로그래밍 - HTML/CSS/자바스크립트 기초 1주차 정리 (0)	2022.12.15

메타버스에서 살아남기

웹프로그래밍 - 파이썬 API다루기, 크롤링, DB처리 3주차 정리

스파르타코딩클럽 웹프로그래밍 공부 3주차 개발일지 기록

파이썬 - API가져오기 기본 (requests)

파이썬 - 크롤링 기초 (BeautifulSoup (bs4))

파이썬 - mongoDB

3-14 지니순위 스크래핑해서 db에 넣기

'개발일지' 카테고리의 다른 글

티스토리툴바

웹프로그래밍 - 파이썬 API다루기, 크롤링, DB처리 3주차 정리

스파르타코딩클럽 웹프로그래밍 공부 3주차 개발일지 기록

파이썬 - API가져오기 기본 (requests)

파이썬 - 크롤링 기초 (BeautifulSoup (bs4))

파이썬 - mongoDB

3-14 지니순위 스크래핑해서 db에 넣기

'개발일지' 카테고리의 다른 글

관련글

티스토리툴바