#주피터 노트북 블로그게시용 함수
from IPython.core.display import display, HTML
display(HTML("<style> .container{width:100% !important;}</style>"))

# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt #data visualization
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the "../영화관객수예측/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../영화관객수예측/"))

# Any results you write to the current directory are saved as output.

import sys

print ('Python version ->', sys.version)
print ('Numpy version ->', np.__version__)
print ('Pandas version ->', pd.__version__)

['.ipynb_checkpoints', 'movies_sub.csv', 'movies_test.csv', 'movies_train.csv', 'rate_test.csv', 'rate_train.csv', 'submission.csv', '네이버 평점 API 활용 정리.ipynb', '네이버 평점 API 활용.ipynb', '영화 관객수 예측 정리.ipynb', '영화 관객수 예측.ipynb', '영화 관객수 예측.py', '영화진흥위원회 API.ipynb']
Python version -> 3.7.0 (default, Jun 28 2018, 08:04:48) [MSC v.1912 64 bit (AMD64)]
Numpy version -> 1.16.0
Pandas version -> 0.24.2

1. Data set 불러오기¶

train_df = pd.read_csv('../영화관객수예측/movies_train.csv') # training dataframe
test_df = pd.read_csv('../영화관객수예측/movies_test.csv')# testing dataframe

#원본데이터는 보존하기 위함
train = train_df.copy()
test = test_df.copy()

print("train.csv. Shape: ",train.shape)
print("test.csv. Shape: ",test.shape)

train.csv. Shape:  (600, 12)
test.csv. Shape:  (243, 11)

train_null = train.drop('box_off_num', axis = 1).isnull().sum()/len(train)*100
test_null = test.isnull().sum()/len(test)*100
pd.DataFrame({'train_null_count' : train_null, 'test_null_count' : test_null})

missing data는 dir_prev_bfnum가 55%정도 있습니다. 그외에는 없습니다.

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
title             600 non-null object
distributor       600 non-null object
genre             600 non-null object
release_time      600 non-null object
time              600 non-null int64
screening_rat     600 non-null object
director          600 non-null object
dir_prev_bfnum    270 non-null float64
dir_prev_num      600 non-null int64
num_staff         600 non-null int64
num_actor         600 non-null int64
box_off_num       600 non-null int64
dtypes: float64(1), int64(5), object(6)
memory usage: 56.3+ KB

NAVER 검색 API사용¶

naver에서 제공하는 검색 API에서 평점이라는 데이터를 가지고 새로운 피쳐를 생성합니다.

train¶

# import os
# import sys
# import urllib.request
# import json
# import time
# import timeit

# start = timeit.default_timer()
# count = 0

# #API 사용 아이디 및 비밀번호 (비밀 값은 코드에 직접 적지 않고 환경변수에서 읽습니다)
# client_id = os.environ["NAVER_CLIENT_ID"]
# client_secret = os.environ["NAVER_CLIENT_SECRET"]

# train_title = train.loc[:,'title']
# train_director = train.loc[:,'director']
# train['user_rating'] = 0
# for movie, name in zip(train_title, train_director):
        
#     if count == 30:
#         count = 0
    
#     title = movie
#     director = name + '|'
#     encText = urllib.parse.quote(title)
#     display = '&display=100'
#     yearfrom = '&yearfrom=2010'
#     yearto = '&yearto=2015'
#     url = "https://openapi.naver.com/v1/search/movie?query=" + encText + display + yearfrom + yearto # json 결과
#     # url = "https://openapi.naver.com/v1/search/blog.xml?query=" + encText # xml 결과
#     request = urllib.request.Request(url)
#     request.add_header("X-Naver-Client-Id",client_id)
#     request.add_header("X-Naver-Client-Secret",client_secret)
#     response = urllib.request.urlopen(request)
#     rescode = response.getcode()
#     if(rescode==200):
#         response_body = response.read()
#     else:
#         print("Error Code:" + rescode)
#         break
        
#     result = json.loads(response_body)
#     for i in range(len(result['items'])):
#         if result['items'][i]['director'] == director:
#             train.loc[train['title']==title, 'user_rating'] = result['items'][i]['userRating']
            
#     count += 1
#     if count == 30:
#         time.sleep(1)

# stop = timeit.default_timer()
# print('불러오는데 걸린 시간 : {}초'.format(stop - start))
# print('rating이 0인 row 갯수 : {}개'.format(len(train[train['user_rating']==0])))

감독이나, 년도가 달라서 데이터가 0인 경우는 다시 조건을 넓혀 검색해봄

# import os
# import sys
# import urllib.request
# import json

# start = timeit.default_timer()
# count = 0

# client_id = os.environ["NAVER_CLIENT_ID"]  # 비밀 값은 환경변수에서 읽습니다
# client_secret = os.environ["NAVER_CLIENT_SECRET"]

# train_title = train.loc[train['user_rating']==0,'title']
# train_director = train.loc[train['user_rating']==0,'director']
# for movie, name in zip(train_title, train_director):
    
#     if count == 30:
#         count = 0
    
#     title = movie
#     director = name + '|'
#     encText = urllib.parse.quote(title)
#     display = '&display=100'
#     url = "https://openapi.naver.com/v1/search/movie?query=" + encText # json 결과
#     # url = "https://openapi.naver.com/v1/search/blog.xml?query=" + encText # xml 결과
#     request = urllib.request.Request(url)
#     request.add_header("X-Naver-Client-Id",client_id)
#     request.add_header("X-Naver-Client-Secret",client_secret)
#     response = urllib.request.urlopen(request)
#     rescode = response.getcode()
#     if(rescode==200):
#         response_body = response.read()
#     else:
#         print("Error Code:" + rescode)
#         break
        
#     result = json.loads(response_body)
#     for i in range(len(result['items'])):
#         if result['items'][i]['director'] == director:
#             train.loc[train['title']==title, 'user_rating'] = result['items'][i]['userRating']
            
#     count += 1
#     if count == 30:
#         time.sleep(1)
        
# stop = timeit.default_timer()
# print('불러오는데 걸린 시간 : {}초'.format(stop - start))
# print('rating이 0인 row 갯수 : {}개'.format(len(train[train['user_rating']==0])))

test¶

# start = timeit.default_timer()
# count = 0

# client_id = os.environ["NAVER_CLIENT_ID"]  # 비밀 값은 환경변수에서 읽습니다
# client_secret = os.environ["NAVER_CLIENT_SECRET"]

# test_title = test.loc[:,'title']
# test_director = test.loc[:,'director']
# test['user_rating'] = 0
# for movie, name in zip(test_title, test_director):
    
#     if count == 30:
#         count = 0
    
#     title = movie
#     director = name + '|'
#     encText = urllib.parse.quote(title)
#     display = '&display=100'
#     yearfrom = '&yearfrom=2010'
#     yearto = '&yearto=2015'
#     url = "https://openapi.naver.com/v1/search/movie?query=" + encText + display + yearfrom + yearto # json 결과
#     # url = "https://openapi.naver.com/v1/search/blog.xml?query=" + encText # xml 결과
#     request = urllib.request.Request(url)
#     request.add_header("X-Naver-Client-Id",client_id)
#     request.add_header("X-Naver-Client-Secret",client_secret)
#     response = urllib.request.urlopen(request)
#     rescode = response.getcode()
#     if(rescode==200):
#         response_body = response.read()
#     else:
#         print("Error Code:" + rescode)
#         break
        
#     result = json.loads(response_body)
#     for i in range(len(result['items'])):
#         if result['items'][i]['director'] == director:
#             test.loc[test['title']==title, 'user_rating'] = result['items'][i]['userRating']
            
#     count += 1
#     if count == 30:
#         time.sleep(1)
        
# stop = timeit.default_timer()
# print('불러오는데 걸린 시간 : {}초'.format(stop - start))
# print('rating이 0인 row 갯수 : {}개'.format(len(test[test['user_rating']==0])))

# start = timeit.default_timer()
# count = 0

# client_id = os.environ["NAVER_CLIENT_ID"]  # 비밀 값은 환경변수에서 읽습니다
# client_secret = os.environ["NAVER_CLIENT_SECRET"]

# test_title = test.loc[test['user_rating']==0,'title']
# test_director = test.loc[test['user_rating']==0,'director']
# for movie, name in zip(test_title, test_director):
    
#     if count == 30:
#         count = 0
    
#     title = movie
#     director = name + '|'
#     encText = urllib.parse.quote(title)
#     display = '&display=100'
#     url = "https://openapi.naver.com/v1/search/movie?query=" + encText # json 결과
#     # url = "https://openapi.naver.com/v1/search/blog.xml?query=" + encText # xml 결과
#     request = urllib.request.Request(url)
#     request.add_header("X-Naver-Client-Id",client_id)
#     request.add_header("X-Naver-Client-Secret",client_secret)
#     response = urllib.request.urlopen(request)
#     rescode = response.getcode()
#     if(rescode==200):
#         response_body = response.read()
#     else:
#         print("Error Code:" + rescode)
#     result = json.loads(response_body)
#     for i in range(len(result['items'])):
#         if result['items'][i]['director'] == director:
#             test.loc[test['title']==title, 'user_rating'] = result['items'][i]['userRating']
            
#     count += 1
#     if count == 30:
#         time.sleep(1)
        
# stop = timeit.default_timer()
# print('불러오는데 걸린 시간 : {}초'.format(stop - start))
# print('rating이 0인 row 갯수 : {}개'.format(len(test[test['user_rating']==0])))

# train[['title', 'user_rating']].to_csv('rate_train.csv', index=False)
# test[['title', 'user_rating']].to_csv('rate_test.csv', index=False)

#외부데이터(user_rating)이 포함된 csv파일
rate_train = pd.read_csv('../영화관객수예측/rate_train.csv') # user rating train 
rate_test = pd.read_csv('../영화관객수예측/rate_test.csv')# user rating test

train = pd.merge(train, rate_train, on='title')
test = pd.merge(test, rate_test, on='title')

2. Exploratory Data Analysis AND Processing¶

Column_name Description¶

title : 영화의 제목
distributor : 배급사
genre : 장르
release_time : 개봉일
time : 상영시간(분)
screening_rat : 상영등급
director : 감독이름
dir_prev_bfnum : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화에서의 평균 관객수(단 관객수가 알려지지 않은 영화 제외)
dir_prev_num : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화의 개수(단 관객수가 알려지지 않은 영화 제외)
num_staff : 스텝수
num_actor : 주연배우수
box_off_num : 관객수

user_rating(외부데이터) : 네이버에서 제공하는 평점 데이터

데이터 탐색 과정에서 이상한 값들이 있어서.. 미리 삭제하고 시작하겠습니다.

#슈퍼레이서 엔지의 time이 release_time데이터로 있어서 값을 수정하겠습니다.
test.loc[test['title'] == '슈퍼레이서 엔지', 'time'] = 63

#시리즈물중에 중복되어있어서 값을 수정하는 것 보다 삭제를 하겠습니다.
train.drop([10, 311], inplace=True)

2.1 Target Variable (Dependent Variable)¶

box_off_num : 관객수¶

train['box_off_num'].describe()

count    5.980000e+02
mean     7.105431e+05
std      1.830608e+06
min      1.000000e+00
25%      1.311250e+03
50%      1.282300e+04
75%      4.811002e+05
max      1.426277e+07
Name: box_off_num, dtype: float64

총 598개의 데이터 (중복된 2건 삭제 후)
평균 : 710543.1 약 71만
표준편차 : 1830608 약 183만
min : 1
max : 14262770 약 1400만

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['box_off_num'])
print("%s -> Skewness: %f, Kurtosis: %f" %  ('box_off_num',train['box_off_num'].skew(), 
                                                     train['box_off_num'].kurt()))

box_off_num -> Skewness: 4.324482, Kurtosis: 21.865598

왜도(Skewness) : 분포의 비대칭 정도. 데이터가 왼쪽에 몰려 오른쪽 꼬리가 길수록 양(+)의 값이 커지고, 오른쪽에 몰려 왼쪽 꼬리가 길수록 음(-)의 값이 커진다. 즉, 0에 가까울수록 좌우 대칭에 가까운 좋은 형태
첨도(Kurtosis) : pandas의 kurt()는 초과 첨도(Fisher 정의)를 반환하므로, 값이 0에 가까울 경우 정규분포에 가까우며, 값이 클수록 뾰족하고 값이 작을수록 완만해진다

train['box_off_num'] = np.log1p(train['box_off_num'])
print("%s -> Skewness: %f, Kurtosis: %f" %  ('box_off_num',train['box_off_num'].skew(), 
                                                     train['box_off_num'].kurt()))

box_off_num -> Skewness: 0.128244, Kurtosis: -1.051438

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['box_off_num'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a94563fd0>

2.2 Missing data¶

missing data가 있는 dir_prev_bfnum 부터 확인하겠습니다.

dir_prev_bfnum¶

감독이 해당 영화를 만들 기 전의 평균 관객수

#Nan값을 제외한 데이터 히스토그램 분포
f, ax = plt.subplots(figsize = (8,6))
train_bfnum = train[train['dir_prev_bfnum'].notnull()]
sns.distplot(train_bfnum['dir_prev_bfnum'])
print("%s -> Skewness: %f, Kurtosis: %f" %  ('dir_prev_bfnum',train_bfnum['dir_prev_bfnum'].skew(), 
                                                     train_bfnum['dir_prev_bfnum'].kurt()))

dir_prev_bfnum -> Skewness: 4.418835, Kurtosis: 30.071183

대부분이 작은 값을 가지고 있습니다.

dir_prev_bfnum은 감독이 해당영화 만들기전의 평균 관객수입니다.

하나의 영화를 만들었다면 과거데이터가 없어서 값이 없을 수도 있습니다.

따라서 1개의 영화를 만든 감독은 0의 데이터를 집어넣겠습니다.

# Stack train and test so the per-director bookkeeping sees every film.
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)
print('Null count :', all_data['dir_prev_bfnum'].isnull().sum())

# Rows whose dir_prev_bfnum is missing.
bfnum_null = all_data.loc[all_data['dir_prev_bfnum'].isnull()]

# For each director, the number of such rows (counted through 'title').
director_count = (
    bfnum_null[['title', 'director']]
    .groupby('director')
    .count()
    .reset_index()
)
print('Director Null count :', director_count['director'].count())

# A director with exactly one missing row is treated as having no earlier
# film, so the missing value is replaced by 0. Rows of that director that
# already carry a value are left untouched.
dire_1 = director_count.loc[director_count['title'] == 1, 'director']
single_gap = all_data['director'].isin(dire_1) & all_data['dir_prev_bfnum'].isnull()
all_data.loc[single_gap, 'dir_prev_bfnum'] = 0

Null count : 464
Director Null count : 362

나머지 null값에 대해서 확인해보겠습니다.

print('Null count :',all_data['dir_prev_bfnum'].isnull().sum())

#위와 똑같은 과정 반복
#한 감독이 2개 이상 만든 경우의 NaN 데이터프레임 생성
bfnum_null = all_data[all_data['dir_prev_bfnum'].isnull()]

director_count = bfnum_null[['title','director']].groupby('director').count()
director_count = director_count.reset_index()

print('Director Null count :',director_count['director'].count())

#관람객 수를 log취했기 때문에 원래대로 돌려 bfnum을 채우기 위한 용도로 사용.
all_data['box_off_num'] = np.expm1(all_data['box_off_num'])

Null count : 171
Director Null count : 69

# Helper that rebuilds dir_prev_bfnum row by row for one director's films.
def director_bfnum(all_director):
    """Fill ``dir_prev_bfnum`` with the running mean of earlier ``box_off_num``.

    The frame is processed in its current row order, so the caller is
    expected to pass one director's films already sorted chronologically.

    * Row 0 keeps its value, or gets 0 when the value is missing.
    * Row ``i`` (i >= 1) is overwritten with the mean of ``box_off_num`` over
      rows ``0 .. i-1``, whether or not it already held a value.

    The columns are located by name instead of by a fixed position, so the
    function keeps working if the column order of the frame changes.

    Parameters
    ----------
    all_director : pandas.DataFrame
        Must contain the columns ``dir_prev_bfnum`` and ``box_off_num``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Nothing to fill for an empty selection (iloc[0, ...] would raise).
    if len(all_director) == 0:
        return all_director

    bfnum_col = all_director.columns.get_loc('dir_prev_bfnum')

    # No earlier film known for the first row -> previous average is 0.
    if pd.isnull(all_director.iloc[0, bfnum_col]):
        all_director.iloc[0, bfnum_col] = 0

    running_total = 0
    for i in range(1, len(all_director)):
        # Accumulate the audience of the film just before row i; exactly i
        # films precede row i, so i is the divisor.
        # NOTE(review): rows whose box_off_num was zero-filled by the caller
        # are averaged in as 0 -- confirm that this is intended.
        running_total += all_director['box_off_num'].iloc[i - 1]
        all_director.iloc[i, bfnum_col] = running_total / i
    return all_director

#2개 이상인 데이터의 감독명을 받아옴
dire_2 = director_count['director']

#test데이터도 계산을 위해 nan값을 0의 값으로 가져온다.
#train과 test로 나눌 때 삭제해야함
all_data['box_off_num'].fillna(0, inplace=True)

for name in dire_2:
    all_director = all_data[all_data['director'] == name].sort_values('release_time')
    all_data[all_data['director']==name] = director_bfnum(all_director)

# Split the combined frame back into train and test.
# Explicit copies are taken so the column assignment below and later edits
# operate on independent frames instead of on slices of all_data (chained
# assignment on a slice is not guaranteed to take effect).
ntrain = len(train)
train = all_data[:ntrain].copy()
# box_off_num was only zero-filled on the test rows as a placeholder for the
# running-mean computation, so it is removed again here.
test = all_data[ntrain:].drop('box_off_num', axis=1)
# Put the target back on the log1p scale used for modelling.
train['box_off_num'] = np.log1p(train['box_off_num'])

train_null = train.drop('box_off_num', axis = 1).isnull().sum()/len(train)*100
test_null = test.isnull().sum()/len(test)*100
pd.DataFrame({'train_null_count' : train_null, 'test_null_count' : test_null})

말끔 ㅎㅎ..

겸사겸사 왜도, 첨도도 조사하여 조정해주겠습니다.

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['dir_prev_bfnum'])
# Report skewness/kurtosis of the column that is actually plotted (the filled
# train column), not of the earlier non-null subset train_bfnum, which would
# just repeat the pre-imputation numbers.
print("%s -> Skewness: %f, Kurtosis: %f" %  ('dir_prev_bfnum',train['dir_prev_bfnum'].skew(), 
                                                     train['dir_prev_bfnum'].kurt()))

dir_prev_bfnum -> Skewness: 4.418835, Kurtosis: 30.071183

train['dir_prev_bfnum'] = np.log1p(train['dir_prev_bfnum'])
test['dir_prev_bfnum'] = np.log1p(test['dir_prev_bfnum'])
print("%s -> Skewness: %f, Kurtosis: %f" %  ('dir_prev_bfnum',train['dir_prev_bfnum'].skew(), 
                                                     train['dir_prev_bfnum'].kurt()))

dir_prev_bfnum -> Skewness: 0.195287, Kurtosis: -1.696559

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['dir_prev_bfnum'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a978af160>

dir_prev_num¶

dir_prev_num이 0인데 dir_prev_bfnum이 값을 가진 경우가 있습니다.

이 경우에 감독이 영화를 하나만 만든경우는 1을 넣어주겠습니다.

#train과 test를 합침
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

#num이 0인데 값을 가진 경우를 새로운 데이터프레임으로 만듬
num_group = all_data[(all_data['dir_prev_num']==0) & 
                     (all_data['dir_prev_bfnum']>0)].groupby('director')[['dir_prev_num']].count().reset_index()

print('num_zero count : ', num_group['dir_prev_num'].count())

#해당하는 감독의 이름을 가져오기 위한 데이터프레임을 새로 생성
dire_1 = num_group[num_group['dir_prev_num']==1]['director']
for name in dire_1:
    all_data.loc[all_data['director']==name, 'dir_prev_num'] = 1

num_zero count :  57

그 외 데이터에 대해서 첫번째 num 값 이후로 순차적으로 데이터를 넣어주겠습니다.

def director_num(prev_num):
    """Renumber ``dir_prev_num`` sequentially for one director's films.

    The frame is processed in its current row order, so the caller is
    expected to pass one director's films already sorted chronologically.
    Row 0 keeps its value ``c``; the following rows are overwritten with
    ``c, c + 1, c + 2, ...``.

    The column is located by name instead of by a fixed position, so the
    function keeps working if the column order of the frame changes.

    Parameters
    ----------
    prev_num : pandas.DataFrame
        Must contain the column ``dir_prev_num``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Nothing to renumber for an empty selection (iloc[0, ...] would raise).
    if len(prev_num) == 0:
        return prev_num

    num_col = prev_num.columns.get_loc('dir_prev_num')
    count = prev_num.iloc[0, num_col]
    for i in range(1, len(prev_num)):
        # NOTE(review): row 1 receives the same value as row 0 because the
        # counter is incremented after the assignment -- confirm whether the
        # second film should get c + 1 instead.
        prev_num.iloc[i, num_col] = count
        count += 1
    return prev_num

director = all_data['director']
# Visit each director once. Iterating the raw column would reprocess a
# director once per film; director_num derives every row from the first
# row's value, so repeating it gives the same result and only costs time.
for name in director.unique():
    prev_num = all_data[all_data['director'] == name].sort_values('release_time')
    all_data[all_data['director']==name] = director_num(prev_num)

# Split the combined frame back into train and test.
# Explicit copies keep later column edits from acting on slices of all_data.
ntrain = len(train)
train = all_data[:ntrain].copy()
test = all_data[ntrain:].drop('box_off_num', axis=1)

f, ax = plt.subplots(figsize = (8,6))
sns.boxplot(train['dir_prev_num'], train['box_off_num'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a9792a3c8>

columns 상관관계¶

# Correlation heatmap of the train columns against the target.
k=20 # maximum number of variables shown in the heatmap
corrmat = train.corr() # pairwise correlation between columns
cols = corrmat.nlargest(k, 'box_off_num')['box_off_num'].index # up to k columns with the largest correlation to box_off_num
cm = np.corrcoef(train[cols].values.T)
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data = cm, annot=True, square=True, fmt = '.2f', linewidths=.5, cmap='Reds', 
            yticklabels = cols.values, xticklabels = cols.values)

<matplotlib.axes._subplots.AxesSubplot at 0x14a9775f470>

2.3 Categorical Variable¶

distributor (배급사)¶

object인 데이터들 우선적으로 탐색하며 데이터를 변형하겠습니다.

dis_unique = train['distributor'].unique()
len(dis_unique)

168

train_dist = train[['box_off_num','distributor']].groupby('distributor').mean().sort_values('box_off_num').reset_index()
train_dist.head()

# train_dist is sorted by mean box_off_num, so a distributor's row position
# in it serves as its ordinal code.
dist_num = {name: rank for rank, name in enumerate(train_dist['distributor'])}

# Apply the same mapping to both frames; distributors missing from the
# mapping become NaN.
train_test_data = [train, test]
for dataset in train_test_data:
    dataset['distributor'] = dataset['distributor'].map(dist_num)

test에는 train의 distributor가 없는 경우도 있기때문에

NaN값은 우선 0으로 채워넣겠습니다.

test['distributor'].fillna(0, inplace = True)

screening_rat¶

train['screening_rat'].unique()

array(['청소년 관람불가', '15세 관람가', '전체 관람가', '12세 관람가'], dtype=object)

replace_name = {'청소년 관람불가' : 'No Youth', '15세 관람가' : '15 years old', 
                        '전체 관람가' : 'G rating', '12세 관람가' : '12 years old' }
train.replace({'screening_rat' : replace_name}, inplace = True)
test.replace({'screening_rat' : replace_name}, inplace = True)
train.head()

f, ax = plt.subplots(figsize = (8,6))
sns.boxplot(x='screening_rat', y='box_off_num', data=train[['screening_rat', 'box_off_num']])

<matplotlib.axes._subplots.AxesSubplot at 0x14a979a89b0>

데이터의 중앙값에 비해 편차들이 아주 큽니다. 각 관람가에 해당하는 관객수의 평균을 확인해보겠습니다.

train[['box_off_num','screening_rat']].groupby('screening_rat').mean()

마지막에 라벨인코더로 처리하겠습니다.

라벨인코더가 가장 성능이 좋게 나왔습니다.

아마 test세트에서는 평균이 낮은 경우가 관람객수가 많고, 평균이 높은 경우 관람객수가 적은 듯함.

genre¶

train['genre'].unique()

array(['액션', '느와르', '코미디', '다큐멘터리', '뮤지컬', '드라마', '공포', '서스펜스', '멜로/로맨스',
       '애니메이션', '미스터리', 'SF'], dtype=object)

replace_name = {'액션' : 'Action', '느와르' : 'noir', '코미디' : 'comedy', '다큐멘터리' : 'documentary',
               '뮤지컬' : 'musical', '드라마' : 'drama', '멜로/로맨스' : 'melo/romance', '공포' : 'horror',
               '서스펜스' : 'suspense', '애니메이션' : 'animation', '미스터리' : 'mistery'}
train.replace({'genre' : replace_name}, inplace = True)
test.replace({'genre' : replace_name}, inplace = True)
train.head()

train_genre = train[['box_off_num','genre']].groupby('genre').mean().sort_values('box_off_num').reset_index()
train_genre

# train_genre is sorted by mean box_off_num, so a genre's row position in it
# serves as its ordinal code.
genre_num = {label: rank for rank, label in enumerate(train_genre['genre'])}
    
genre_num

{'documentary': 0,
 'musical': 1,
 'melo/romance': 2,
 'mistery': 3,
 'drama': 4,
 'animation': 5,
 'SF': 6,
 'suspense': 7,
 'horror': 8,
 'comedy': 9,
 'noir': 10,
 'Action': 11}

train.replace({'genre' : genre_num}, inplace = True)
test.replace({'genre' : genre_num}, inplace = True)
train.head()

f, ax = plt.subplots(figsize = (8,6))
sns.boxplot(x='genre', y='box_off_num', data=train[['genre', 'box_off_num']])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97a641d0>

director¶

len(train['director'].unique())

472

director는 그냥.. 삭제하는 부분이 제일 좋네요..

release_time¶

train_test_data = [train, test]

# Character ranges of each date part inside the dash-less 'YYYYMMDD' string.
date_parts = {'year': slice(0, 4), 'month': slice(4, 6), 'day': slice(6, 8)}

for dataset in train_test_data:
    # Strip the dashes, then derive integer year / month / day columns.
    compact = dataset['release_time'].map(lambda x : x.replace('-', ''))
    for part, span in date_parts.items():
        dataset[part] = compact.str[span].astype(int)
    # release_time itself is kept as a single YYYYMMDD integer.
    dataset['release_time'] = compact.astype(int)

f, ax = plt.subplots(figsize = (8,6))
sns.boxplot(x='year', y='box_off_num', data=train[['year', 'box_off_num']])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97b93da0>

f, ax = plt.subplots(figsize = (8,6))
sns.countplot(train['year'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97c65748>

2.4 Numeric Variable¶

num_staff¶

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['num_staff'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97cbcf60>

time¶

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['time'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97ba0860>

num_actor¶

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['num_actor'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97de4fd0>

user_rating¶

f, ax = plt.subplots(figsize = (8,6))
sns.distplot(train['user_rating'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97d8da90>

train.head()

f, ax = plt.subplots(figsize = (8,6))
sns.regplot(train['user_rating'], train['box_off_num'])

<matplotlib.axes._subplots.AxesSubplot at 0x14a97f36f28>

train_rating = train[train['user_rating']==0].sort_values('box_off_num')
train_rating.head()

test_rating = test[test['user_rating']==0]
test_rating.head()

user_rating이 제대로 적혀져 있지 않은 데이터에 한해서 네이버에서 웹크롤링으로 채워 넣겠습니다.

from bs4 import BeautifulSoup
import requests
import urllib.request
import json

def _fetch_naver_rating(title, name):
    """Return the rating shown on the Naver search page for a film, or None.

    The query is "<title> <director>". The two parts are joined with a space
    before URL-quoting so they stay separate search terms.
    """
    query = urllib.parse.quote(title + ' ' + name)
    url = 'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=' + query
    # A timeout keeps one unresponsive request from hanging the whole loop.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        return float(soup.find('dl', {'class':'r_grade'} ).find('em').text)
    except (AttributeError, ValueError):
        # Rating element not found on the page, or its text is not a number:
        # best effort, leave the existing value in place.
        return None


def _fill_missing_ratings(frame, missing_rows):
    """Overwrite ``user_rating`` in ``frame`` for each title in ``missing_rows``
    for which a rating could be scraped."""
    for title, name in zip(missing_rows['title'], missing_rows['director']):
        rating = _fetch_naver_rating(title, name)
        if rating is not None:
            frame.loc[frame['title']==title, 'user_rating'] = rating


# train_rating / test_rating hold the rows whose user_rating is still 0.
_fill_missing_ratings(train, train_rating)
_fill_missing_ratings(test, test_rating)

2.5 Preprocessing¶

from sklearn.preprocessing import LabelEncoder
cols = ['screening_rat', 'director']
# process columns, apply LabelEncoder to categorical features

ntrain = len(train)

all_data = pd.concat([train, test], sort=False).reset_index(drop=True)
target = train['box_off_num']
all_data.drop(['box_off_num', 'title'], axis=1, inplace=True)

for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

# shape        
print('Shape all_data: {}'.format(all_data.shape))

Shape all_data: (841, 14)

ntrain = len(train)
train = all_data[:ntrain]
test = all_data[ntrain:]

#상관관계 확인
k=20 #히트맵 변수 갯수
train = pd.concat([train, target], axis=1)
corrmat = train.corr() #변수간의 상관관계
cols = corrmat.nlargest(k, 'box_off_num')['box_off_num'].index #price기준으로 제일 큰순서대로 20개를 뽑아냄
cm = np.corrcoef(train[cols].values.T)
f, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(data = cm, annot=True, square=True, fmt = '.2f', linewidths=.5, cmap='Reds', 
            yticklabels = cols.values, xticklabels = cols.values)

<matplotlib.axes._subplots.AxesSubplot at 0x14a998b1358>

3. Feature Engineering¶

train_test_data = [train, test]

for dataset in train_test_data:
    #배우 1명당 스태프수
    dataset['staff_per_actor'] = 0
    dataset.loc[dataset['num_actor']>0, 'staff_per_actor'] = dataset['num_staff']/dataset['num_actor']

train.head()

train = train.drop(['director'], axis= 1)
test = test.drop([ 'director'], axis= 1)

# Collect the columns whose distribution looks far from normal: skewness of
# at least 1, or otherwise kurtosis of at least 3.
train_columns = []
for column in train.columns:
    if train[column].skew() >= 1 or train[column].kurt() >= 3:
        print("%s -> Skewness: %f, Kurtosis: %f" %  (column,train[column].skew(), 
                                                 train[column].kurt()))
        train_columns.append(column)

dir_prev_num -> Skewness: 1.115659, Kurtosis: 0.926548
num_staff -> Skewness: 1.229350, Kurtosis: 1.093740
num_actor -> Skewness: 2.947182, Kurtosis: 16.556836
staff_per_actor -> Skewness: 2.588273, Kurtosis: 12.309039

#정규분포모형을 가질 수 있도록 첨도와 왜도를 조정
#조정하는 방법에는 square root, quarter root, log 등이 있다.
#log에서 0의 값이 들어왔을 때 무한으로 가는 것을 방지하도록 1 더해주는 log1p를 사용

for column in train_columns :
    train[column] = np.log1p(train[column])
    test[column] = np.log1p(test[column])
    print("%s -> Skewness: %f, Kurtosis: %f" %  (column,train[column].skew(), 
                                                 train[column].kurt()))

dir_prev_num -> Skewness: 0.327423, Kurtosis: -1.153786
num_staff -> Skewness: -0.708332, Kurtosis: -0.469748
num_actor -> Skewness: 0.066009, Kurtosis: 1.569499
staff_per_actor -> Skewness: -0.381775, Kurtosis: -0.927274

#상관관계 확인
k=20 #히트맵 변수 갯수
corrmat = train.corr() #변수간의 상관관계
cols = corrmat.nlargest(k, 'box_off_num')['box_off_num'].index #price기준으로 제일 큰순서대로 20개를 뽑아냄
cm = np.corrcoef(train[cols].values.T)
f, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(data = cm, annot=True, square=True, fmt = '.2f', linewidths=.5, cmap='Reds', 
            yticklabels = cols.values, xticklabels = cols.values)

<matplotlib.axes._subplots.AxesSubplot at 0x14a9a140d68>

4. Modeling¶

train.head()

from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

D:\Users\WIN10\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

target = train['box_off_num']
del train['box_off_num']

#cross validation score
n_folds = 5

def cv_score(models):
    """Print cross-validated scores for each model.

    Reads the module-level ``train``, ``target`` and ``n_folds``.

    For every entry two cross-validations are run on the same folds:

    * the estimator's default score, fitted on ``target`` as-is;
    * RMSE, fitted on ``np.expm1(target)``.

    Parameters
    ----------
    models : list of dict
        Each dict has the keys ``'model'`` (an estimator) and ``'name'``
        (label used in the printout).
    """
    # Pass the splitter object itself. Calling .get_n_splits() on it returns
    # just the integer fold count, which makes cross_val_score fall back to
    # its default splitter and drops shuffle=True / random_state=42.
    kfold = KFold(n_splits=n_folds, shuffle=True ,random_state=42)
    for m in models:
        cvs = np.mean(cross_val_score(m['model'], train.values, target, cv=kfold))
        # NOTE(review): this refits the model on the expm1-transformed target
        # rather than scoring back-transformed predictions of the model fitted
        # on `target` -- confirm that this is the intended RMSE.
        rmse = np.mean(np.sqrt(-cross_val_score(m['model'], train.values, np.expm1(target), scoring = "neg_mean_squared_error", cv = kfold)))
        print("Model {} CV score : {:.4f}".format(m['name'], cvs))
        print("RMSE : {:.4f}".format(rmse))

lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=42))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=42))
forest = RandomForestRegressor(random_state=42)
gboost = GradientBoostingRegressor(random_state=42)
xgboost = xgb.XGBRegressor(random_state=42)
lightgbm = lgb.LGBMRegressor(random_state=42)

models = [{'model':gboost, 'name':'GradientBoosting'}, {'model':xgboost, 'name':'XGBoost'},
          {'model':lightgbm, 'name':'LightGBM'}, {'model' : lasso, 'name' : 'LASSO Regression'}, 
          {'model' : ENet, 'name' : 'Elastic Net Regression'}, {'model' : forest, 'name' : 'RandomForset'}]

cv_score(models)

Model GradientBoosting CV score : 0.7908
RMSE : 1395554.7432
Model XGBoost CV score : 0.7943
RMSE : 1310919.8599
Model LightGBM CV score : 0.7861
RMSE : 1358511.2751
Model LASSO Regression CV score : 0.7685
RMSE : 1557034.0707
Model Elastic Net Regression CV score : 0.7685
RMSE : 1557020.8942
Model RandomForset CV score : 0.7713
RMSE : 1447661.4729

#x.values 는 배열로 데이터를 뽑아옴
#여러개의 모델로 만들어진 predict 데이터들을 구한다.

models = [{'model':xgboost, 'name':'XGBoost'},
          {'model':lightgbm, 'name':'LightGBM'}]

def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return their predictions for sub_x.

    Despite the name no averaging happens here; the caller combines the
    returned columns.

    Parameters
    ----------
    models : list of dict
        Each dict holds an estimator under the key ``'model'``.
    x, sub_x : objects exposing ``.values``
        Training features and the features to predict on.
    y : array-like
        Training target.

    Returns
    -------
    numpy.ndarray
        One column per model, in the order of ``models``.
    """
    # Train all estimators first ...
    for entry in models:
        entry['model'].fit(x.values, y)

    # ... then gather one prediction vector per estimator.
    columns = []
    for entry in models:
        columns.append(entry['model'].predict(sub_x.values))
    return np.column_stack(columns)

y_test_pred = AveragingBlending(models, train, target, train)
y_test_pred = (y_test_pred[:, 1]*0.9 + y_test_pred[:, 0]*0.1)
print(np.sqrt(mean_squared_error(np.expm1(target), np.expm1(y_test_pred))))

758445.1753594752

y_test_pred = AveragingBlending(models, train, target, test)
y_test_pred = (y_test_pred[:, 1]*0.9 + y_test_pred[:, 0]*0.1)
predictions = y_test_pred

# lightgbm.fit(train.values, target)
# predictions = lightgbm.predict(test.values)

sub = pd.read_csv('../영화관객수예측/submission.csv')

sub['box_off_num'] = np.expm1(predictions)

sub.to_csv('movies_sub.csv', index=False)

	distributor	box_off_num
0	인피니티엔터테인먼트	1.098612
1	고구마공작소	2.197225
2	사람과 사람들	3.761200
3	위드시네마	3.850148
4	나우콘텐츠	4.007333

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	user_rating
0	개들의 전쟁	150	액션	2012-11-22	96	No Youth	조병옥	0.000000	0	91	2	10.060449	7.23
1	내부자들	164	느와르	2015-11-19	130	No Youth	우민호	13.965312	1	387	3	15.771725	9.04
2	은밀하게 위대하게	164	액션	2013-06-05	123	15 years old	장철수	12.304905	3	343	4	15.755558	7.11
3	나는 공무원이다	156	코미디	2012-07-12	101	G rating	구자홍	10.081425	2	20	6	12.291640	6.12
4	불량남녀	162	코미디	2010-11-04	108	15 years old	신근호	0.693147	1	251	2	13.088575	8.14

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	user_rating
0	개들의 전쟁	150	Action	2012-11-22	96	No Youth	조병옥	0.000000	0	91	2	10.060449	7.23
1	내부자들	164	noir	2015-11-19	130	No Youth	우민호	13.965312	1	387	3	15.771725	9.04
2	은밀하게 위대하게	164	Action	2013-06-05	123	15 years old	장철수	12.304905	3	343	4	15.755558	7.11
3	나는 공무원이다	156	comedy	2012-07-12	101	G rating	구자홍	10.081425	2	20	6	12.291640	6.12
4	불량남녀	162	comedy	2010-11-04	108	15 years old	신근호	0.693147	1	251	2	13.088575	8.14

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	user_rating
0	개들의 전쟁	150	11	2012-11-22	96	No Youth	조병옥	0.000000	0	91	2	10.060449	7.23
1	내부자들	164	10	2015-11-19	130	No Youth	우민호	13.965312	1	387	3	15.771725	9.04
2	은밀하게 위대하게	164	11	2013-06-05	123	15 years old	장철수	12.304905	3	343	4	15.755558	7.11
3	나는 공무원이다	156	9	2012-07-12	101	G rating	구자홍	10.081425	2	20	6	12.291640	6.12
4	불량남녀	162	9	2010-11-04	108	15 years old	신근호	0.693147	1	251	2	13.088575	8.14

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	user_rating	year	month	day
0	개들의 전쟁	150	11	20121122	96	No Youth	조병옥	0.000000	0	91	2	10.060449	7.23	2012	11	22
1	내부자들	164	10	20151119	130	No Youth	우민호	13.965312	1	387	3	15.771725	9.04	2015	11	19
2	은밀하게 위대하게	164	11	20130605	123	15 years old	장철수	12.304905	3	343	4	15.755558	7.11	2013	6	5
3	나는 공무원이다	156	9	20120712	101	G rating	구자홍	10.081425	2	20	6	12.291640	6.12	2012	7	12
4	불량남녀	162	9	20101104	108	15 years old	신근호	0.693147	1	251	2	13.088575	8.14	2010	11	4

인기포스트 MORE POST

ABOUT ME

CasimIT CasimIT

1. Data set 불러오기¶

NAVER 검색 API사용¶

train¶

test¶

2. Exploratory Data Analysis AND Processing¶

Column_name Description¶

2.1 Target Variable (Dependent Variable)¶

box_off_num : 관객수¶

2.2 Missing data¶

dir_prev_bfnum¶

dir_prev_num¶

columns 상관관계¶

2.3 Categorical Variable¶

distributor (배급사)¶

screening_rat¶

genre¶

director¶

release_time¶

2.4 Numeric Variable¶

num_staff¶

time¶

num_actor¶

user_rating¶

2.5 Preprocessing¶

3. Feature Engineering¶

4. Modeling¶

'IT 지식 창고' 카테고리의 다른 글

티스토리툴바

	box_off_num
screening_rat
12 years old	10.291901
15 years old	11.218375
G rating	8.943622
No Youth	8.998506

	genre	box_off_num
0	documentary	8.232809
1	musical	8.393272
2	melo/romance	9.225152
3	mistery	9.249889
4	drama	9.690253
5	animation	10.058547
6	SF	10.412894
7	suspense	10.955169
8	horror	11.107524
9	comedy	11.448555
10	noir	12.813247
11	Action	12.897125

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	box_off_num	year	month	day
310	집	146	5	20110317	83	15 years old	박미선	0.000000	0	198	6	4.406719	2011	3	17
414	은실이	146	5	20120308	71	No Youth	김선아	0.000000	0	17	3	5.389072	2012	3	8
306	당신은 아름답다	31	0	20100826	75	G rating	백승창	13.084579	4	0	1	5.484797	2010	8	26
146	정글피쉬2	59	4	20110303	101	15 years old	김정환	0.000000	1	111	5	5.817111	2011	3	3
110	여행	72	4	20100520	148	G rating	배창호	0.000000	0	61	6	6.186209	2010	5	20

	title	distributor	genre	release_time	time	screening_rat	director	dir_prev_bfnum	dir_prev_num	num_staff	num_actor	year	month	day
600	하모니	146.0	4	20100128	115	12 years old	강대규	15.252168	3	419	7	2010	1	28
606	아마존의 눈물 극장판	0.0	0	20100325	88	15 years old	김진만	0.000000	0	76	1	2010	3	25
638	량강도 아이들	0.0	4	20111117	95	G rating	김성훈	12.002701	1	131	9	2011	11	17
640	더 킥	162.0	11	20111103	105	12 years old	프라챠 핀카엡	0.000000	0	18	7	2011	11	3
658	나는 아빠다	150.0	11	20110414	99	No Youth	전만배	12.995377	1	423	5	2011	4	14

주가등락예측 프로젝트(캡스톤 디자인) (0)	2019.07.07
(DACON) 아파트 실거래가 예측 튜토리얼 대회 (0)	2019.07.07
(Kaggle) 2019 2nd ML month KaKR - House Price (0)	2019.04.21
주가 예측 딥 러닝을 위한 자료들 (0)	2019.04.02
KNN(K Neighbor Nearest)이란? (0)	2019.03.25

인기포스트 MORE POST

ABOUT ME

1. Data set 불러오기¶

NAVER 검색 API사용¶

train¶

test¶

2. Exploratory Data Analysis AND Processing¶

Column_name Description¶

2.1 Target Variable (Dependent Variable)¶

box_off_num : 관객수¶

2.2 Missing data¶

dir_prev_bfnum¶

dir_prev_num¶

columns 상관관계¶

2.3 Categorical Variable¶

distributor (배급사)¶

screening_rat¶

genre¶

director¶

release_time¶

2.4 Numeric Variable¶

num_staff¶

time¶

num_actor¶

user_rating¶

2.5 Preprocessing¶

3. Feature Engineering¶

4. Modeling¶

'IT 지식 창고' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바