Exploration 4: Movie Review Sentiment Classification
Sentiment Analysis of Movie Review Text
1. Representing Text as Numbers
In [11]:
# Put the sentences we want to process into a Python list.
sentences=['i feel hungry', 'i eat lunch', 'now i feel happy']
# Split a sentence into words using Python's split() method.
word_list = 'i feel hungry'.split()
print(word_list)
['i', 'feel', 'hungry']
In [12]:
index_to_word={}  # start with an empty dictionary
# Fill it with words one by one. The order here is arbitrary; it does not actually matter.
# By convention, <BOS>, <PAD>, and <UNK> go at the front of the dictionary.
index_to_word[0]='<PAD>'  # padding token
index_to_word[1]='<BOS>'  # beginning of sentence
index_to_word[2]='<UNK>'  # unknown word (not in the dictionary)
index_to_word[3]='i'
index_to_word[4]='feel'
index_to_word[5]='hungry'
index_to_word[6]='eat'
index_to_word[7]='lunch'
index_to_word[8]='now'
index_to_word[9]='happy'
print(index_to_word)
{0: '<PAD>', 1: '<BOS>', 2: '<UNK>', 3: 'i', 4: 'feel', 5: 'hungry', 6: 'eat', 7: 'lunch', 8: 'now', 9: 'happy'}
In [13]:
# Convert to a {word: index} mapping
word_to_index={word:index for index, word in index_to_word.items()}
print(word_to_index)
{'<PAD>': 0, '<BOS>': 1, '<UNK>': 2, 'i': 3, 'feel': 4, 'hungry': 5, 'eat': 6, 'lunch': 7, 'now': 8, 'happy': 9}
In [14]:
print(word_to_index['feel'])  # the word 'feel' maps to index 4
4
In [15]:
# A function that, given a single sentence and the dictionary, converts it into a list of word indices.
# Every sentence is assumed to start with <BOS>.
def get_encoded_sentence(sentence, word_to_index):
    return [word_to_index['<BOS>']]+[word_to_index[word] if word in word_to_index else word_to_index['<UNK>'] for word in sentence.split()]

print(get_encoded_sentence('i eat lunch', word_to_index))
# each word is replaced by its index
[1, 3, 6, 7]
In [16]:
# A function that encodes a list of sentences into index lists all at once.
def get_encoded_sentences(sentences, word_to_index):
    return [get_encoded_sentence(sentence, word_to_index) for sentence in sentences]

# sentences=['i feel hungry', 'i eat lunch', 'now i feel happy'] is converted as shown below.
encoded_sentences = get_encoded_sentences(sentences, word_to_index)
print(encoded_sentences)
[[1, 3, 4, 5], [1, 3, 6, 7], [1, 8, 3, 4, 9]]
In [17]:
# A function that decodes an index-encoded sentence back into text.
def get_decoded_sentence(encoded_sentence, index_to_word):
    return ' '.join(index_to_word[index] if index in index_to_word else '<UNK>' for index in encoded_sentence[1:])  # [1:] skips the leading <BOS>

print(get_decoded_sentence([1, 3, 4, 5], index_to_word))
i feel hungry
In [18]:
# A function that decodes several index-encoded sentences back into text at once.
def get_decoded_sentences(encoded_sentences, index_to_word):
    return [get_decoded_sentence(encoded_sentence, index_to_word) for encoded_sentence in encoded_sentences]

# encoded_sentences=[[1, 3, 4, 5], [1, 3, 6, 7], [1, 8, 3, 4, 9]] is converted as shown below.
print(get_decoded_sentences(encoded_sentences, index_to_word))
['i feel hungry', 'i eat lunch', 'now i feel happy']
For example, if word_to_index['great'] is 1918, then the word vector for 'great' in the embedding space is row 1918 of the Embedding layer, which is organized as a lookup table (the 1919th row when counting from 1), and can be represented by something like [1.2, 0.7, 1.9, 1.5].
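A minimal sketch of that lookup (the index 1918 and the 4-dimensional size come from the illustrative example above; the printed values are random because the layer is untrained):
import tensorflow as tf

# Hypothetical sizes for illustration: a 10,000-word vocabulary and 4-dim word vectors.
embedding = tf.keras.layers.Embedding(input_dim=10000, output_dim=4)

great_index = 1918                                    # assumed index of 'great'
great_vector = embedding(tf.constant([great_index]))  # looks up row 1918 of the lookup table
print(great_vector.numpy())                           # shape (1, 4)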
In [19]:
# Note: converting sentences of different lengths straight into a NumPy array does not work properly, as explained below.
import numpy as np
import tensorflow as tf
from tensorflow import keras
vocab_size = len(word_to_index)  # the dictionary above contains 10 words
word_vector_dim = 4  # assume 4-dimensional word vectors, as in the example above
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=word_vector_dim, mask_zero=True)
# Apply the Embedding layer to the index-encoded text data [[1, 3, 4, 5], [1, 3, 6, 7], [1, 8, 3, 4, 9]].
# A Python list of sentences must be converted to a numpy array before it can be fed to a deep learning layer.
raw_inputs = np.array(get_encoded_sentences(sentences, word_to_index))
# Feeding this directly does not work, because the sentences in raw_inputs have different lengths.
# raw_inputs = np.array(get_encoded_sentences(sentences, word_to_index))
# TensorFlow provides keras.preprocessing.sequence.pad_sequences, which appends padding (<PAD>)
# to each sentence vector so that all sentences have the same length, so we run it first.
raw_inputs = keras.preprocessing.sequence.pad_sequences(raw_inputs,
                                                        value=word_to_index['<PAD>'],
                                                        padding='post',
                                                        maxlen=5)
print(raw_inputs)
output = embedding(raw_inputs)
print(output)
[[1 3 4 5 0]
 [1 3 6 7 0]
 [1 8 3 4 9]]
tf.Tensor(
[[[-0.01419399  0.04435832  0.03141144 -0.01476296]
  [ 0.00493108 -0.04027823  0.0009155   0.04399994]
  [-0.03199576 -0.03502614 -0.01556406  0.01899946]
  [-0.02030488  0.0213698   0.04063449 -0.01634901]
  [ 0.03792307  0.04741819  0.02564708  0.01937077]]

 [[-0.01419399  0.04435832  0.03141144 -0.01476296]
  [ 0.00493108 -0.04027823  0.0009155   0.04399994]
  [-0.00091137 -0.01864381  0.00090228 -0.04275799]
  [-0.0268575   0.01646766  0.02127625 -0.02600731]
  [ 0.03792307  0.04741819  0.02564708  0.01937077]]

 [[-0.01419399  0.04435832  0.03141144 -0.01476296]
  [ 0.04059373  0.04142031  0.04290563  0.0032341 ]
  [ 0.00493108 -0.04027823  0.0009155   0.04399994]
  [-0.03199576 -0.03502614 -0.01556406  0.01899946]
  [-0.02797966  0.02402735  0.03957239 -0.03447608]]], shape=(3, 5, 4), dtype=float32)
/home/ssac24/anaconda3/envs/aiffel/lib/python3.7/site-packages/ipykernel_launcher.py:17: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
2. Handling Sequence Data with an RNN
In [20]:
# Processing text data with an RNN model
vocab_size = 10  # size of the vocabulary (10 words)
word_vector_dim = 4  # dimension of the embedding vector for a single word
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.LSTM(8))  # LSTM, the most widely used RNN; the LSTM state vector dimension is set to 8 (adjustable)
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # the final output is 1-dim: positive/negative
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, None, 4) 40 _________________________________________________________________ lstm_1 (LSTM) (None, 8) 416 _________________________________________________________________ dense_2 (Dense) (None, 8) 72 _________________________________________________________________ dense_3 (Dense) (None, 1) 9 ================================================================= Total params: 537 Trainable params: 537 Non-trainable params: 0 _________________________________________________________________
In [21]:
# Processing text with a CNN
vocab_size = 10  # size of the vocabulary (10 words)
word_vector_dim = 4  # dimension of the embedding vector for a single word
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.Conv1D(16, 7, activation='relu'))
model.add(keras.layers.MaxPooling1D(5))
model.add(keras.layers.Conv1D(16, 7, activation='relu'))
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # the final output is 1-dim: positive/negative
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, None, 4) 40 _________________________________________________________________ conv1d (Conv1D) (None, None, 16) 464 _________________________________________________________________ max_pooling1d (MaxPooling1D) (None, None, 16) 0 _________________________________________________________________ conv1d_1 (Conv1D) (None, None, 16) 1808 _________________________________________________________________ global_max_pooling1d (Global (None, 16) 0 _________________________________________________________________ dense_4 (Dense) (None, 8) 136 _________________________________________________________________ dense_5 (Dense) (None, 1) 9 ================================================================= Total params: 2,457 Trainable params: 2,457 Non-trainable params: 0 _________________________________________________________________
In [22]:
# An approach that uses GlobalMaxPooling1D() to extract only the single most salient feature from the whole sentence
# and judge positive/negative from that alone (see the sketch after this cell).
vocab_size = 10  # size of the vocabulary (10 words)
word_vector_dim = 4  # dimension of the embedding vector for a single word
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # the final output is 1-dim: positive/negative
model.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_4 (Embedding) (None, None, 4) 40 _________________________________________________________________ global_max_pooling1d_1 (Glob (None, 4) 0 _________________________________________________________________ dense_6 (Dense) (None, 8) 40 _________________________________________________________________ dense_7 (Dense) (None, 1) 9 ================================================================= Total params: 89 Trainable params: 89 Non-trainable params: 0 _________________________________________________________________
3. Movie Review Sentiment Analysis (IMDB)
In [23]:
# Data preprocessing
import tensorflow as tf
from tensorflow import keras
import numpy as np
print(tf.__version__)
imdb = keras.datasets.imdb
# Download the IMDB dataset
# keep only the 10,000 most frequent words (num_words=10000)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
print("훈련 샘플 개수: {}, 테스트 개수: {}".format(len(x_train), len(x_test)))
2.2.0
Training samples: 25000, test samples: 25000
In [24]:
print(x_train[0])  # the first review in the training set
print('Label: ', y_train[0])  # label of the first review
print('Length of the 1st review: ', len(x_train[0]))
print('Length of the 2nd review: ', len(x_train[1]))
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Label:  1
Length of the 1st review:  218
Length of the 2nd review:  189
In [25]:
word_to_index = imdb.get_word_index()
index_to_word = {index:word for word, index in word_to_index.items()}
print(index_to_word[1])      # prints 'the'
print(word_to_index['the'])  # prints 1
the
1
In [26]:
# word_to_index is sorted by word frequency in the IMDB dataset, in descending order.
# The actual encoding indices are shifted by 3 relative to the raw word_to_index provided.
word_to_index = {k:(v+3) for k,v in word_to_index.items()}
# The first few indices are reserved for special tokens.
word_to_index["<PAD>"] = 0
word_to_index["<BOS>"] = 1
word_to_index["<UNK>"] = 2 # unknown
word_to_index["<UNUSED>"] = 3
index_to_word[0] = "<PAD>"
index_to_word[1] = "<BOS>"
index_to_word[2] = "<UNK>"
index_to_word[3] = "<UNUSED>"
index_to_word = {index:word for word, index in word_to_index.items()}
print(index_to_word[1])      # prints '<BOS>'
print(word_to_index['the'])  # prints 4
print(index_to_word[4])      # prints 'the'
<BOS>
4
the
In [27]:
print(get_decoded_sentence(x_train[0], index_to_word))
print('Label: ', y_train[0])  # label of the first review
this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all
Label:  1
In [28]:
# Examining the distribution of sentence lengths and choosing maxlen accordingly helps performance.
total_data_text = list(x_train) + list(x_test)
# Build a list of sentence lengths in the text data,
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
# then compute the mean, maximum, and standard deviation of the lengths.
print('Mean sentence length : ', np.mean(num_tokens))
print('Max sentence length : ', np.max(num_tokens))
print('Std of sentence length : ', np.std(num_tokens))
# For example, if we set the maximum length to (mean + 2 * std),
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
print('Fraction of sentences within maxlen: {}'.format(np.sum(num_tokens < max_tokens) / len(num_tokens)))
Mean sentence length :  234.75892
Max sentence length :  2494
Std of sentence length :  172.91149458735703
pad_sequences maxlen :  580
Fraction of sentences within maxlen: 0.94536
In [29]:
# Results can differ depending on where the padding goes ('pre' vs 'post').
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                     value=word_to_index["<PAD>"],
                                                     padding='post',  # or 'pre'
                                                     maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
                                                    value=word_to_index["<PAD>"],
                                                    padding='post',  # or 'pre'
                                                    maxlen=maxlen)
print(x_train.shape)
(25000, 580)
In [30]:
# RNN model design
vocab_size = 10000  # vocabulary size (10,000 words)
word_vector_dim = 16  # word vector dimension (tunable hyperparameter)
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.LSTM(8))  # LSTM, the most widely used RNN; the state vector dimension is set to 8 (adjustable)
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # the final output is 1-dim: positive/negative
model.summary()
Model: "sequential_4" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_5 (Embedding) (None, None, 16) 160000 _________________________________________________________________ lstm_2 (LSTM) (None, 8) 800 _________________________________________________________________ dense_8 (Dense) (None, 8) 72 _________________________________________________________________ dense_9 (Dense) (None, 1) 9 ================================================================= Total params: 160,881 Trainable params: 160,881 Non-trainable params: 0 _________________________________________________________________
In [31]:
# Split a validation set off from the training data
# set aside the first 10,000 samples for validation
x_val = x_train[:10000]
y_val = y_train[:10000]
# the remaining 15,000 samples are used for training
partial_x_train = x_train[10000:]
partial_y_train = y_train[10000:]
print(partial_x_train.shape)
print(partial_y_train.shape)
(15000, 580)
(15000,)
In [32]:
# Train the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

epochs = 10  # adjust the number of epochs based on the results

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)
Epoch 1/10
30/30 [==============================] - 2s 59ms/step - loss: 0.6932 - accuracy: 0.4967 - val_loss: 0.6932 - val_accuracy: 0.4973
Epoch 2/10
30/30 [==============================] - 1s 46ms/step - loss: 0.6929 - accuracy: 0.5147 - val_loss: 0.6931 - val_accuracy: 0.4986
Epoch 3/10
30/30 [==============================] - 1s 44ms/step - loss: 0.6925 - accuracy: 0.5152 - val_loss: 0.6933 - val_accuracy: 0.4995
Epoch 4/10
30/30 [==============================] - 1s 47ms/step - loss: 0.6913 - accuracy: 0.5303 - val_loss: 0.6924 - val_accuracy: 0.5122
Epoch 5/10
30/30 [==============================] - 1s 48ms/step - loss: 0.6786 - accuracy: 0.5505 - val_loss: 0.6410 - val_accuracy: 0.6393
Epoch 6/10
30/30 [==============================] - 1s 42ms/step - loss: 0.6447 - accuracy: 0.6934 - val_loss: 0.6180 - val_accuracy: 0.7009
Epoch 7/10
30/30 [==============================] - 1s 48ms/step - loss: 0.6907 - accuracy: 0.5624 - val_loss: 0.6995 - val_accuracy: 0.5233
Epoch 8/10
30/30 [==============================] - 1s 47ms/step - loss: 0.6905 - accuracy: 0.5209 - val_loss: 0.6802 - val_accuracy: 0.5413
Epoch 9/10
30/30 [==============================] - 2s 52ms/step - loss: 0.6618 - accuracy: 0.5757 - val_loss: 0.6406 - val_accuracy: 0.6340
Epoch 10/10
30/30 [==============================] - 1s 45ms/step - loss: 0.6140 - accuracy: 0.6927 - val_loss: 0.6145 - val_accuracy: 0.7058
In [33]:
# Evaluate on the test set
results = model.evaluate(x_test, y_test, verbose=2)
print(results)
782/782 - 6s - loss: 0.6143 - accuracy: 0.7069
[0.6142594218254089, 0.7069200277328491]
In [34]:
history_dict = history.history
print(history_dict.keys())  # metrics we can plot against the epochs
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
In [35]:
import matplotlib.pyplot as plt
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# "bo"는 "파란색 점"입니다
plt.plot(epochs, loss, 'bo', label='Training loss')
# b는 "파란 실선"입니다
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
From the graph above we can estimate how many epochs of training are appropriate. Once the validation loss curve starts to diverge from the training loss, further training is pointless.
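One way to automate that cutoff is Keras's EarlyStopping callback. A sketch, assuming a patience of 2 epochs is acceptable (the value is not tuned in this post):
from tensorflow import keras

# Stop when the validation loss has not improved for 2 consecutive epochs,
# and roll the model back to the best weights seen so far.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                           patience=2,
                                           restore_best_weights=True)

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stop],
                    verbose=1)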
In [37]:
plt.clf()  # clear the figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
3-1. Applying Word2Vec
In [38]:
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
(10000, 16)
In [39]:
import os
# Save the trained Embedding weights to a file in word2vec text format.
word2vec_file_path = os.getenv('HOME')+'/aiffel/sentiment_classification/word2vec.txt'
f = open(word2vec_file_path, 'w')
f.write('{} {}\n'.format(vocab_size-4, word_vector_dim))  # header line: number of vectors and their dimension
# Write one word vector per word (excluding the 4 special tokens).
vectors = model.get_weights()[0]
for i in range(4, vocab_size):
    f.write('{} {}\n'.format(index_to_word[i], ' '.join(map(str, list(vectors[i, :])))))
f.close()
In [40]:
from gensim.models.keyedvectors import Word2VecKeyedVectors
word_vectors = Word2VecKeyedVectors.load_word2vec_format(word2vec_file_path, binary=False)
vector = word_vectors['computer']
vector
Out[40]:
array([ 0.02751031, -0.06956553, -0.02257856, -0.04380266, 0.08661791, -0.07683262, 0.08058365, 0.03842867, 0.01229863, 0.08696323, 0.01145777, -0.05076722, -0.05425929, 0.0280428 , 0.02824249, -0.0505434 ], dtype=float32)
In [41]:
# Looking up words similar to 'love'; the results are not very accurate.
word_vectors.similar_by_word("love")
Out[41]:
[('superb', 0.8798491954803467), ('ego', 0.8724830150604248), ('guts', 0.8551943898200989), ('vengeance', 0.8418107032775879), ('replacement', 0.8314927220344543), ('italy', 0.8285509347915649), ('masterful', 0.8257768750190735), ('tracking', 0.8253258466720581), ('futuristic', 0.8218926787376404), ('gripping', 0.821360170841217)]
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit -> Google's pretrained Word2Vec model
$ mv ~/Downloads/GoogleNews-vectors-negative300.bin.gz ~/aiffel/sentiment_classification
In [42]:
from gensim.models import KeyedVectors
word2vec_path = os.getenv('HOME')+'/aiffel/sentiment_classification/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=1000000)
vector = word2vec['computer']
vector  # a 300-dimensional word vector
# limit=1000000 loads only the first one million words
Out[42]:
array([ 1.07421875e-01, -2.01171875e-01, 1.23046875e-01, 2.11914062e-01, -9.13085938e-02, 2.16796875e-01, -1.31835938e-01, 8.30078125e-02, 2.02148438e-01, 4.78515625e-02, 3.66210938e-02, -2.45361328e-02, 2.39257812e-02, -1.60156250e-01, -2.61230469e-02, 9.71679688e-02, -6.34765625e-02, 1.84570312e-01, 1.70898438e-01, -1.63085938e-01, -1.09375000e-01, 1.49414062e-01, -4.65393066e-04, 9.61914062e-02, 1.68945312e-01, 2.60925293e-03, 8.93554688e-02, 6.49414062e-02, 3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01, -2.27539062e-01, 2.45361328e-02, -1.24511719e-01, -3.18359375e-01, -2.20703125e-01, 1.30859375e-01, 3.66210938e-02, -3.63769531e-02, -1.13281250e-01, 1.95312500e-01, 9.76562500e-02, 1.26953125e-01, 6.59179688e-02, 6.93359375e-02, 1.02539062e-02, 1.75781250e-01, -1.68945312e-01, 1.21307373e-03, -2.98828125e-01, -1.15234375e-01, 5.66406250e-02, -1.77734375e-01, -2.08984375e-01, 1.76757812e-01, 2.38037109e-02, -2.57812500e-01, -4.46777344e-02, 1.88476562e-01, 5.51757812e-02, 5.02929688e-02, -1.06933594e-01, 1.89453125e-01, -1.16210938e-01, 8.49609375e-02, -1.71875000e-01, 2.45117188e-01, -1.73828125e-01, -8.30078125e-03, 4.56542969e-02, -1.61132812e-02, 1.86523438e-01, -6.05468750e-02, -4.17480469e-02, 1.82617188e-01, 2.20703125e-01, -1.22558594e-01, -2.55126953e-02, -3.08593750e-01, 9.13085938e-02, 1.60156250e-01, 1.70898438e-01, 1.19628906e-01, 7.08007812e-02, -2.64892578e-02, -3.08837891e-02, 4.06250000e-01, -1.01562500e-01, 5.71289062e-02, -7.26318359e-03, -9.17968750e-02, -1.50390625e-01, -2.55859375e-01, 2.16796875e-01, -3.63769531e-02, 2.24609375e-01, 8.00781250e-02, 1.56250000e-01, 5.27343750e-02, 1.50390625e-01, -1.14746094e-01, -8.64257812e-02, 1.19140625e-01, -7.17773438e-02, 2.73437500e-01, -1.64062500e-01, 7.29370117e-03, 4.21875000e-01, -1.12792969e-01, -1.35742188e-01, -1.31835938e-01, -1.37695312e-01, -7.66601562e-02, 6.25000000e-02, 4.98046875e-02, -1.91406250e-01, -6.03027344e-02, 2.27539062e-01, 5.88378906e-02, -3.24218750e-01, 5.41992188e-02, -1.35742188e-01, 8.17871094e-03, -5.24902344e-02, -1.74713135e-03, -9.81445312e-02, -2.86865234e-02, 3.61328125e-02, 2.15820312e-01, 5.98144531e-02, -3.08593750e-01, -2.27539062e-01, 2.61718750e-01, 9.86328125e-02, -5.07812500e-02, 1.78222656e-02, 1.31835938e-01, -5.35156250e-01, -1.81640625e-01, 1.38671875e-01, -3.10546875e-01, -9.71679688e-02, 1.31835938e-01, -1.16210938e-01, 7.03125000e-02, 2.85156250e-01, 3.51562500e-02, -1.01562500e-01, -3.75976562e-02, 1.41601562e-01, 1.42578125e-01, -5.68847656e-02, 2.65625000e-01, -2.09960938e-01, 9.64355469e-03, -6.68945312e-02, -4.83398438e-02, -6.10351562e-02, 2.45117188e-01, -9.66796875e-02, 1.78222656e-02, -1.27929688e-01, -4.78515625e-02, -7.26318359e-03, 1.79687500e-01, 2.78320312e-02, -2.10937500e-01, -1.43554688e-01, -1.27929688e-01, 1.73339844e-02, -3.60107422e-03, -2.04101562e-01, 3.63159180e-03, -1.19628906e-01, -6.15234375e-02, 5.93261719e-02, -3.23486328e-03, -1.70898438e-01, -3.14941406e-02, -8.88671875e-02, -2.89062500e-01, 3.44238281e-02, -1.87500000e-01, 2.94921875e-01, 1.58203125e-01, -1.19628906e-01, 7.61718750e-02, 6.39648438e-02, -4.68750000e-02, -6.83593750e-02, 1.21459961e-02, -1.44531250e-01, 4.54101562e-02, 3.68652344e-02, 3.88671875e-01, 1.45507812e-01, -2.55859375e-01, -4.46777344e-02, -1.33789062e-01, -1.38671875e-01, 6.59179688e-02, 1.37695312e-01, 1.14746094e-01, 2.03125000e-01, -4.78515625e-02, 1.80664062e-02, -8.54492188e-02, -2.48046875e-01, -3.39843750e-01, -2.83203125e-02, 1.05468750e-01, -2.14843750e-01, -8.74023438e-02, 
7.12890625e-02, 1.87500000e-01, -1.12304688e-01, 2.73437500e-01, -3.26171875e-01, -1.77734375e-01, -4.24804688e-02, -2.69531250e-01, 6.64062500e-02, -6.88476562e-02, -1.99218750e-01, -7.03125000e-02, -2.43164062e-01, -3.66210938e-02, -7.37304688e-02, -1.77734375e-01, 9.17968750e-02, -1.25000000e-01, -1.65039062e-01, -3.57421875e-01, -2.85156250e-01, -1.66992188e-01, 1.97265625e-01, -1.53320312e-01, 2.31933594e-02, 2.06054688e-01, 1.80664062e-01, -2.74658203e-02, -1.92382812e-01, -9.61914062e-02, -1.06811523e-02, -4.73632812e-02, 6.54296875e-02, -1.25732422e-02, 1.78222656e-02, -8.00781250e-02, -2.59765625e-01, 9.37500000e-02, -7.81250000e-02, 4.68750000e-02, -2.22167969e-02, 1.86767578e-02, 3.11279297e-02, 1.04980469e-02, -1.69921875e-01, 2.58789062e-02, -3.41796875e-02, -1.44042969e-02, -5.46875000e-02, -8.78906250e-02, 1.96838379e-03, 2.23632812e-01, -1.36718750e-01, 1.75781250e-01, -1.63085938e-01, 1.87500000e-01, 3.44238281e-02, -5.63964844e-02, -2.27689743e-05, 4.27246094e-02, 5.81054688e-02, -1.07910156e-01, -3.88183594e-02, -2.69531250e-01, 3.34472656e-02, 9.81445312e-02, 5.63964844e-02, 2.23632812e-01, -5.49316406e-02, 1.46484375e-01, 5.93261719e-02, -2.19726562e-01, 6.39648438e-02, 1.66015625e-02, 4.56542969e-02, 3.26171875e-01, -3.80859375e-01, 1.70898438e-01, 5.66406250e-02, -1.04492188e-01, 1.38671875e-01, -1.57226562e-01, 3.23486328e-03, -4.80957031e-02, -2.48046875e-01, -6.20117188e-02], dtype=float32)
In [43]:
# Reuse the same model architecture, but initialize the embedding with the pretrained Word2Vec vectors.
vocab_size = 10000  # vocabulary size (10,000 words)
word_vector_dim = 300  # word vector dimension (300 to match the GoogleNews vectors)
embedding_matrix = np.random.rand(vocab_size, word_vector_dim)
# Copy the Word2Vec vector for each word into embedding_matrix, one word at a time.
for i in range(4, vocab_size):
    if index_to_word[i] in word2vec:
        embedding_matrix[i] = word2vec[index_to_word[i]]
In [44]:
from tensorflow.keras.initializers import Constant
vocab_size = 10000  # vocabulary size (10,000 words)
word_vector_dim = 300  # word vector dimension (matches the pretrained vectors)
# Build the model
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size,
                                 word_vector_dim,
                                 embeddings_initializer=Constant(embedding_matrix),  # use the copied embedding matrix here
                                 input_length=maxlen,
                                 trainable=True))  # trainable=True fine-tunes the pretrained embeddings
model.add(keras.layers.Conv1D(16, 7, activation='relu'))
model.add(keras.layers.MaxPooling1D(5))
model.add(keras.layers.Conv1D(16, 7, activation='relu'))
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential_5" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_6 (Embedding) (None, 580, 300) 3000000 _________________________________________________________________ conv1d_2 (Conv1D) (None, 574, 16) 33616 _________________________________________________________________ max_pooling1d_1 (MaxPooling1 (None, 114, 16) 0 _________________________________________________________________ conv1d_3 (Conv1D) (None, 108, 16) 1808 _________________________________________________________________ global_max_pooling1d_2 (Glob (None, 16) 0 _________________________________________________________________ dense_10 (Dense) (None, 8) 136 _________________________________________________________________ dense_11 (Dense) (None, 1) 9 ================================================================= Total params: 3,035,569 Trainable params: 3,035,569 Non-trainable params: 0 _________________________________________________________________
In [45]:
# Train the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

epochs = 20  # adjust the number of epochs based on the results

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)
Epoch 1/20
30/30 [==============================] - 20s 661ms/step - loss: 0.7208 - accuracy: 0.5023 - val_loss: 0.6937 - val_accuracy: 0.5171
Epoch 2/20
30/30 [==============================] - 9s 300ms/step - loss: 0.6808 - accuracy: 0.5794 - val_loss: 0.6812 - val_accuracy: 0.5692
Epoch 3/20
30/30 [==============================] - 9s 307ms/step - loss: 0.6459 - accuracy: 0.6627 - val_loss: 0.6370 - val_accuracy: 0.6452
Epoch 4/20
30/30 [==============================] - 9s 284ms/step - loss: 0.5371 - accuracy: 0.7676 - val_loss: 0.4801 - val_accuracy: 0.7812
Epoch 5/20
30/30 [==============================] - 9s 285ms/step - loss: 0.3681 - accuracy: 0.8497 - val_loss: 0.3816 - val_accuracy: 0.8245
Epoch 6/20
30/30 [==============================] - 9s 284ms/step - loss: 0.2540 - accuracy: 0.9017 - val_loss: 0.3385 - val_accuracy: 0.8514
Epoch 7/20
30/30 [==============================] - 9s 283ms/step - loss: 0.1765 - accuracy: 0.9404 - val_loss: 0.3342 - val_accuracy: 0.8579
Epoch 8/20
30/30 [==============================] - 9s 284ms/step - loss: 0.1254 - accuracy: 0.9625 - val_loss: 0.3290 - val_accuracy: 0.8661
Epoch 9/20
30/30 [==============================] - 9s 285ms/step - loss: 0.0836 - accuracy: 0.9814 - val_loss: 0.3396 - val_accuracy: 0.8664
Epoch 10/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0549 - accuracy: 0.9911 - val_loss: 0.3561 - val_accuracy: 0.8662
Epoch 11/20
30/30 [==============================] - 8s 283ms/step - loss: 0.0369 - accuracy: 0.9963 - val_loss: 0.3772 - val_accuracy: 0.8660
Epoch 12/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0244 - accuracy: 0.9987 - val_loss: 0.3944 - val_accuracy: 0.8676
Epoch 13/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0177 - accuracy: 0.9990 - val_loss: 0.4133 - val_accuracy: 0.8654
Epoch 14/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0130 - accuracy: 0.9994 - val_loss: 0.4302 - val_accuracy: 0.8639
Epoch 15/20
30/30 [==============================] - 9s 285ms/step - loss: 0.0097 - accuracy: 0.9995 - val_loss: 0.4447 - val_accuracy: 0.8645
Epoch 16/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0075 - accuracy: 0.9996 - val_loss: 0.4653 - val_accuracy: 0.8652
Epoch 17/20
30/30 [==============================] - 9s 283ms/step - loss: 0.0059 - accuracy: 0.9997 - val_loss: 0.4723 - val_accuracy: 0.8662
Epoch 18/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0045 - accuracy: 0.9997 - val_loss: 0.4851 - val_accuracy: 0.8650
Epoch 19/20
30/30 [==============================] - 9s 284ms/step - loss: 0.0035 - accuracy: 0.9999 - val_loss: 0.4970 - val_accuracy: 0.8656
Epoch 20/20
30/30 [==============================] - 8s 283ms/step - loss: 0.0029 - accuracy: 0.9999 - val_loss: 0.5057 - val_accuracy: 0.8647
In [46]:
# Evaluate the model on the test set
results = model.evaluate(x_test, y_test, verbose=2)
print(results)
782/782 - 9s - loss: 0.5255 - accuracy: 0.8612
[0.5254648923873901, 0.861240029335022]
4. Sentiment Analysis on Naver Movie Reviews
4-1. Downloading the Naver Movie Review Dataset (NSMC)
https://github.com/e9t/nsmc
$ wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
$ wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
$ mv ratings_*.txt ~/aiffel/sentiment_classification
4-2. Data Preprocessing
In [166]:
import pandas as pd
import urllib.request
%matplotlib inline
import matplotlib.pyplot as plt
import re
from konlpy.tag import Okt
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
# Read the data.
train_data = pd.read_table('~/aiffel/sentiment_classification/ratings_train.txt')
test_data = pd.read_table('~/aiffel/sentiment_classification/ratings_test.txt')
train_data.head()
Out[166]:
|   | id       | document | label |
|---|----------|----------|-------|
| 0 | 9976970  | 아 더빙.. 진짜 짜증나네요 목소리 | 0 |
| 1 | 3819312  | 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 | 1 |
| 2 | 10265843 | 너무재밓었다그래서보는것을추천한다 | 0 |
| 3 | 9045019  | 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 | 0 |
| 4 | 6483659  | 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... | 1 |
4-3. Building the Data Loader
In [167]:
from konlpy.tag import Mecab
tokenizer = Mecab()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

def load_data(train_data, test_data, num_words=10000):
    train_data.drop_duplicates(subset=['document'], inplace=True)
    train_data = train_data.dropna(how='any')
    test_data.drop_duplicates(subset=['document'], inplace=True)
    test_data = test_data.dropna(how='any')

    X_train = []
    for sentence in train_data['document']:
        temp_X = tokenizer.morphs(sentence)  # tokenize
        temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
        X_train.append(temp_X)

    X_test = []
    for sentence in test_data['document']:
        temp_X = tokenizer.morphs(sentence)  # tokenize
        temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
        X_test.append(temp_X)

    words = np.concatenate(X_train).tolist()
    counter = Counter(words)
    counter = counter.most_common(num_words - 4)  # reserve 4 slots for the special tokens
    vocab = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>'] + [key for key, _ in counter]
    word_to_index = {word:index for index, word in enumerate(vocab)}

    def wordlist_to_indexlist(wordlist):
        return [word_to_index[word] if word in word_to_index else word_to_index['<UNK>'] for word in wordlist]

    X_train = list(map(wordlist_to_indexlist, X_train))
    X_test = list(map(wordlist_to_indexlist, X_test))

    return X_train, np.array(list(train_data['label'])), X_test, np.array(list(test_data['label'])), word_to_index

X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data)

print("Training samples: {}, test samples: {}".format(len(X_train), len(X_test)))
Training samples: 146182, test samples: 49157
In [168]:
index_to_word = {index:word for word, index in word_to_index.items()}
In [169]:
# A function that, given a single sentence and the dictionary, converts it into a list of word indices.
# Every sentence is assumed to start with <BOS>.
def get_encoded_sentence(sentence, word_to_index):
    return [word_to_index['<BOS>']]+[word_to_index[word] if word in word_to_index else word_to_index['<UNK>'] for word in sentence.split()]

# A function that encodes a list of sentences into index lists all at once.
def get_encoded_sentences(sentences, word_to_index):
    return [get_encoded_sentence(sentence, word_to_index) for sentence in sentences]

# A function that decodes an index-encoded sentence back into text.
def get_decoded_sentence(encoded_sentence, index_to_word):
    return ' '.join(index_to_word[index] if index in index_to_word else '<UNK>' for index in encoded_sentence[1:])  # [1:] skips the leading <BOS>

# A function that decodes several index-encoded sentences back into text at once.
def get_decoded_sentences(encoded_sentences, index_to_word):
    return [get_decoded_sentence(encoded_sentence, index_to_word) for encoded_sentence in encoded_sentences]
In [170]:
# Examining the distribution of sentence lengths and choosing maxlen accordingly helps performance.
total_data_text = list(X_train) + list(X_test)
# Build a list of sentence lengths in the text data,
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
# then compute the mean, maximum, and standard deviation of the lengths.
print('Mean sentence length : ', np.mean(num_tokens))
print('Max sentence length : ', np.max(num_tokens))
print('Std of sentence length : ', np.std(num_tokens))
# For example, if we set the maximum length to (mean + 2 * std),
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
print('Fraction of sentences within maxlen: {}'.format(np.sum(num_tokens < max_tokens) / len(num_tokens)))
Mean sentence length :  15.96938143432699
Max sentence length :  116
Std of sentence length :  12.843571939469296
pad_sequences maxlen :  41
Fraction of sentences within maxlen: 0.9342988343341575
In [171]:
# Results can differ depending on where the padding goes ('pre' vs 'post').
X_train = keras.preprocessing.sequence.pad_sequences(X_train,
                                                     value=word_to_index["<PAD>"],
                                                     padding='post',  # or 'pre'
                                                     maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,
                                                    value=word_to_index["<PAD>"],
                                                    padding='post',  # or 'pre'
                                                    maxlen=maxlen)
print(X_train.shape)
(146182, 41)
In [172]:
# RNN model design
vocab_size = 10000  # vocabulary size (10,000 words)
word_vector_dim = 16  # word vector dimension (tunable hyperparameter)
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.LSTM(8))  # LSTM, the most widely used RNN; the state vector dimension is set to 8 (adjustable)
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # the final output is 1-dim: positive/negative
model.summary()
Model: "sequential_26" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_26 (Embedding) (None, None, 16) 160000 _________________________________________________________________ lstm_17 (LSTM) (None, 8) 800 _________________________________________________________________ dense_36 (Dense) (None, 8) 72 _________________________________________________________________ dense_37 (Dense) (None, 1) 9 ================================================================= Total params: 160,881 Trainable params: 160,881 Non-trainable params: 0 _________________________________________________________________
In [173]:
# set aside the first 100,000 samples for validation
x_val = X_train[:100000]
y_val = y_train[:100000]
# the remaining ~46,000 samples are used for training
partial_x_train = X_train[100000:]
partial_y_train = y_train[100000:]
print(partial_x_train.shape)
print(partial_y_train.shape)
(46182, 41)
(46182,)
In [174]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

epochs = 20  # adjust the number of epochs based on the results

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)
Epoch 1/20
91/91 [==============================] - 2s 27ms/step - loss: 0.6905 - accuracy: 0.5235 - val_loss: 0.6564 - val_accuracy: 0.7095
Epoch 2/20
91/91 [==============================] - 2s 24ms/step - loss: 0.5047 - accuracy: 0.7924 - val_loss: 0.4340 - val_accuracy: 0.8217
Epoch 3/20
91/91 [==============================] - 2s 25ms/step - loss: 0.3645 - accuracy: 0.8573 - val_loss: 0.3844 - val_accuracy: 0.8356
Epoch 4/20
91/91 [==============================] - 2s 25ms/step - loss: 0.3167 - accuracy: 0.8782 - val_loss: 0.3833 - val_accuracy: 0.8352
Epoch 5/20
91/91 [==============================] - 2s 24ms/step - loss: 0.2894 - accuracy: 0.8904 - val_loss: 0.3893 - val_accuracy: 0.8331
Epoch 6/20
91/91 [==============================] - 2s 25ms/step - loss: 0.2734 - accuracy: 0.8976 - val_loss: 0.4118 - val_accuracy: 0.8308
Epoch 7/20
91/91 [==============================] - 2s 24ms/step - loss: 0.2633 - accuracy: 0.9005 - val_loss: 0.4082 - val_accuracy: 0.8292
Epoch 8/20
91/91 [==============================] - 2s 24ms/step - loss: 0.2532 - accuracy: 0.9055 - val_loss: 0.4275 - val_accuracy: 0.8272
Epoch 9/20
91/91 [==============================] - 2s 24ms/step - loss: 0.2425 - accuracy: 0.9094 - val_loss: 0.4374 - val_accuracy: 0.8269
Epoch 10/20
91/91 [==============================] - 2s 25ms/step - loss: 0.2348 - accuracy: 0.9136 - val_loss: 0.4691 - val_accuracy: 0.8234
Epoch 11/20
91/91 [==============================] - 2s 27ms/step - loss: 0.2271 - accuracy: 0.9163 - val_loss: 0.4424 - val_accuracy: 0.8242
Epoch 12/20
91/91 [==============================] - 2s 25ms/step - loss: 0.2212 - accuracy: 0.9188 - val_loss: 0.4794 - val_accuracy: 0.8217
Epoch 13/20
91/91 [==============================] - 2s 25ms/step - loss: 0.2112 - accuracy: 0.9215 - val_loss: 0.5015 - val_accuracy: 0.8203
Epoch 14/20
91/91 [==============================] - 2s 25ms/step - loss: 0.2049 - accuracy: 0.9248 - val_loss: 0.5185 - val_accuracy: 0.8197
Epoch 15/20
91/91 [==============================] - 3s 28ms/step - loss: 0.1964 - accuracy: 0.9275 - val_loss: 0.5540 - val_accuracy: 0.8189
Epoch 16/20
91/91 [==============================] - 3s 31ms/step - loss: 0.1895 - accuracy: 0.9297 - val_loss: 0.5314 - val_accuracy: 0.8182
Epoch 17/20
91/91 [==============================] - 2s 24ms/step - loss: 0.1819 - accuracy: 0.9321 - val_loss: 0.5563 - val_accuracy: 0.8146
Epoch 18/20
91/91 [==============================] - 2s 25ms/step - loss: 0.1769 - accuracy: 0.9335 - val_loss: 0.5740 - val_accuracy: 0.8177
Epoch 19/20
91/91 [==============================] - 2s 25ms/step - loss: 0.1706 - accuracy: 0.9362 - val_loss: 0.5693 - val_accuracy: 0.8160
Epoch 20/20
91/91 [==============================] - 2s 25ms/step - loss: 0.1686 - accuracy: 0.9359 - val_loss: 0.5815 - val_accuracy: 0.8163
In [175]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)
1537/1537 - 3s - loss: 0.5873 - accuracy: 0.8133
[0.5872633457183838, 0.8132717609405518]
In [176]:
history_dict = history.history
import matplotlib.pyplot as plt
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# "bo"는 "파란색 점"입니다
plt.plot(epochs, loss, 'bo', label='Training loss')
# b는 "파란 실선"입니다
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
In [177]:
plt.clf()  # clear the figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
Summary
- Workflow
- Prepare the data -> preprocess -> pad everything to a uniform length -> design the model -> split off the validation/test sets -> train -> evaluate
- Things to watch out for
- Do not forget to unify the sentence lengths across the dataset with pad_sequences.
- The maximum sentence length maxlen also affects overall model performance, so it helps to examine the length distribution of the whole dataset to pick a suitable value.
- Applying Word2Vec brought roughly a 5% gain in accuracy here, though my use of it is still rough.
- https://github.com/Kyubyong/wordvectors -> pretrained Korean Word2Vec vectors, to try later (see the sketch below).
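A rough sketch of how those pretrained Korean vectors could be plugged into the NSMC model (the file name ko.bin, its location, and the gensim load call are assumptions about that download, not verified here):
import os
import numpy as np
from gensim.models import Word2Vec

# Assumed: the downloaded archive contains a gensim-format model file 'ko.bin'
# placed under ~/aiffel/sentiment_classification/.
ko_w2v = Word2Vec.load(os.getenv('HOME') + '/aiffel/sentiment_classification/ko.bin')

vocab_size = 10000
word_vector_dim = ko_w2v.vector_size  # use whatever dimension the pretrained model provides

embedding_matrix = np.random.rand(vocab_size, word_vector_dim)
for i in range(4, vocab_size):        # skip the 4 special tokens
    word = index_to_word[i]
    if word in ko_w2v.wv:
        embedding_matrix[i] = ko_w2v.wv[word]
# embedding_matrix can then be passed to the Embedding layer via embeddings_initializer=Constant(...),
# exactly as in the IMDB + GoogleNews experiment above.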