import nltk
nltk.download('stopwords')

import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

[nltk_data] Downloading package stopwords to /home/ssac24/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


data = pd.read_csv(os.getenv("HOME")+"/aiffel/news_summarization/data/Reviews.csv", nrows = 100000)
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 100000


data.head()


data = data[['Text','Summary']]
data.head()

#랜덤한 15개 샘플 출력
data.sample(15)


print('Text 열에서 중복을 배제한 유일한 샘플의 수 :', data['Text'].nunique())
print('Summary 열에서 중복을 배제한 유일한 샘플의 수 :', data['Summary'].nunique())

Text 열에서 중복을 배제한 유일한 샘플의 수 : 88426
Summary 열에서 중복을 배제한 유일한 샘플의 수 : 72348


# Text가 중복된 항목 제거
data.drop_duplicates(subset = ['Text'], inplace = True)
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 88426


# 데이터프레임에 Null값이 있는지 확인
print(data.isnull().sum())

Text       0
Summary    1
dtype: int64


# Null 값 제거
data.dropna(axis = 0, inplace = True)
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 88425


contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

print("정규화 사전의 수: ",len(contractions))

정규화 사전의 수:  120


print('불용어 개수 :', len(stopwords.words('english') ))
print(stopwords.words('english'))

불용어 개수 : 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]


#데이터 전처리 함수
def preprocess_sentence(sentence, remove_stopwords=True):
    sentence = sentence.lower() # 텍스트 소문자화
    sentence = BeautifulSoup(sentence, "lxml").text # <br />, <a href = ...> 등의 html 태그 제거
    sentence = re.sub(r'\([^)]*\)', '', sentence) # 괄호로 닫힌 문자열 (...) 제거 Ex) my husband (and myself!) for => my husband for
    sentence = re.sub('"','', sentence) # 쌍따옴표 " 제거
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")]) # 약어 정규화
    sentence = re.sub(r"'s\b","",sentence) # 소유격 제거. Ex) roland's -> roland
    sentence = re.sub("[^a-zA-Z]", " ", sentence) # 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    sentence = re.sub('[m]{2,}', 'mm', sentence) # m이 3개 이상이면 2개로 변경. Ex) ummmmmmm yeah -> umm yeah
    
    # 불용어 제거 (Text)
    if remove_stopwords:
        tokens = ' '.join(word for word in sentence.split() if not word in stopwords.words('english') if len(word) > 1)
    # 불용어 미제거 (Summary)
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)
    return tokens


temp_text = 'Everything I bought was great, infact I ordered twice and the third ordered was<br />for my mother and father.'
temp_summary = 'Great way to start (or finish) the day!!!'

print(preprocess_sentence(temp_text))
print(preprocess_sentence(temp_summary, False))  # 불용어를 제거하지 않습니다.

everything bought great infact ordered twice third ordered wasfor mother father
great way to start the day


clean_text = []

# 전체 Text 데이터에 대한 전처리 : 10분 이상 시간이 걸릴 수 있습니다. 
for s in data['Text']:
    clean_text.append(preprocess_sentence(s))

# 전처리 후 출력
clean_text[:5]

['bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better',
 'product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo',
 'confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story lewis lion witch wardrobe treat seduces edmund selling brother sisters witch',
 'looking secret ingredient robitussin believe found got addition root beer extract ordered made cherry soda flavor medicinal',
 'great taffy great price wide assortment yummy taffy delivery quick taffy lover deal']


clean_summary = []

# 전체 Summary 데이터에 대한 전처리 : 5분 이상 시간이 걸릴 수 있습니다. 
for s in data['Summary']:
    clean_summary.append(preprocess_sentence(s, False))

clean_summary[:5]

['good quality dog food',
 'not as advertised',
 'delight says it all',
 'cough medicine',
 'great taffy']


# 정제 과정에서 문장의 모든 단어가 사라지는 경우가 있을 수 있으므로 빈 값을 Null로 변환
data['Text'] = clean_text
data['Summary'] = clean_summary

# 빈 값을 Null 값으로 변환
data.replace('', np.nan, inplace=True)


data.isnull().sum()

Text        0
Summary    70
dtype: int64


data.dropna(axis=0, inplace=True)
print('전체 샘플수 :',(len(data)))#데이터 전처리 함수

전체 샘플수 : 88355


# 길이 분포 출력
import matplotlib.pyplot as plt

text_len = [len(s.split()) for s in data['Text']]
summary_len = [len(s.split()) for s in data['Summary']]

print('텍스트의 최소 길이 : {}'.format(np.min(text_len)))
print('텍스트의 최대 길이 : {}'.format(np.max(text_len)))
print('텍스트의 평균 길이 : {}'.format(np.mean(text_len)))
print('요약의 최소 길이 : {}'.format(np.min(summary_len)))
print('요약의 최대 길이 : {}'.format(np.max(summary_len)))
print('요약의 평균 길이 : {}'.format(np.mean(summary_len)))

plt.subplot(1,2,1)
plt.boxplot(summary_len)
plt.title('Summary')
plt.subplot(1,2,2)
plt.boxplot(text_len)
plt.title('Text')
plt.tight_layout()
plt.show()

plt.title('Summary')
plt.hist(summary_len, bins = 40)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

plt.title('Text')
plt.hist(text_len, bins = 40)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

텍스트의 최소 길이 : 2
텍스트의 최대 길이 : 1235
텍스트의 평균 길이 : 38.792428272310566
요약의 최소 길이 : 1
요약의 최대 길이 : 28
요약의 평균 길이 : 4.010729443721352


# 최대길이 임의 설정
text_max_len = 50
summary_max_len = 8


def below_threshold_len(max_len, nested_list):
  cnt = 0
  for s in nested_list:
    if(len(s.split()) <= max_len):
        cnt = cnt + 1
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))))


below_threshold_len(text_max_len, data['Text'])
below_threshold_len(summary_max_len,  data['Summary'])

전체 샘플 중 길이가 50 이하인 샘플의 비율: 0.7745119121724859
전체 샘플 중 길이가 8 이하인 샘플의 비율: 0.9424593967517402


# 정해진 길이보다 길면 제외
data = data[data['Text'].apply(lambda x: len(x.split()) <= text_max_len)]
data = data[data['Summary'].apply(lambda x: len(x.split()) <= summary_max_len)]
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 65818


#요약 데이터에는 시작 토큰과 종료 토큰을 추가한다.
data['decoder_input'] = data['Summary'].apply(lambda x : 'sostoken '+ x)
data['decoder_target'] = data['Summary'].apply(lambda x : x + ' eostoken')
data.head()


encoder_input = np.array(data['Text']) # 인코더의 입력
decoder_input = np.array(data['decoder_input']) # 디코더의 입력
decoder_target = np.array(data['decoder_target']) # 디코더의 레이블


# 랜덤 정수 시퀀스를 이용하여 나눔
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print(indices)

[10552  5222 22054 ... 62103 29032 47749]


encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]


n_of_val = int(len(encoder_input)*0.2)
print('테스트 데이터의 수 :',n_of_val)

테스트 데이터의 수 : 13163


encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

print('훈련 데이터의 개수 :', len(encoder_input_train))
print('훈련 레이블의 개수 :',len(decoder_input_train))
print('테스트 데이터의 개수 :',len(encoder_input_test))
print('테스트 레이블의 개수 :',len(decoder_input_test))

훈련 데이터의 개수 : 52655
훈련 레이블의 개수 : 52655
테스트 데이터의 개수 : 13163
테스트 레이블의 개수 : 13163


src_tokenizer = Tokenizer() # 토크나이저 정의
src_tokenizer.fit_on_texts(encoder_input_train) # 입력된 데이터로부터 단어 집합 생성


threshold = 7 # 등장 빈도수
total_cnt = len(src_tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in src_tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print('단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 %s'%(total_cnt - rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 31994
등장 빈도가 6번 이하인 희귀 단어의 수: 23749
단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 8245
단어 집합에서 희귀 단어의 비율: 74.22954303931986
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 3.3914079397936905


src_vocab = 8000
src_tokenizer = Tokenizer(num_words = src_vocab) # 단어 집합의 크기를 8,000으로 제한
src_tokenizer.fit_on_texts(encoder_input_train) # 단어 집합 재생성.


# 텍스트 시퀀스를 정수 시퀀스로 변환
encoder_input_train = src_tokenizer.texts_to_sequences(encoder_input_train) 
encoder_input_test = src_tokenizer.texts_to_sequences(encoder_input_test)

#잘 진행되었는지 샘플 출력
print(encoder_input_train[:3])

[[892, 448, 425, 70, 94, 109, 11, 5896, 448, 264, 26, 235, 3, 179, 4951, 2, 17, 244], [142, 2, 4, 711, 1387, 639, 351, 985, 203, 278, 149, 219, 82, 178, 149, 219, 377, 6806, 92, 462, 1179, 1388, 86, 2657, 560, 863, 221, 377, 4, 82, 54, 70, 5475, 1298, 122, 41, 48, 361, 283, 1081, 1538, 1446, 234, 16, 2264, 441, 21], [3, 372, 929, 1052, 214, 118, 272, 3895, 395, 57, 2, 4, 246, 46, 2224, 344, 22, 6456, 1111]]


tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(decoder_input_train)


threshold = 6
total_cnt = len(tar_tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tar_tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print('단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 %s'%(total_cnt - rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 10562
등장 빈도가 5번 이하인 희귀 단어의 수: 8190
단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 2372
단어 집합에서 희귀 단어의 비율: 77.54213217193713
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 5.901060070671378


tar_vocab = 2000
tar_tokenizer = Tokenizer(num_words = tar_vocab) 
tar_tokenizer.fit_on_texts(decoder_input_train)
tar_tokenizer.fit_on_texts(decoder_target_train)

# 텍스트 시퀀스를 정수 시퀀스로 변환
decoder_input_train = tar_tokenizer.texts_to_sequences(decoder_input_train) 
decoder_target_train = tar_tokenizer.texts_to_sequences(decoder_target_train)
decoder_input_test = tar_tokenizer.texts_to_sequences(decoder_input_test)
decoder_target_test = tar_tokenizer.texts_to_sequences(decoder_target_test)

#잘 변환되었는지 확인
print('input')
print('input ',decoder_input_train[:5])
print('target')
print('decoder ',decoder_target_train[:5])

input
input  [[1, 545, 691, 158, 1109], [1, 127, 11], [1, 379, 23, 181, 410], [1, 26], [1, 766, 27, 93, 102]]
target
decoder  [[545, 691, 158, 1109, 2], [127, 11, 2], [379, 23, 181, 410, 2], [26, 2], [766, 27, 93, 102, 2]]


# 빈도수가 낮은 단어들로만 구성되었던 문장들을 제거
drop_train = [index for index, sentence in enumerate(decoder_input_train) if len(sentence) == 1]
drop_test = [index for index, sentence in enumerate(decoder_input_test) if len(sentence) == 1]

print('삭제할 훈련 데이터의 개수 :',len(drop_train))
print('삭제할 테스트 데이터의 개수 :',len(drop_test))

encoder_input_train = np.delete(encoder_input_train, drop_train, axis=0)
decoder_input_train = np.delete(decoder_input_train, drop_train, axis=0)
decoder_target_train = np.delete(decoder_target_train, drop_train, axis=0)

encoder_input_test = np.delete(encoder_input_test, drop_test, axis=0)
decoder_input_test = np.delete(decoder_input_test, drop_test, axis=0)
decoder_target_test = np.delete(decoder_target_test, drop_test, axis=0)

print('훈련 데이터의 개수 :', len(encoder_input_train))
print('훈련 레이블의 개수 :',len(decoder_input_train))
print('테스트 데이터의 개수 :',len(encoder_input_test))
print('테스트 레이블의 개수 :',len(decoder_input_test))

삭제할 훈련 데이터의 개수 : 1295
삭제할 테스트 데이터의 개수 : 319
훈련 데이터의 개수 : 51360
훈련 레이블의 개수 : 51360
테스트 데이터의 개수 : 12844
테스트 레이블의 개수 : 12844

/home/ssac24/anaconda3/envs/aiffel/lib/python3.7/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)


encoder_input_train = pad_sequences(encoder_input_train, maxlen = text_max_len, padding='post')
encoder_input_test = pad_sequences(encoder_input_test, maxlen = text_max_len, padding='post')
decoder_input_train = pad_sequences(decoder_input_train, maxlen = summary_max_len, padding='post')
decoder_target_train = pad_sequences(decoder_target_train, maxlen = summary_max_len, padding='post')
decoder_input_test = pad_sequences(decoder_input_test, maxlen = summary_max_len, padding='post')
decoder_target_test = pad_sequences(decoder_target_test, maxlen = summary_max_len, padding='post')


from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# 인코더 설계 시작
embedding_dim = 128 # 임베딩벡터의 차원 수
hidden_size = 256 # LSTM에서 얼만큼의 수용력(capacity)를 가질지를 정하는 파라미터로 층 1개의 용량의 크기를 나타냄

# 인코더
encoder_inputs = Input(shape=(text_max_len,))

# 인코더의 임베딩 층
enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

# 3개로 구성하여 모델의 용량을 늘림
# 인코더의 LSTM 1
encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# 인코더의 LSTM 2
encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# 인코더의 LSTM 3
encoder_lstm3 = LSTM(hidden_size, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

WARNING:tensorflow:Layer lstm will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU


# 디코더 설계

decoder_inputs = Input(shape=(None,))

# 디코더의 임베딩 층
dec_emb_layer = Embedding(tar_vocab, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# 디코더의 LSTM
decoder_lstm = LSTM(hidden_size, return_sequences = True, return_state = True, dropout = 0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = [state_h, state_c])

WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU


# 디코더의 출력층
decoder_softmax_layer = Dense(tar_vocab, activation = 'softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_outputs) 

# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 128)      1024000     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 50, 256), (N 394240      embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 50, 256), (N 525312      lstm[0][0]                       
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    256000      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 50, 256), (N 525312      lstm_1[0][0]                     
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  394240      embedding_1[0][0]                
                                                                 lstm_2[0][1]                     
                                                                 lstm_2[0][2]                     
__________________________________________________________________________________________________
dense (Dense)                   (None, None, 2000)   514000      lstm_3[0][0]                     
==================================================================================================
Total params: 3,633,104
Trainable params: 3,633,104
Non-trainable params: 0
__________________________________________________________________________________________________


# GitHub의 어텐션 함수 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/thushv89/attention_keras/master/src/layers/attention.py", filename="attention.py")
from attention import AttentionLayer


# 어텐션 층(어텐션 함수)
attn_layer = AttentionLayer(name='attention_layer')
# 인코더와 디코더의 모든 time step의 hidden state를 어텐션 층에 전달하고 결과를 리턴
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# 어텐션의 결과와 디코더의 hidden state들을 연결
decoder_concat_input = Concatenate(axis = -1, name='concat_layer')([decoder_outputs, attn_out])

# 디코더의 출력층
decoder_softmax_layer = Dense(tar_vocab, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)

# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 128)      1024000     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 50, 256), (N 394240      embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 50, 256), (N 525312      lstm[0][0]                       
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    256000      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 50, 256), (N 525312      lstm_1[0][0]                     
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  394240      embedding_1[0][0]                
                                                                 lstm_2[0][1]                     
                                                                 lstm_2[0][2]                     
__________________________________________________________________________________________________
attention_layer (AttentionLayer ((None, None, 256),  131328      lstm_2[0][0]                     
                                                                 lstm_3[0][0]                     
__________________________________________________________________________________________________
concat_layer (Concatenate)      (None, None, 512)    0           lstm_3[0][0]                     
                                                                 attention_layer[0][0]            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, None, 2000)   1026000     concat_layer[0][0]               
==================================================================================================
Total params: 4,276,432
Trainable params: 4,276,432
Non-trainable params: 0
__________________________________________________________________________________________________


model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
history = model.fit(x = [encoder_input_train, decoder_input_train], y = decoder_target_train, \
          validation_data = ([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size = 256, callbacks=[es], epochs = 50)

Epoch 1/50
201/201 [==============================] - 68s 337ms/step - loss: 2.7057 - val_loss: 2.4477
Epoch 2/50
201/201 [==============================] - 67s 331ms/step - loss: 2.3914 - val_loss: 2.3154
Epoch 3/50
201/201 [==============================] - 67s 333ms/step - loss: 2.2527 - val_loss: 2.1825
Epoch 4/50
201/201 [==============================] - 66s 330ms/step - loss: 2.1230 - val_loss: 2.0869
Epoch 5/50
201/201 [==============================] - 66s 331ms/step - loss: 2.0366 - val_loss: 2.0419
Epoch 6/50
201/201 [==============================] - 67s 331ms/step - loss: 1.9720 - val_loss: 1.9946
Epoch 7/50
201/201 [==============================] - 67s 332ms/step - loss: 1.9183 - val_loss: 1.9610
Epoch 8/50
201/201 [==============================] - 66s 330ms/step - loss: 1.8734 - val_loss: 1.9417
Epoch 9/50
201/201 [==============================] - 67s 331ms/step - loss: 1.8323 - val_loss: 1.9163
Epoch 10/50
201/201 [==============================] - 67s 333ms/step - loss: 1.7947 - val_loss: 1.9029
Epoch 11/50
201/201 [==============================] - 66s 331ms/step - loss: 1.7611 - val_loss: 1.8904
Epoch 12/50
201/201 [==============================] - 67s 332ms/step - loss: 1.7291 - val_loss: 1.8839
Epoch 13/50
201/201 [==============================] - 66s 331ms/step - loss: 1.7004 - val_loss: 1.8759
Epoch 14/50
201/201 [==============================] - 67s 332ms/step - loss: 1.6715 - val_loss: 1.8699
Epoch 15/50
201/201 [==============================] - 67s 332ms/step - loss: 1.6451 - val_loss: 1.8636
Epoch 16/50
201/201 [==============================] - 67s 332ms/step - loss: 1.6203 - val_loss: 1.8608
Epoch 17/50
201/201 [==============================] - 67s 331ms/step - loss: 1.5964 - val_loss: 1.8595
Epoch 18/50
201/201 [==============================] - 67s 334ms/step - loss: 1.5732 - val_loss: 1.8588
Epoch 19/50
201/201 [==============================] - 66s 331ms/step - loss: 1.5519 - val_loss: 1.8598
Epoch 20/50
201/201 [==============================] - 67s 333ms/step - loss: 1.5292 - val_loss: 1.8590
Epoch 00020: early stopping


plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()


src_index_to_word = src_tokenizer.index_word # 원문 단어 집합에서 정수 -> 단어를 얻음
tar_word_to_index = tar_tokenizer.word_index # 요약 단어 집합에서 단어 -> 정수를 얻음
tar_index_to_word = tar_tokenizer.index_word # 요약 단어 집합에서 정수 -> 단어를 얻음


# 인코더 설계
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# 이전 시점의 상태들을 저장하는 텐서
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))

dec_emb2 = dec_emb_layer(decoder_inputs)
# 문장의 다음 단어를 예측하기 위해서 초기 상태(initial_state)를 이전 시점의 상태로 사용. 이는 뒤의 함수 decode_sequence()에 구현
# 훈련 과정에서와 달리 LSTM의 리턴하는 은닉 상태와 셀 상태인 state_h와 state_c를 버리지 않음.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])


# 어텐션 함수
decoder_hidden_state_input = Input(shape=(text_max_len, hidden_size))
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# 디코더의 출력층
decoder_outputs2 = decoder_softmax_layer(decoder_inf_concat) 

# 최종 디코더 모델
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])


def decode_sequence(input_seq):
    # 입력으로부터 인코더의 상태를 얻음
    e_out, e_h, e_c = encoder_model.predict(input_seq)

     # <SOS>에 해당하는 토큰 생성
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition: # stop_condition이 True가 될 때까지 루프 반복

        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tar_index_to_word[sampled_token_index]

        if(sampled_token!='eostoken'):
            decoded_sentence += ' '+sampled_token

        #  <eos>에 도달하거나 최대 길이를 넘으면 중단.
        if (sampled_token == 'eostoken'  or len(decoded_sentence.split()) >= (summary_max_len-1)):
            stop_condition = True

        # 길이가 1인 타겟 시퀀스를 업데이트
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # 상태를 업데이트 합니다.
        e_h, e_c = h, c

    return decoded_sentence


# 원문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2text(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0):
            temp = temp + src_index_to_word[i]+' '
    return temp

# 요약문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2summary(input_seq):
    temp=''
    for i in input_seq:
        if((i!=0 and i!=tar_word_to_index['sostoken']) and i!=tar_word_to_index['eostoken']):
            temp = temp + tar_index_to_word[i] + ' '
    return temp


for i in range(50, 100):
    print("원문 :", seq2text(encoder_input_test[i]))
    print("실제 요약 :", seq2summary(decoder_input_test[i]))
    print("예측 요약 :", decode_sequence(encoder_input_test[i].reshape(1, text_max_len)))
    print("\n")

원문 : one best store bought cookies ever tasted chocolate lover 
실제 요약 : absolutely delicious and addictive 
예측 요약 :  best cookies ever


원문 : pasta awesome great texture flavor difference pasta wheat pasta highly recommend anyone loves pasta whether gluten free 
실제 요약 : great taste 
예측 요약 :  great pasta


원문 : looking alternative healthy protein snack less fat slim jims texas bbq sounded like awesome flavor bought never tried pick rubber band baste bbq sauce eat soy protein jerky taste like bbq flavored rubber band way ever eat world longer protein choice available 
실제 요약 : good flavor very bad texture 
예측 요약 :  great snack


원문 : far delicious gummy bears largest quantity available two facts together create one worthwhile purchase 
실제 요약 : delicious 
예측 요약 :  gummy bears


원문 : italian greyhound years old eats one greenie day vet tells rare italian greyhound great teeth even need teeth cleaned greenies take care keeping teeth clean also loves 
실제 요약 : greenies keep your dog teeth white 
예측 요약 :  my dog loves these


원문 : september adopted adult border terrier mixes shelter pet project oldest girl breath issues heard greenies disappointed two love treat go work see chew chew chew since removes tartar amazon much better pricing pet love try pet love 
실제 요약 : my dogs love these 
예측 요약 :  great product


원문 : finger sized marinated grape leaves filled rice soft tasty side dish healthy snack salad add large amount keeps well fridge much spice salt taste mint lemon flavors eat cheese pickles cold cuts great price 
실제 요약 : excellent product 
예측 요약 :  great snack


원문 : switched shepherd husky mix newman organic stopped shedding fur still sheds little nothing compared used also dogs coats absolutely gorgeous shinier healthier softer definitely love food organic contains ingredients easy dogs digest opinion best dog food commercially available right love proceeds donated charity newman way 
실제 요약 : less 
예측 요약 :  great dog food


원문 : cannot find grocery store please keep amazon great snack love 
실제 요약 : love them 
예측 요약 :  great snack


원문 : great way keep fruit diet follow weight watcher good snack three point get fruit diet 
실제 요약 : excellent taste 
예측 요약 :  great snack


원문 : bought product based liked taste high sugar content trying lose weight sugar per day severely affects belly fat using couple times gave away rest 
실제 요약 : convenient but watch out the sugar 
예측 요약 :  sugar


원문 : best fiber crunchy flakes great topped fruit plus added skim milk yogurt fresh start day 
실제 요약 : pure health 
예측 요약 :  great snack


원문 : love mint tea always soothes upset tummy tastes refreshing best brand tried worth price though certainly wish lower 
실제 요약 : my favorite brand 
예측 요약 :  love this tea


원문 : last seconds dog chewed piece bottle buy 
실제 요약 : broke 
예측 요약 :  dogs love it


원문 : better buy halves pieces get larger size cheaper product mostly pieces anyway waste money would rate stars could 
실제 요약 : not worth it 
예측 요약 :  not what expected


원문 : find taste goat milk pleasing mixed use liquid form fresh goat milk frankly come anywhere close even reasonable expectations powered form however use kitchen baked goods flavor even slightly sour use certain bread pancakes like well add mix instead milks water good pantry case days 
실제 요약 : little sour but great for baking with 
예측 요약 :  great product


원문 : bad product gave labs beagle times days dogs threw time rest bag 
실제 요약 : will never buy again 
예측 요약 :  dog treats


원문 : happy caviar company shipment arrived next day well packaged wonderfully reusable freezer packs caviar fresh extremely tasty would certainly order 
실제 요약 : yum 
예측 요약 :  great product


원문 : really didnt want huge volume jelly beans price time couldnt beat heck bought lb package approx half price good service amazon usual ty amazon received timely matter tasty sure finish life 
실제 요약 : price 
예측 요약 :  good product but


원문 : great smell appetizing raw heat adds depth flavor compare garlic onion think subtle adds savory flavor unique 
실제 요약 : is wonderful 
예측 요약 :  great seasoning


원문 : sweet month old might food allergies thrown three times exclusively introduced sweet potato pears applesauce squash carrots mixed veggies noticed ceral contains rice soy tuna pork based gelatin going tomorrow check going poor baby would love hear anyone else similar story 
실제 요약 : month old after eating this 
예측 요약 :  great food


원문 : sister law visit weekend thought perfect time test coffee neither us impressed great flavor thought texture bit oily far low acid goes seem like would good department taste made worth drinking 
실제 요약 : not worth it 
예측 요약 :  not my favorite


원문 : coffee really strong mixed regular ground coffee much better nice alternative regular coffee every morning perhaps dessert coffee would better suited cannot caffeine late day 
실제 요약 : pumpkin pie coffee 
예측 요약 :  great coffee


원문 : well earl grey fan drink cup day tried earl grey tazo twinings bigelow etc one beats taste aftertaste makes tea good smooth flavor taste flavor milk sugar 
실제 요약 : excellent tea 
예측 요약 :  great tea


원문 : ok makes great cup chocolate really use baking cookies etc good addition scratch baking project 
실제 요약 : love this mix 
예측 요약 :  good stuff


원문 : three year old daughter loves eat larger ones got smaller ones perfect size tummy thinks cookies 
실제 요약 : love these 
예측 요약 :  my son loves these


원문 : great coffee adore amazon getting favorite coffee long weekend thank amazon coffee personal thing one likes another strong bold tasting coffee excellent bitter aftertaste however mileage may vary 
실제 요약 : great coffee 
예측 요약 :  great coffee


원문 : flavorless dark chocolate ever tasted flat taste melts crumbly mouth prefer rich powerful chocolate smooth texture 
실제 요약 : without flavor 
예측 요약 :  yuck


원문 : milk chocolate coconut curry bar everyone spicy curry powder highest note taste least initially favorite bars spiciness also spice quickly chocolate melts tongue one best milk chocolates ever tasted silky dissolves coconut adds bit crunch texture highly unique bar rewarding like sophisticated taste combinations 
실제 요약 : spicy and sweet 
예측 요약 :  spicy and delicious


원문 : large bag dry pet food highly recommended adult dog also worth purchasing larger sized bag order stock food greater length time 
실제 요약 : excellent dog food 
예측 요약 :  great dog food


원문 : great product delicious also use hair treatment works great well great quality shipment super fast 
실제 요약 : great product 
예측 요약 :  great product


원문 : item arrived quickly baby told jam packed great chocolates enjoyed much baby happy thrilled would use company ordered outside 
실제 요약 : this made my baby happy 
예측 요약 :  great product


원문 : got box yesterday first thought actually spoiled read reviews realized bad usually try throw positive cannot think one tasted good carbs nearly high would happy pay money bad taste really low carb diet friendly star really though wish would stated carb count ad 
실제 요약 : bad 
예측 요약 :  not bad


원문 : using product months consistently good results making yogurt starter tried make follow batches yogurt made previously never good using starter every time one big found good yogurt making sure mix starter thoroughly definitely improves consistency 
실제 요약 : wonderful yogurt 
예측 요약 :  great for cooking


원문 : cat loves food one especially finicky cats food sure please particular cats overall gourmet cat food reasonable price meow 
실제 요약 : kitty yum yum 
예측 요약 :  cat food


원문 : heart eating gravy made searched ingredients online found seems cause lot people also contains msg problem know people cannot companies make food crap drain probably looked ingredients first 
실제 요약 : heart after eating 
예측 요약 :  not for me


원문 : love chips perfectly seasoned course grease also cardboard taste baked chipped popped texture potatoes nice light calories bag lots chips bag great snack ingredient list also pretty wholesome without strange chemicals added fav flavor probably salt pepper bbq 
실제 요약 : delicious 
예측 요약 :  great chips


원문 : always keep one two pouches diaper bag case would get hungry thirsty convenient go meal time pack homemade one 
실제 요약 : essential meal to go 
예측 요약 :  great for on the go


원문 : found stuff wegman grocery store va east coast short time looking since happy found amazon hesitate try fabulous like hot apple cider absolutly love stuff heat cup apple cider use one tea bags heavenly 
실제 요약 : yum 
예측 요약 :  excellent


원문 : really like kashi bars sweet interesting flavors actually help feel satisfied cannot break lunch used eat quaker bars time much better tasting job filling better 
실제 요약 : better than most 
예측 요약 :  great tasting snack


원문 : recently purchased cup machine quite excited drink iced tea rather coffee blend black tea quite bitter looking works advertised right tea 
실제 요약 : very bitter tea 
예측 요약 :  too strong for me


원문 : product contained many still pieces legs head still visible cooked paper sheets merely gotten package shipping understand almost food products trace pieces insect product many need transport manufacture food products 
실제 요약 : very 
예측 요약 :  not good


원문 : coffee great complaints however contrary reviews price target store shop sells everyday sometimes less sale math 
실제 요약 : good coffee bad price 
예측 요약 :  good coffee but pricey


원문 : good bran slightly sweet usually eat half serving bran half serving another cereal husband kinda looks like pellets 
실제 요약 : yummy bran 
예측 요약 :  good cereal


원문 : pretty good tea defiantly top grade tea far teas go style well subtle sweetness light bitterness even extended steeping opens cools good tea flavor molasses wheat straw notes earthy compost back end tea caffeine content appetite meals thermos full keeps going day work school 
실제 요약 : better than average tea 
예측 요약 :  good tea


원문 : ghost chile real deal literally hottest things could ever imagine brought school challenge friends took quarter one chile left tears lunch ever would recommend anyone looking spicy meal fun afternoon friends nice flavor taste buds left 
실제 요약 : ghost 
예측 요약 :  great for cooking


원문 : love beer nuts bar mix ordered since sell stores near jars hard deal beat 
실제 요약 : nothing beer nuts 
예측 요약 :  great candy


원문 : using great organic looks smells fine stomach problems delivered free great price amazon cannot go wrong 
실제 요약 : good stuff 
예측 요약 :  great product


원문 : nice product quick meal makes nice creamy liked much probably keeping box pantry shelf time something quick tasty needed 
실제 요약 : good quick meal 
예측 요약 :  good


원문 : product excellent portion control husband dieting eats one small box every morning milk losing weight case serving boxes convenient pull daily small packs also easy pack traveling calories low particularly size really pleased choice cereal 
실제 요약 : special cereal 
예측 요약 :  great cereal


# 데이터 다운로드
import requests
from summa.summarizer import summarize

text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text


print(text[:1500])

The screen is filled with green, cascading code which gives way to the title, The Matrix.

A phone rings and text appears on the screen: "Call trans opt: received. 2-19-98 13:24:18 REC: Log>" As a conversation takes place between Trinity (Carrie-Anne Moss) and Cypher (Joe Pantoliano), two free humans, a table of random green numbers are being scanned and individual numbers selected, creating a series of digits not unlike an ordinary phone number, as if a code is being deciphered or a call is being traced.

Trinity discusses some unknown person. Cypher taunts Trinity, suggesting she enjoys watching him. Trinity counters that "Morpheus (Laurence Fishburne) says he may be 'the One'," just as the sound of a number being selected alerts Trinity that someone may be tracing their call. She ends the call.

Armed policemen move down a darkened, decrepit hallway in the Heart O' the City Hotel, their flashlight beam bouncing just ahead of them. They come to room 303, kick down the door and find a woman dressed in black, facing away from them. It's Trinity. She brings her hands up from the laptop she's working on at their command.

Outside the hotel a car drives up and three agents appear in neatly pressed black suits. They are Agent Smith (Hugo Weaving), Agent Brown (Paul Goddard), and Agent Jones (Robert Taylor). Agent Smith and the presiding police lieutenant argue. Agent Smith admonishes the policeman that they were given specific orders to contact the agents first, for their


print('Summary:')
print(summarize(text, ratio=0.005))

Summary:
Morpheus, Trinity, Neo, Apoc, Switch, Mouse and Cypher are jacked into the Matrix.
Trinity brings the helicopter down to the floor that Morpheus is on and Neo opens fire on the three Agents.


# 리스트로 출력받기
print('Summary:')
print(summarize(text, ratio=0.005, split=True))

Summary:
['Morpheus, Trinity, Neo, Apoc, Switch, Mouse and Cypher are jacked into the Matrix.', 'Trinity brings the helicopter down to the floor that Morpheus is on and Neo opens fire on the three Agents.']


# 단어 수로도 요약 가능
print('Summary:')
print(summarize(text, words=50))

Summary:
Trinity takes Neo to Morpheus.
Morpheus, Trinity, Neo, Apoc, Switch, Mouse and Cypher are jacked into the Matrix.
Trinity brings the helicopter down to the floor that Morpheus is on and Neo opens fire on the three Agents.


import nltk
nltk.download('stopwords')

import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

[nltk_data] Downloading package stopwords to /home/ssac24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv", filename="news_summary_more.csv")
data = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1')


data.sample(10)


print('Text 열에서 중복을 배제한 유일한 샘플의 수 :', data['headlines'].nunique())
print('Summary 열에서 중복을 배제한 유일한 샘플의 수 :', data['text'].nunique())

Text 열에서 중복을 배제한 유일한 샘플의 수 : 98280
Summary 열에서 중복을 배제한 유일한 샘플의 수 : 98360


data.drop_duplicates(subset = ['headlines'], inplace = True)
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 98280


# 데이터프레임에 Null값이 있는지 확인
print(data.isnull().sum())

headlines    0
text         0
dtype: int64


contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

print("정규화 사전의 수: ",len(contractions))

정규화 사전의 수:  120


#데이터 전처리 함수
def preprocess_sentence(sentence, remove_stopwords=True):
    sentence = sentence.lower() # 텍스트 소문자화
    sentence = BeautifulSoup(sentence, "lxml").text # <br />, <a href = ...> 등의 html 태그 제거
    sentence = re.sub(r'\([^)]*\)', '', sentence) # 괄호로 닫힌 문자열 (...) 제거 Ex) my husband (and myself!) for => my husband for
    sentence = re.sub('"','', sentence) # 쌍따옴표 " 제거
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")]) # 약어 정규화
    sentence = re.sub(r"'s\b","",sentence) # 소유격 제거. Ex) roland's -> roland
    sentence = re.sub("[^a-zA-Z]", " ", sentence) # 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    sentence = re.sub('[m]{2,}', 'mm', sentence) # m이 3개 이상이면 2개로 변경. Ex) ummmmmmm yeah -> umm yeah
    
    # 불용어 제거 (Text)
    if remove_stopwords:
        tokens = ' '.join(word for word in sentence.split() if not word in stopwords.words('english') if len(word) > 1)
    # 불용어 미제거 (Summary)
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)
    return tokens


clean_text = []

# 전체 headlines 데이터에 대한 전처리 
for s in data['headlines']:
    clean_text.append(preprocess_sentence(s))

# 전처리 후 출력
clean_text[:5]

['upgrad learner switches career ml al salary hike',
 'delhi techie wins free food swiggy one year cred',
 'new zealand end rohit sharma led india match winning streak',
 'aegon life iterm insurance plan helps customers save tax',
 'known hirani yrs metoo claims true sonam']


clean_summary = []

# 전체 text 데이터에 대한 전처리  
for s in data['text']:
    clean_summary.append(preprocess_sentence(s, False))

clean_summary[:5]

['saurav kant an alumnus of upgrad and iiit pg program in machine learning and artificial intelligence was sr systems engineer at infosys with almost years of work experience the program and upgrad degree career support helped him transition to data scientist at tech mahindra with salary hike upgrad online power learning has powered lakh careers',
 'kunal shah credit card bill payment platform cred gave users chance to win free food from swiggy for one year pranav kaushik delhi techie bagged this reward after spending cred coins users get one cred coin per rupee of bill paid which can be used to avail rewards from brands like ixigo bookmyshow ubereats cult fit and more',
 'new zealand defeated india by wickets in the fourth odi at hamilton on thursday to win their first match of the five match odi series india lost an international match under rohit sharma captaincy after consecutive victories dating back to march the match witnessed india getting all out for their seventh lowest total in odi cricket history',
 'with aegon life iterm insurance plan customers can enjoy tax benefits on your premiums paid and save up to on taxes the plan provides life cover up to the age of years also customers have options to insure against critical illnesses disability and accidental death benefit rider with life cover up to the age of years',
 'speaking about the sexual harassment allegations against rajkumar hirani sonam kapoor said have known hirani for many years what if it is not true the metoo movement will get derailed in the metoo movement always believe woman but in this case we need to reserve our judgment she added hirani has been accused by an assistant who worked in sanju']


# 정제 과정에서 문장의 모든 단어가 사라지는 경우가 있을 수 있으므로 빈 값을 Null로 변환
data['headlines'] = clean_text
data['text'] = clean_summary

# 빈 값을 Null 값으로 변환
data.replace('', np.nan, inplace=True)


data.isnull().sum()

headlines    0
text         0
dtype: int64


# 길이 분포 출력
import matplotlib.pyplot as plt

headlines_len = [len(s.split()) for s in data['headlines']]
text_len = [len(s.split()) for s in data['text']]

print('헤드라인의 최소 길이 : {}'.format(np.min(headlines_len)))
print('헤드라인의 최대 길이 : {}'.format(np.max(headlines_len)))
print('헤드라인의 평균 길이 : {}'.format(np.mean(headlines_len)))
print('텍스트의 최소 길이 : {}'.format(np.min(text_len)))
print('텍스트의 최대 길이 : {}'.format(np.max(text_len)))
print('텍스트의 평균 길이 : {}'.format(np.mean(text_len)))

plt.subplot(1,2,1)
plt.boxplot(text_len)
plt.title('text')
plt.subplot(1,2,2)
plt.boxplot(headlines_len)
plt.title('headlines')
plt.tight_layout()
plt.show()

plt.title('text')
plt.hist(text_len, bins = 40)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

plt.title('headlines')
plt.hist(headlines_len, bins = 40)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

헤드라인의 최소 길이 : 1
헤드라인의 최대 길이 : 13
헤드라인의 평균 길이 : 7.1367317867317865
텍스트의 최소 길이 : 1
텍스트의 최대 길이 : 69
텍스트의 평균 길이 : 56.18174603174603


# 최대길이 임의 설정
headlines_max_len = 11
text_max_len = 60


def below_threshold_len(max_len, nested_list):
  cnt = 0
  for s in nested_list:
    if(len(s.split()) <= max_len):
        cnt = cnt + 1
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))))


below_threshold_len(headlines_max_len, data['headlines'])
below_threshold_len(text_max_len,  data['text'])

전체 샘플 중 길이가 11 이하인 샘플의 비율: 0.9997456247456248
전체 샘플 중 길이가 60 이하인 샘플의 비율: 0.9443630443630444


# 정해진 길이보다 길면 제외
data = data[data['headlines'].apply(lambda x: len(x.split()) <= headlines_max_len)]
data = data[data['text'].apply(lambda x: len(x.split()) <= text_max_len)]
print('전체 샘플수 :',(len(data)))

전체 샘플수 : 92790


#요약 데이터에는 시작 토큰과 종료 토큰을 추가한다.
data['decoder_input'] = data['headlines'].apply(lambda x : 'sostoken '+ x)
data['decoder_target'] = data['headlines'].apply(lambda x : x + ' eostoken')
data.head()


encoder_input = np.array(data['headlines']) # 인코더의 입력
decoder_input = np.array(data['decoder_input']) # 디코더의 입력
decoder_target = np.array(data['decoder_target']) # 디코더의 레이블


# 랜덤 정수 시퀀스를 이용하여 나눔
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print(indices)

[65749 30509 44557 ... 29191 64870 27888]


encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]


n_of_val = int(len(encoder_input)*0.2)
print('테스트 데이터의 수 :',n_of_val)

테스트 데이터의 수 : 18558


encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

print('훈련 데이터의 개수 :', len(encoder_input_train))
print('훈련 레이블의 개수 :',len(decoder_input_train))
print('테스트 데이터의 개수 :',len(encoder_input_test))
print('테스트 레이블의 개수 :',len(decoder_input_test))

훈련 데이터의 개수 : 74232
훈련 레이블의 개수 : 74232
테스트 데이터의 개수 : 18558
테스트 레이블의 개수 : 18558


src_tokenizer = Tokenizer() # 토크나이저 정의
src_tokenizer.fit_on_texts(encoder_input_train) # 입력된 데이터로부터 단어 집합 생성


threshold = 5 # 등장 빈도수
total_cnt = len(src_tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in src_tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print('단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 %s'%(total_cnt - rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 29408
등장 빈도가 4번 이하인 희귀 단어의 수: 18294
단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 11114
단어 집합에서 희귀 단어의 비율: 62.2075625680087
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 5.879119010643966


src_vocab = 11000
src_tokenizer = Tokenizer(num_words = src_vocab) # 단어 집합의 크기를 11,000으로 제한
src_tokenizer.fit_on_texts(encoder_input_train) # 단어 집합 재생성.


# 텍스트 시퀀스를 정수 시퀀스로 변환
encoder_input_train = src_tokenizer.texts_to_sequences(encoder_input_train) 
encoder_input_test = src_tokenizer.texts_to_sequences(encoder_input_test)

#잘 진행되었는지 샘플 출력
print(encoder_input_train[:3])
print(encoder_input_test[:3])

[[179, 1379, 47, 3614, 85, 169], [219, 157, 95, 3748, 565, 2726], [2792, 6864, 926, 4, 18]]
[[5558, 934, 5154, 6986], [98, 182, 147, 2187, 198, 805], [121, 2653, 6189, 1563, 323, 4230, 748, 2297]]


tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(decoder_input_train)


threshold = 5
total_cnt = len(tar_tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tar_tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print('단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 %s'%(total_cnt - rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 29409
등장 빈도가 4번 이하인 희귀 단어의 수: 18294
단어 집합에서 희귀 단어를 제외시킬 경우의 단어 집합의 크기 11115
단어 집합에서 희귀 단어의 비율: 62.20544731204733
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 5.15602787525723


tar_vocab = 11000
tar_tokenizer = Tokenizer(num_words = tar_vocab) 
tar_tokenizer.fit_on_texts(decoder_input_train)
tar_tokenizer.fit_on_texts(decoder_target_train)

# 텍스트 시퀀스를 정수 시퀀스로 변환
decoder_input_train = tar_tokenizer.texts_to_sequences(decoder_input_train) 
decoder_target_train = tar_tokenizer.texts_to_sequences(decoder_target_train)
decoder_input_test = tar_tokenizer.texts_to_sequences(decoder_input_test)
decoder_target_test = tar_tokenizer.texts_to_sequences(decoder_target_test)

#잘 변환되었는지 확인
print('input')
print('input ',decoder_input_train[:5])
print('target')
print('decoder ',decoder_target_train[:5])

input
input  [[1, 181, 1381, 49, 3616, 87, 171], [1, 221, 159, 97, 3750, 567, 2728], [1, 2794, 6866, 928, 6, 20], [1, 2729, 971, 48, 1034, 501, 2794, 489], [1, 1198, 14, 65, 3124, 6, 20, 15, 1382]]
target
decoder  [[181, 1381, 49, 3616, 87, 171, 2], [221, 159, 97, 3750, 567, 2728, 2], [2794, 6866, 928, 6, 20, 2], [2729, 971, 48, 1034, 501, 2794, 489, 2], [1198, 14, 65, 3124, 6, 20, 15, 1382, 2]]


# 빈도수가 낮은 단어들로만 구성되었던 문장들을 제거
drop_train = [index for index, sentence in enumerate(decoder_input_train) if len(sentence) == 1]
drop_test = [index for index, sentence in enumerate(decoder_input_test) if len(sentence) == 1]

print('삭제할 훈련 데이터의 개수 :',len(drop_train))
print('삭제할 테스트 데이터의 개수 :',len(drop_test))

encoder_input_train = np.delete(encoder_input_train, drop_train, axis=0)
decoder_input_train = np.delete(decoder_input_train, drop_train, axis=0)
decoder_target_train = np.delete(decoder_target_train, drop_train, axis=0)

encoder_input_test = np.delete(encoder_input_test, drop_test, axis=0)
decoder_input_test = np.delete(decoder_input_test, drop_test, axis=0)
decoder_target_test = np.delete(decoder_target_test, drop_test, axis=0)

print('훈련 데이터의 개수 :', len(encoder_input_train))
print('훈련 레이블의 개수 :',len(decoder_input_train))
print('테스트 데이터의 개수 :',len(encoder_input_test))
print('테스트 레이블의 개수 :',len(decoder_input_test))

삭제할 훈련 데이터의 개수 : 0
삭제할 테스트 데이터의 개수 : 0
훈련 데이터의 개수 : 74231
훈련 레이블의 개수 : 74231
테스트 데이터의 개수 : 18557
테스트 레이블의 개수 : 18557


encoder_input_train = pad_sequences(encoder_input_train, maxlen = headlines_max_len, padding='post')
encoder_input_test = pad_sequences(encoder_input_test, maxlen = headlines_max_len, padding='post')
decoder_input_train = pad_sequences(decoder_input_train, maxlen = text_max_len, padding='post')
decoder_target_train = pad_sequences(decoder_target_train, maxlen = text_max_len, padding='post')
decoder_input_test = pad_sequences(decoder_input_test, maxlen = text_max_len, padding='post')
decoder_target_test = pad_sequences(decoder_target_test, maxlen = text_max_len, padding='post')


from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# 인코더 설계 시작
embedding_dim = 128 # 임베딩벡터의 차원 수
hidden_size = 128 # LSTM에서 얼만큼의 수용력(capacity)를 가질지를 정하는 파라미터로 층 1개의 용량의 크기를 나타냄

# 인코더
encoder_inputs = Input(shape=(headlines_max_len,))

# 인코더의 임베딩 층
enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

# 4개로 구성하여 모델의 용량을 늘림
# 인코더의 LSTM 1
encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# 인코더의 LSTM 2
encoder_lstm2 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# 인코더의 LSTM 3
encoder_lstm3 = LSTM(hidden_size, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output3, state_h3, state_c3= encoder_lstm3(encoder_output2)

# 인코더의 LSTM 4
encoder_lstm4 = LSTM(hidden_size, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm4(encoder_output3)

WARNING:tensorflow:Layer lstm_12 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_13 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_14 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
WARNING:tensorflow:Layer lstm_15 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU


# 디코더 설계

decoder_inputs = Input(shape=(None,))

# 디코더의 임베딩 층
dec_emb_layer = Embedding(tar_vocab, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# 디코더의 LSTM
decoder_lstm = LSTM(hidden_size, return_sequences = True, return_state = True, dropout = 0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = [state_h, state_c])

WARNING:tensorflow:Layer lstm_16 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU


# 디코더의 출력층
decoder_softmax_layer = Dense(tar_vocab, activation = 'softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_outputs) 

# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_6 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 11, 128)      1408000     input_6[0][0]                    
__________________________________________________________________________________________________
lstm_12 (LSTM)                  [(None, 11, 128), (N 131584      embedding_5[0][0]                
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 11, 128), (N 131584      lstm_12[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, 11, 128), (N 131584      lstm_13[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 128)    1408000     input_7[0][0]                    
__________________________________________________________________________________________________
lstm_15 (LSTM)                  [(None, 11, 128), (N 131584      lstm_14[0][0]                    
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 128),  131584      embedding_6[0][0]                
                                                                 lstm_15[0][1]                    
                                                                 lstm_15[0][2]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, None, 11000)  1419000     lstm_16[0][0]                    
==================================================================================================
Total params: 4,892,920
Trainable params: 4,892,920
Non-trainable params: 0
__________________________________________________________________________________________________


from attention import AttentionLayer

# 어텐션 층(어텐션 함수)
attn_layer = AttentionLayer(name='attention_layer')
# 인코더와 디코더의 모든 time step의 hidden state를 어텐션 층에 전달하고 결과를 리턴
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# 어텐션의 결과와 디코더의 hidden state들을 연결
decoder_concat_input = Concatenate(axis = -1, name='concat_layer')([decoder_outputs, attn_out])

# 디코더의 출력층
decoder_softmax_layer = Dense(tar_vocab, activation='softmax')
decoder_softmax_outputs = decoder_softmax_layer(decoder_concat_input)

# 모델 정의
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_6 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 11, 128)      1408000     input_6[0][0]                    
__________________________________________________________________________________________________
lstm_12 (LSTM)                  [(None, 11, 128), (N 131584      embedding_5[0][0]                
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 11, 128), (N 131584      lstm_12[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, 11, 128), (N 131584      lstm_13[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 128)    1408000     input_7[0][0]                    
__________________________________________________________________________________________________
lstm_15 (LSTM)                  [(None, 11, 128), (N 131584      lstm_14[0][0]                    
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, None, 128),  131584      embedding_6[0][0]                
                                                                 lstm_15[0][1]                    
                                                                 lstm_15[0][2]                    
__________________________________________________________________________________________________
attention_layer (AttentionLayer ((None, None, 128),  32896       lstm_15[0][0]                    
                                                                 lstm_16[0][0]                    
__________________________________________________________________________________________________
concat_layer (Concatenate)      (None, None, 256)    0           lstm_16[0][0]                    
                                                                 attention_layer[0][0]            
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, None, 11000)  2827000     concat_layer[0][0]               
==================================================================================================
Total params: 6,333,816
Trainable params: 6,333,816
Non-trainable params: 0
__________________________________________________________________________________________________


model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 2)
history = model.fit(x = [encoder_input_train, decoder_input_train], y = decoder_target_train, \
          validation_data = ([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size = 256, callbacks=[es], epochs = 5)

Epoch 1/5
290/290 [==============================] - 140s 483ms/step - loss: 1.1246 - val_loss: 0.9097
Epoch 2/5
290/290 [==============================] - 139s 478ms/step - loss: 0.9029 - val_loss: 0.8577
Epoch 3/5
290/290 [==============================] - 139s 479ms/step - loss: 0.8547 - val_loss: 0.8165
Epoch 4/5
290/290 [==============================] - 139s 478ms/step - loss: 0.8131 - val_loss: 0.7749
Epoch 5/5
290/290 [==============================] - 139s 478ms/step - loss: 0.7669 - val_loss: 0.7376


plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()


src_index_to_word = src_tokenizer.index_word # 원문 단어 집합에서 정수 -> 단어를 얻음
tar_word_to_index = tar_tokenizer.word_index # 요약 단어 집합에서 단어 -> 정수를 얻음
tar_index_to_word = tar_tokenizer.index_word # 요약 단어 집합에서 정수 -> 단어를 얻음


# 인코더 설계
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# 이전 시점의 상태들을 저장하는 텐서
decoder_state_input_h = Input(shape=(hidden_size,))
decoder_state_input_c = Input(shape=(hidden_size,))

dec_emb2 = dec_emb_layer(decoder_inputs)
# 문장의 다음 단어를 예측하기 위해서 초기 상태(initial_state)를 이전 시점의 상태로 사용. 이는 뒤의 함수 decode_sequence()에 구현
# 훈련 과정에서와 달리 LSTM의 리턴하는 은닉 상태와 셀 상태인 state_h와 state_c를 버리지 않음.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])


# 어텐션 함수
decoder_hidden_state_input = Input(shape=(text_max_len, hidden_size))
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# 디코더의 출력층
decoder_outputs2 = decoder_softmax_layer(decoder_inf_concat) 

# 최종 디코더 모델
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])


def decode_sequence(input_seq):
    # 입력으로부터 인코더의 상태를 얻음
    e_out, e_h, e_c = encoder_model.predict(input_seq)

     # <SOS>에 해당하는 토큰 생성
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tar_word_to_index['sostoken']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition: # stop_condition이 True가 될 때까지 루프 반복

        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tar_index_to_word[sampled_token_index]

        if(sampled_token!='eostoken'):
            decoded_sentence += ' '+sampled_token

        #  <eos>에 도달하거나 최대 길이를 넘으면 중단.
        if (sampled_token == 'eostoken'  or len(decoded_sentence.split()) >= (text_max_len-1)):
            stop_condition = True

        # 길이가 1인 타겟 시퀀스를 업데이트
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # 상태를 업데이트 합니다.
        e_h, e_c = h, c

    return decoded_sentence


# 원문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2text(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0):
            temp = temp + src_index_to_word[i]+' '
    return temp

# 요약문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2summary(input_seq):
    temp=''
    for i in input_seq:
        if((i!=0 and i!=tar_word_to_index['sostoken']) and i!=tar_word_to_index['eostoken']):
            temp = temp + tar_index_to_word[i] + ' '
    return temp


for i in range(10, 20):
    print("원문 :", seq2text(encoder_input_test[i]))
    print("실제 요약 :", seq2summary(decoder_input_test[i]))
    print("예측 요약 :", decode_sequence(encoder_input_test[i].reshape(1, headlines_max_len)))
    print("\n")

원문 : lg star acs help save power bills 
실제 요약 : lg star acs help save power bills 
예측 요약 :  want like like people due due sc


원문 : abhishek backs jp dutta day shoot 
실제 요약 : abhishek backs jp dutta day shoot 
예측 요약 :  happy star play time award award


원문 : pak newspaper posed us raw agents indian cleric 
실제 요약 : pak newspaper posed us raw agents indian cleric 
예측 요약 :  pak prez calls us prez st time years


원문 : hot air balloon tourists crashes lake 
실제 요약 : hot air balloon tourists crashes lake 
예측 요약 :  tata army army found plane study


원문 : sc directs file document taj mahal 
실제 요약 : sc directs file document taj mahal 
예측 요약 :  sc orders plea aadhaar water water


원문 : younis first pakistani batsman score test runs 
실제 요약 : younis first pakistani batsman score test runs 
예측 요약 :  kohli wins st ever odi test win


원문 : bible quran placed next kalam statue along gita 
실제 요약 : bible quran placed next kalam statue along gita 
예측 요약 :  sensex slams dhoni takes catch catch catch study


원문 : loves sleep watson dhoni 
실제 요약 : loves sleep watson dhoni 
예측 요약 :  david hits sehwag study


원문 : kartarpur corridor mistake made pm modi 
실제 요약 : kartarpur corridor mistake made pm modi 
예측 요약 :  sensex kumar singh pm modi pm modi


원문 : ex un chief annan get state burial home country ghana 
실제 요약 : ex un chief annan get state burial home country ghana 
예측 요약 :  ex prez chief chief chief amid amid amid amid ban


from summa.summarizer import summarize

data = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1', dtype={'headlines':'str', 'text':'str'})
data = data[['headlines','text']]
data = pd.DataFrame(data)

print(data.type)

print('Summary:')
#print(summarize(data2['headlines']))

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-71-414fa8163a6a> in <module>
      5 data = pd.DataFrame(data)
      6 
----> 7 print(data.type)
      8 
      9 print('Summary:')

~/anaconda3/envs/aiffel/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5139             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5140                 return self[name]
-> 5141             return object.__getattribute__(self, name)
   5142 
   5143     def __setattr__(self, name: str, value) -> None:

AttributeError: 'DataFrame' object has no attribute 'type'

	Id	ProductId	UserId	ProfileName	HelpfulnessNumerator	HelpfulnessDenominator	Score	Time	Summary	Text
0	1	B001E4KFG0	A3SGXH7AUHU8GW	delmartian	1	1	5	1303862400	Good Quality Dog Food	I have bought several of the Vitality canned d...
1	2	B00813GRG4	A1D87F6ZCVE5NK	dll pa	0	0	1	1346976000	Not as Advertised	Product arrived labeled as Jumbo Salted Peanut...
2	3	B000LQOCH0	ABXLMWJIXXAIN	Natalia Corres "Natalia Corres"	1	1	4	1219017600	"Delight" says it all	This is a confection that has been around a fe...
3	4	B000UA0QIQ	A395BORC6FGVXV	Karl	3	3	2	1307923200	Cough Medicine	If you are looking for the secret ingredient i...
4	5	B006K2ZZ7K	A1UQRSCLF8GW1T	Michael D. Bigham "M. Wassir"	0	0	5	1350777600	Great taffy	Great taffy at a great price. There was a wid...

	Text	Summary
80722	the noodles are super al dente and bouncy, unl...	Al dente
54275	The Dolce Gusto Cappuccino is weak and watery,...	Not impressed
28109	This is a quick an easy breakfast. I just had ...	Nice breakfast
73542	This flavor is my favorite. I put it in my cof...	Torani sugar-free borwn sugar cinnamon syrup
33962	Was told by a friend to try Nettle tea for all...	Really works for hayfever (allergies)...
17545	This makes for a great cup of coffee! It's th...	My favorite!
45504	Like most other reviewers have stated, the gin...	Recommended for ginger fans.
64062	I was ver disappointed when I opened this to f...	Unreal
30728	These are very delicious, not quite the standa...	nice 100-calorie snack
93992	My dogs love these treats and I was buying the...	Great Product, Great deal
33427	Oatmeal cookies can be found in pretty much an...	Nothing special - except the price
80064	We were turned on to African Nectar by our son...	Flavorful change of pace
56087	This arrived at least a week late with no refr...	Bad meat.
8080	Very tasty, I used it for years, they recently...	Excellent seasoning
43628	I bought this cat food because of the wonderfu...	Terrible cat food

	Text	Summary	decoder_input	decoder_target
0	bought several vitality canned dog food produc...	good quality dog food	sostoken good quality dog food	good quality dog food eostoken
1	product arrived labeled jumbo salted peanuts p...	not as advertised	sostoken not as advertised	not as advertised eostoken
2	confection around centuries light pillowy citr...	delight says it all	sostoken delight says it all	delight says it all eostoken
3	looking secret ingredient robitussin believe f...	cough medicine	sostoken cough medicine	cough medicine eostoken
4	great taffy great price wide assortment yummy ...	great taffy	sostoken great taffy	great taffy eostoken

	headlines	text
24119	Pakistan election results tainted: Former PM N...	Former Pakistan PM Nawaz Sharif has rejected t...
4578	Someone shoot me: Irani jokes after Janhvi Kap...	Union Minister Smriti Irani took to Instagram ...
8542	'I don't believe it': Trump on own administrat...	US President Donald Trump has said he doesn't ...
50525	Film based on player's retirement poem nominat...	'Dear Basketball', an animated version of five...
8944	Harbhajan didn't slap, he hit me with back of ...	Recalling the 2008 IPL slapgate controversy in...
94719	SC notice to 6 states on plea seeking ban on c...	The Supreme Court on Friday gave three-week ti...
87164	Malware 'Judy' hits 3.65 crore Android users	A malware named 'Judy' has attacked over 3.65 ...
41188	Special status synonymous with jobs for youth:...	YSR Congress Party chief Jagan Mohan Reddy on ...
76975	Amazon sells toilet paper with US Prez Donald ...	Amazon is selling toilet paper with US Preside...
72239	NASA to use Earth as slingshot to propel space...	In a first-of-its-kind mission to bring back a...

	headlines	text	decoder_input	decoder_target
0	upgrad learner switches career ml al salary hike	saurav kant an alumnus of upgrad and iiit pg p...	sostoken upgrad learner switches career ml al ...	upgrad learner switches career ml al salary hi...
1	delhi techie wins free food swiggy one year cred	kunal shah credit card bill payment platform c...	sostoken delhi techie wins free food swiggy on...	delhi techie wins free food swiggy one year cr...
2	new zealand end rohit sharma led india match w...	new zealand defeated india by wickets in the f...	sostoken new zealand end rohit sharma led indi...	new zealand end rohit sharma led india match w...
3	aegon life iterm insurance plan helps customer...	with aegon life iterm insurance plan customers...	sostoken aegon life iterm insurance plan helps...	aegon life iterm insurance plan helps customer...
4	known hirani yrs metoo claims true sonam	speaking about the sexual harassment allegatio...	sostoken known hirani yrs metoo claims true sonam	known hirani yrs metoo claims true sonam eostoken

A.I

A.I

Explolation11 텍스트 요약 본문

Explolation11 텍스트 요약

텍스트 요약¶

1. 추출적 요약(Extractive Summarization)¶

2. 추상적 요약(Abstractive Summarization)¶

1. Seq2Seq 모델을 이용한 텍스트 추상적 요약¶

seq2seq란?¶

훈련 요약¶

어텐션 메커니즘이란?¶

데이터 준비¶

데이터 전처리¶

텍스트 정규화(text normalization)¶

훈련데이터와 테스트데이터 나누기¶

시작 토큰과 종료 토큰 추가하기¶

훈련, 테스트 데이터 분리¶

단어 집합(vocaburary) 만들기 및 정수 인코딩 - 각 단어에 고유정수를 매핑¶

패딩¶

모델 설계¶

중간 모델 정리¶

어텐션 매커니즘¶

모델 훈련¶

인퍼런스 모델 구현하기 - 정수 데이터를 다시 실제 데이터로 복원¶

모델 테스트¶

2. summarize 모듈을 이용한 추출적 요약¶

summarize 사용하기¶

3. 프로젝트: 뉴스기사 요약해보기¶

3-1. 추상적 요약¶

데이터 수집하기¶

데이터 분리¶

단어집합 만들기¶

3-2. 추출적 요약¶

정리¶

'AIFFEL' 카테고리의 다른 글

티스토리툴바

Explolation15 챗봇 만들기 (0)	2021.03.03
Explolation14 의료영상 진단 (0)	2021.02.25
Explolation13 주식 가격 예측 (0)	2021.02.23
Explolation12 생성자 모델링 (0)	2021.02.23
Explolation10 인물사진 배경바꿔보기 (0)	2021.02.04

« 2024/07 »
일	월	화	수	목	금	토
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31