A.I
활성화 함수의 이해 본문
활성화 함수의 이해¶
- wget https://aiffelstaticprd.blob.core.windows.net/media/original_images/jindo_dog.jpg
- mkdir -p ~/aiffel/activation
- mv jindo_dog.jpg ~/aiffel/activation
In [1]:
# Standard imports for the activation-function walkthrough.
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from itertools import product
import tensorflow as tf
# Fix the TF RNG so model initialisations are reproducible across runs.
tf.random.set_seed(7879)
print('Ready to activate?⤴')
Ready to activate?⤴
In [3]:
def binary_step(x, threshold=0):
    """Binary step activation: 0 below `threshold`, 1 at or above it.

    When an activation takes a threshold, always give it a default value
    so the function can still be used as a plain one-argument activation.
    """
    if x < threshold:
        return 0
    return 1
In [4]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
def plot_and_visulize(image_url, function, derivative=False):
    """Render a 3x2 diagnostic figure for an activation function.

    Panels: the function curve; its derivative (if supplied); a histogram of
    zero-centred random samples before and after activation; and a grayscale
    image before and after applying the activation pixel-wise.

    Args:
        image_url: path of the image used for the pixel-wise demo.
        function: scalar activation function.
        derivative: scalar derivative of `function`, or False to skip panel 2.

    Returns:
        The `matplotlib.pyplot` module, so the caller can call `.show()`.
    """
    xs = [-10 + i / 100 for i in range(2000)]
    ys = [function(v) for v in xs]

    plt.figure(figsize=(12, 12))

    # Panel 1: the activation curve itself.
    plt.subplot(3, 2, 1)
    plt.title('function')
    plt.plot(xs, ys)

    # Panel 2: analytic derivative (left empty when derivative is False).
    plt.subplot(3, 2, 2)
    plt.title('derivative')
    if derivative:
        plt.plot(xs, [derivative(v) for v in xs])

    # Panels 3-4: distribution of zero-centred random samples,
    # before and after passing through the activation.
    samples = np.random.rand(1000)
    samples -= np.mean(samples)
    plt.subplot(3, 2, 3)
    plt.title('samples')
    plt.hist(samples, 100)

    activated = [function(v) for v in samples]
    plt.subplot(3, 2, 4)
    plt.title('activation values')
    plt.hist(activated, 100)

    # Panels 5-6: the original image (first channel, median-centred,
    # used as a stand-in for gray-scale) and the activated image.
    image = np.array(Image.open(image_url), dtype=np.float64)[:, :, 0] / 255.
    image -= np.median(image)
    plt.subplot(3, 2, 5)
    plt.title('origin image')
    plt.imshow(image, cmap='gray')

    activation_image = np.zeros(image.shape)
    h, w = image.shape
    for col in range(w):
        for row in range(h):
            activation_image[row][col] += function(image[row][col])
    plt.subplot(3, 2, 6)
    plt.title('activation results')
    plt.imshow(activation_image, cmap='gray')

    return plt
In [5]:
import os
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
ax = plot_and_visulize(img_path, binary_step)
ax.show()
In [6]:
# 퍼셉트론
class Perceptron(object):
    """A single-layer perceptron with a pluggable scalar activation."""

    def __init__(self, input_size, activation_ftn, threshold=0, learning_rate=0.01):
        """Randomly initialise weights and bias for `input_size` inputs.

        Args:
            input_size: number of input features.
            activation_ftn: scalar function of (summation, threshold);
                vectorised internally so it also works on arrays of sums.
            threshold: forwarded to the activation on every call.
            learning_rate: step size of the perceptron update rule.
        """
        self.weights = np.random.randn(input_size)
        self.bias = np.random.randn(1)
        self.activation_ftn = np.vectorize(activation_ftn)
        self.learning_rate = learning_rate
        self.threshold = threshold

    def train(self, training_inputs, labels, epochs=100, verbose=1):
        """Fit with the classic perceptron learning rule.

        verbose : 1 - print accuracy after every epoch,
                  0 - print only the final epoch's accuracy.
        (Fixed: the original docstring had the two modes swapped relative
        to the actual behavior.)
        """
        for epoch in range(epochs):
            for inputs, label in zip(training_inputs, labels):
                prediction = self.__call__(inputs)
                # w <- w + lr*(y - y_hat)*x ; b <- b + lr*(y - y_hat)
                self.weights += self.learning_rate * (label - prediction) * inputs
                self.bias += self.learning_rate * (label - prediction)
            if verbose == 1:
                pred = self.__call__(training_inputs)
                accuracy = np.sum(pred == labels) / len(pred)
                print(f'{epoch}th epoch, accuracy : {accuracy}')
        if verbose == 0:
            pred = self.__call__(training_inputs)
            accuracy = np.sum(pred == labels) / len(pred)
            print(f'{epoch}th epoch, accuracy : {accuracy}')

    def get_weights(self):
        """Return the learned (weights, bias) pair."""
        return self.weights, self.bias

    def __call__(self, inputs):
        """Affine combination of inputs followed by the activation."""
        summation = np.dot(inputs, self.weights) + self.bias
        return self.activation_ftn(summation, self.threshold)
In [7]:
def scatter_plot(plt, X, y, threshold=0, three_d=False):
    """Scatter the points of X, split by whether y exceeds `threshold`.

    `plt` may be the pyplot module or an Axes object; it is returned so the
    caller can keep using it. With `three_d=True` the points are drawn on a
    3D axis together with a gray zero-plane for reference.
    """
    ax = plt
    # Masked arrays drive the marker sizes: a masked size hides that point
    # from the corresponding scatter call.
    above = np.ma.masked_where(y <= threshold, y)
    below = np.ma.masked_where(y > threshold, y + 1)
    if three_d:
        ax.scatter(X[:, 0], X[:, 1], y - threshold, s=above, label='True')
        ax.scatter(X[:, 0], X[:, 1], y - threshold, s=below, label='False')
        ax.scatter(X[:, 0], X[:, 1], 0, s=0.05, label='zero', c='gray')
    else:
        ax.scatter(X[:, 0], X[:, 1], s=above * 10, label='True')
        ax.scatter(X[:, 0], X[:, 1], s=below * 10, label='False')
    ax.legend()
    return ax
In [8]:
# AND gate, OR gate: truth-table inputs and target labels, visualised.
X = np.array([[0,0], [1,0], [0,1], [1,1]])
plt.figure(figsize=(10,5))
# OR gate: label is the bitwise OR of the two inputs.
or_y = np.array([x1 | x2 for x1,x2 in X])
ax1 = plt.subplot(1,2,1)
ax1.set_title('OR gate ' + str(or_y))
ax1 = scatter_plot(ax1, X, or_y)
# AND gate: label is the bitwise AND of the two inputs.
and_y = np.array([x1 & x2 for x1,x2 in X])
ax2 = plt.subplot(1,2,2)
ax2.set_title('AND gate ' + str(and_y))
ax2 = scatter_plot(ax2, X, and_y)
plt.show()
In [9]:
# Train one single-layer perceptron per gate with the binary step activation.
# OR gate
or_p = Perceptron(input_size=2, activation_ftn=binary_step)
or_p.train(X, or_y, epochs=1000, verbose=0)
print(or_p.get_weights()) # weights and bias may differ from run to run
# AND gate
and_p = Perceptron(input_size=2, activation_ftn=binary_step)
and_p.train(X, and_y, epochs=1000, verbose=0)
print(and_p.get_weights()) # weights and bias may differ from run to run
999th epoch, accuracy : 1.0 (array([0.01277841, 0.58898094]), array([-0.00408796])) 999th epoch, accuracy : 1.0 (array([1.31756056, 0.00349803]), array([-1.32064038]))
In [10]:
from itertools import product
# Plot the trained perceptrons' predictions over a dense grid of the unit square.
test_X = np.array([[x/100,y/100] for (x,y) in product(range(101),range(101))])
pred_or_y = or_p(test_X)
pred_and_y = and_p(test_X)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(2,2,1)
ax1.set_title('predict OR gate')
ax1 = scatter_plot(ax1, test_X, pred_or_y)
ax2 = plt.subplot(2,2,2, projection='3d')
ax2.set_title('predict OR gate 3D')
ax2 = scatter_plot(ax2, test_X, pred_or_y, three_d=True)
ax3 = plt.subplot(2,2,3)
ax3.set_title('predict AND gate')
ax3 = scatter_plot(ax3, test_X, pred_and_y)
ax4 = plt.subplot(2,2,4, projection='3d')
ax4.set_title('predict AND gate 3D')
ax4 = scatter_plot(ax4, test_X, pred_and_y, three_d=True)
plt.show()
이진 계단 함수의 한계¶
- 단층 퍼셉트론의 한계로 XOR gate의 문제를 해결할 수 없음 (하나의 선형 결정 경계로는 XOR을 분리할 수 없기 때문)
- 역전파 알고리즘(backpropagation algorithm)을 사용하지 못함. 미분을 할 수 없어 역전파에서 가중치들이 업데이트되지 않음.
- 다중 출력은 할 수 없음.
In [11]:
# XOR gate: label is the bitwise XOR of the two inputs.
threshold = 0
X = np.array([[0,0], [1,0], [0,1], [1,1]])
plt.figure(figsize=(5,5))
xor_y = np.array([x1 ^ x2 for x1,x2 in X])
plt.title('XOR gate '+ str(xor_y))
# NOTE: rebinds the name `plt`; scatter_plot returns its first argument,
# so `plt` is still the pyplot module afterwards.
plt = scatter_plot(plt, X, xor_y)
plt.show()
In [12]:
# Can a single-layer perceptron solve XOR? (The printed accuracy below
# stays at 0.5 — XOR is not linearly separable.)
xor_p = Perceptron(input_size=2, activation_ftn=binary_step, threshold=threshold)
xor_p.train(X, xor_y, epochs=1000, verbose=0)
print(xor_p.get_weights())
# Plot the (failed) predictions over a dense grid.
test_X = np.array([[x/100,y/100] for (x,y) in product(range(101),range(101))])
pred_xor_y = xor_p(test_X)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(1,2,1)
ax1.set_title('predict XOR gate?')
ax1 = scatter_plot(ax1, test_X, pred_xor_y)
ax2 = plt.subplot(1,2,2, projection='3d')
ax2.set_title('predict XOR gate 3D?')
ax2 = scatter_plot(ax2, test_X, pred_xor_y, three_d=True)
plt.show()
999th epoch, accuracy : 0.5 (array([ 0.00612787, -0.02724664]), array([0.00342468]))
2. 선형 활성화 함수¶
In [13]:
import os
# Path of the demo image downloaded in the setup step.
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
# Linear (identity) activation
def linear(x):
    """Identity activation: passes the input straight through."""
    return x

def dev_linear(x):
    """Derivative of the identity activation: constant 1 everywhere."""
    return 1
# Visualise the linear activation and its derivative.
ax = plot_and_visulize(img_path, linear, dev_linear)
ax.show()
In [14]:
# AND gate, OR gate: rebuild the truth-table data and visualise it again.
threshold = 0
X = np.array([[0,0], [1,0], [0,1], [1,1]])
plt.figure(figsize=(10,5))
# OR gate
or_y = np.array([x1 | x2 for x1,x2 in X])
ax1 = plt.subplot(1,2,1)
ax1.set_title('OR gate ' + str(or_y))
ax1 = scatter_plot(ax1, X, or_y)
# AND gate
and_y = np.array([x1 & x2 for x1,x2 in X])
ax2 = plt.subplot(1,2,2)
ax2.set_title('AND gate ' + str(and_y))
ax2 = scatter_plot(ax2, X, and_y)
plt.show()
In [15]:
import tensorflow as tf
# One Dense(1) layer with a linear activation per gate — equivalent to a
# single affine unit, so it can only learn linearly separable targets.
# OR gate model
or_linear_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,), dtype='float64'),
tf.keras.layers.Dense(1, activation='linear')
])
or_linear_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
or_linear_model.summary()
# AND gate model
and_linear_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,), dtype='float64'),
tf.keras.layers.Dense(1, activation='linear')
])
and_linear_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
and_linear_model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 1) 3 ================================================================= Total params: 3 Trainable params: 3 Non-trainable params: 0 _________________________________________________________________ Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_1 (Dense) (None, 1) 3 ================================================================= Total params: 3 Trainable params: 3 Non-trainable params: 0 _________________________________________________________________
In [16]:
# Fit both linear models silently (verbose=0) for 1000 epochs.
or_linear_model.fit(X, or_y, epochs=1000, verbose=0)
and_linear_model.fit(X, and_y, epochs=1000, verbose=0)
print('done')
done
In [17]:
# Plot the linear models' predictions over a dense grid; predictions are
# continuous, so classify with threshold=0.5.
test_X = np.array([[x/100,y/100] for (x,y) in product(range(101),range(101))])
pred_or_y = or_linear_model(test_X)
pred_and_y = and_linear_model(test_X)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(2,2,1)
ax1.set_title('predict OR gate')
ax1 = scatter_plot(ax1, test_X, pred_or_y, threshold=0.5)
ax2 = plt.subplot(2,2,2, projection='3d')
ax2.set_title('predict OR gate 3D')
ax2 = scatter_plot(ax2, test_X, pred_or_y, threshold=0.5, three_d=True)
ax3 = plt.subplot(2,2,3)
ax3.set_title('predict AND gate')
ax3 = scatter_plot(ax3, test_X, pred_and_y, threshold=0.5)
ax4 = plt.subplot(2,2,4, projection='3d')
ax4.set_title('predict AND gate 3D')
ax4 = scatter_plot(ax4, test_X, pred_and_y, threshold=0.5, three_d=True)
plt.show()
/home/ssac24/anaconda3/envs/aiffel/lib/python3.7/site-packages/matplotlib/collections.py:922: RuntimeWarning: invalid value encountered in sqrt scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
In [18]:
# XOR gate with a purely linear model — expected to fail, since stacking
# linear layers still yields a linear decision function.
xor_linear_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,), dtype='float64'),
tf.keras.layers.Dense(1, activation='linear')
])
xor_linear_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
xor_linear_model.fit(X, xor_y, epochs=1000, verbose=0)
# Plot the predictions over a dense grid.
test_X = np.array([[x/100,y/100] for (x,y) in product(range(101),range(101))])
pred_xor_y = xor_linear_model(test_X)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(1,2,1)
ax1.set_title('predict XOR gate')
ax1 = scatter_plot(ax1, test_X, pred_xor_y, threshold=0.5)
ax2 = plt.subplot(1,2,2, projection='3d')
ax2.set_title('predict XOR gate 3D')
ax2 = scatter_plot(ax2, test_X, pred_xor_y, threshold=0.5, three_d=True)
plt.show()
3. 비선형 활성화 함수-시그모이드, Softmax¶
- 시그모이드는 Binary Classification 이진분류시 많이 쓰이고,
- SoftMax는 10가지, 100가지 class 등 class의 수에 제한 없이 "각 class의 확률"을 구할 때 쓰입니다.
In [19]:
import os
# Path of the demo image downloaded in the setup step.
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
# Sigmoid (logistic) activation
def sigmoid(x):
    """Logistic sigmoid: maps any real input into the open interval (0, 1)."""
    return 1 / (1 + np.exp(-x).astype(np.float64))

def dev_sigmoid(x):
    """Derivative of the sigmoid: s(x) * (1 - s(x))."""
    s = sigmoid(x)
    return s * (1 - s)
# Visualise the sigmoid activation and its derivative.
ax = plot_and_visulize(img_path, sigmoid, dev_sigmoid)
ax.show()
In [22]:
# Numerical differentiation (forward difference).
def num_derivative(x, function):
    """Forward-difference estimate of function'(x).

    h = 1e-15 is deliberately tiny — vary it and re-plot to see how
    floating-point cancellation degrades the estimate.
    """
    h = 1e-15
    return (function(x + h) - function(x)) / h
# Compare the analytic sigmoid derivative with the numerical estimate
# and plot their difference.
diff_X = [-5 + x / 100 for x in range(1001)]
dev_y = np.array([dev_sigmoid(x) for x in diff_X])
num_dev_y = np.array([num_derivative(x, sigmoid) for x in diff_X])
diff_y = dev_y - num_dev_y
plt.plot(diff_X, num_dev_y, label='numerical')
plt.plot(diff_X, dev_y, label='analytic')
plt.plot(diff_X, diff_y, label='difference')  # fixed label typo ('differnce')
plt.legend()
plt.show()
시그모이드 함수의 단점¶
- 0 또는 1에서 포화(saturate)되어 그래디언트가 0과 아주 가까워지기때문에 역전파시 가중치 업데이트가 일어나지 못한다.
- 0이 중심(zero-centered)이 아니라서 훈련의 시간이 오래걸리게 된다는 것입니다.
In [23]:
# Train a single Dense(1, sigmoid) model per gate. OR/AND succeed; XOR
# still fails — one sigmoid unit still has a linear decision boundary.
# OR gate
or_sigmoid_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,)),
tf.keras.layers.Dense(1, activation='sigmoid')
])
or_sigmoid_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
or_sigmoid_model.fit(X, or_y, epochs=1000, verbose=0)
# AND gate
and_sigmoid_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,)),
tf.keras.layers.Dense(1, activation='sigmoid')
])
and_sigmoid_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
and_sigmoid_model.fit(X, and_y, epochs=1000, verbose=0)
# XOR gate
xor_sigmoid_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,)),
tf.keras.layers.Dense(1, activation='sigmoid')
])
xor_sigmoid_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
xor_sigmoid_model.fit(X, xor_y, epochs=1000, verbose=0)
# Plot all three models' predictions over a dense grid (threshold 0.5).
test_X = np.array([[x/100,y/100] for (x,y) in product(range(101),range(101))])
pred_or_y = or_sigmoid_model(test_X)
pred_and_y = and_sigmoid_model(test_X)
pred_xor_y = xor_sigmoid_model(test_X)
plt.figure(figsize=(10,15))
ax1 = plt.subplot(3,2,1)
ax1.set_title('predict OR gate')
ax1 = scatter_plot(ax1, test_X, pred_or_y, threshold=0.5)
ax2 = plt.subplot(3,2,2, projection='3d')
ax2.set_title('predict OR gate 3D')
ax2 = scatter_plot(ax2, test_X, pred_or_y, threshold=0.5, three_d=True)
ax3 = plt.subplot(3,2,3)
ax3.set_title('predict AND gate')
ax3 = scatter_plot(ax3, test_X, pred_and_y, threshold=0.5)
ax4 = plt.subplot(3,2,4, projection='3d')
ax4.set_title('predict AND gate 3D')
ax4 = scatter_plot(ax4, test_X, pred_and_y, threshold=0.5, three_d=True)
ax5 = plt.subplot(3,2,5)
ax5.set_title('predict XOR gate')
ax5 = scatter_plot(ax5, test_X, pred_xor_y, threshold=0.5)
ax6 = plt.subplot(3,2,6, projection='3d')
ax6.set_title('predict XOR gate 3D')
ax6 = scatter_plot(ax6, test_X, pred_xor_y, threshold=0.5, three_d=True)
plt.show()
In [24]:
# With an extra hidden layer, XOR becomes learnable: the hidden sigmoid
# units give the model a non-linear decision boundary.
# XOR gate
xor_sigmoid_model = tf.keras.Sequential([
tf.keras.layers.Input(shape=(2,)),
tf.keras.layers.Dense(2, activation='sigmoid'), # changed to 2 hidden nodes
tf.keras.layers.Dense(1)
])
xor_sigmoid_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
xor_sigmoid_model.fit(X, xor_y, epochs=1000, verbose=0)
plt.figure(figsize=(10,5))
pred_xor_y = xor_sigmoid_model(test_X)
ax1 = plt.subplot(1,2,1)
ax1.set_title('predict XOR gate')
ax1 = scatter_plot(ax1, test_X, pred_xor_y, threshold=0.5)
ax2 = plt.subplot(1,2,2, projection='3d')
ax2.set_title('predict XOR gate 3D')
ax2 = scatter_plot(ax2, test_X, pred_xor_y, threshold=0.5, three_d=True)
plt.show()
4. 하이퍼볼릭 탄젠트¶
- -1과 1에서 포화된다는 단점이 존재
In [25]:
import os
# Path of the demo image downloaded in the setup step.
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
# Hyperbolic tangent activation
def tanh(x):
    """Hyperbolic tangent built from exponentials; output lies in (-1, 1)."""
    e_pos = np.exp(x)
    e_neg = np.exp(-x)
    return (e_pos - e_neg) / (e_pos + e_neg)

def dev_tanh(x):
    """Derivative of tanh: 1 - tanh(x)^2."""
    return 1 - tanh(x) ** 2
# Visualise the tanh activation and its derivative.
ax = plot_and_visulize(img_path, tanh, dev_tanh)
ax.show()
5. Relu¶
치역은 [0, ∞) — 무한대는 포함되지 않는 반개구간
함수의 출력값의 중심이 0이 아니며, 또한 입력이 음수 구간에 머물면 그래디언트가 0이 되어 뉴런이 더 이상 갱신되지 않는 'Dying ReLU' 문제가 발생할 수 있음. 학습률(learning rate)을 크게 잡을 때 자주 발생합니다.
In [26]:
import os
# Path of the demo image downloaded in the setup step.
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
# ReLU activation
def relu(x):
    """Rectified linear unit: identity for positive inputs, zero otherwise."""
    return x if x > 0 else 0
# Visualise the ReLU activation (no analytic derivative supplied).
ax = plot_and_visulize(img_path, relu)
ax.show()
In [27]:
# Noisy quadratic dataset: y = x^2 + N(0,1)*10 on x in [-10, 10.01).
q_X = np.array([-10+x/100 for x in range(2001)])
q_y = np.array([(x)**2 + np.random.randn(1)*10 for x in q_X])
plt.scatter(q_X, q_y, s=0.5)
Out[27]:
<matplotlib.collections.PathCollection at 0x7fc0a4599950>
In [28]:
# Two ReLU regressors with the same parameter count (19 each): one wide
# hidden layer vs. three stacked narrow layers.
approx_relu_model_p = tf.keras.Sequential([
tf.keras.layers.Input(shape=(1,)),
tf.keras.layers.Dense(6, activation='relu'), # 6 nodes connected in parallel
tf.keras.layers.Dense(1)
])
approx_relu_model_p.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), metrics=['accuracy'])
approx_relu_model_p.fit(q_X, q_y, batch_size=32, epochs=100, verbose=0)
approx_relu_model_s = tf.keras.Sequential([
tf.keras.layers.Input(shape=(1,)),
tf.keras.layers.Dense(2, activation='relu'),# 2-node layers stacked 3 times in series
tf.keras.layers.Dense(2, activation='relu'),
tf.keras.layers.Dense(2, activation='relu'),
tf.keras.layers.Dense(1)
])
approx_relu_model_s.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), metrics=['accuracy'])
approx_relu_model_s.fit(q_X, q_y, batch_size=32, epochs=100, verbose=0)
approx_relu_model_p.summary()
approx_relu_model_s.summary()
Model: "sequential_7" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_8 (Dense) (None, 6) 12 _________________________________________________________________ dense_9 (Dense) (None, 1) 7 ================================================================= Total params: 19 Trainable params: 19 Non-trainable params: 0 _________________________________________________________________ Model: "sequential_8" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_10 (Dense) (None, 2) 4 _________________________________________________________________ dense_11 (Dense) (None, 2) 6 _________________________________________________________________ dense_12 (Dense) (None, 2) 6 _________________________________________________________________ dense_13 (Dense) (None, 1) 3 ================================================================= Total params: 19 Trainable params: 19 Non-trainable params: 0 _________________________________________________________________
In [29]:
# Reshape to (N, 1) so each scalar becomes a one-feature sample for Dense.
q_test_X = q_X.reshape((*q_X.shape,1))
plt.figure(figsize=(10,5))
ax1 = plt.subplot(1,2,1)
ax1.set_title('parallel')
pred_y_p = approx_relu_model_p(q_test_X)
ax1.plot(q_X, pred_y_p)
ax2 = plt.subplot(1,2,2)
ax2.set_title('serial')
pred_y_s = approx_relu_model_s(q_test_X)
ax2.plot(q_X, pred_y_s)
plt.show()
Leaky ReLU¶
'Dying ReLU'를 발생시켰던 0을 출력하던 부분을 아주 작은 음수값을 출력하게 만들어 주어 해당 문제를 해결
In [30]:
import os
# Path of the demo image downloaded in the setup step.
img_path = os.getenv('HOME')+'/aiffel/activation/jindo_dog.jpg'
# Leaky ReLU activation
def leaky_relu(x):
    """Leaky ReLU: slope 0.02 for negative inputs instead of a flat zero."""
    if x > 0:
        return x
    return 0.02 * x
# Visualise the leaky ReLU activation.
ax = plot_and_visulize(img_path, leaky_relu)
ax.show()
PReLU¶
- 새로운 파라미터를 추가하여 0 미만일 때의 '기울기'가 훈련되게 함
In [31]:
# Parametric ReLU: the negative-side slope `alpha` is a trainable parameter.
def prelu(x, alpha):
    """PReLU: identity for positive x, slope `alpha` for non-positive x."""
    if x > alpha * x:
        return x
    return alpha * x
# Visualise PReLU with the parameter fixed at alpha=0.1.
ax = plot_and_visulize(img_path, lambda x: prelu(x, 0.1)) # when parameter alpha=0.1
ax.show()
ELU¶
- 0이 중심점이 아니었던 단점과, 'Dying ReLU'문제를 해결
- exponential 연산이 들어가서 계산 비용이 높아졌다는 단점이 있음
In [32]:
# ELU activation (the original comment mislabelled this as "leaky relu").
def elu(x, alpha):
    """ELU: identity for x > 0, alpha * (e^x - 1) for x <= 0."""
    if x > 0:
        return x
    return alpha * (np.exp(x) - 1)

def dev_elu(x, alpha):
    """Derivative of ELU: 1 for x > 0, elu(x, alpha) + alpha otherwise."""
    if x > 0:
        return 1
    return elu(x, alpha) + alpha
# Visualise ELU and its derivative with alpha=1.
ax = plot_and_visulize(img_path, lambda x: elu(x, 1), lambda x: dev_elu(x, 1)) # when alpha is 1
ax.show()
'파이썬 & AI 학습' 카테고리의 다른 글
파이썬 데이터베이스 만들기 Pandas (0) | 2021.03.14 |
---|---|
딥러닝 레이어에 대한 이해 (0) | 2021.03.12 |
Hadoop & Spark (0) | 2021.03.10 |
Linear, Convolution layer (0) | 2021.03.06 |
MapReduce 원리 (0) | 2021.03.04 |