Let's implement a neural network model whose weights are regularized with the MCP penalty!
MCP penalty
The Minimax Concave Penalty (MCP) is defined as
\[P_r(x;\lambda) = \begin{cases} \lambda \vert x \vert - \dfrac{x^2}{2r} & \text{if } \vert x \vert \leq r \lambda \\ \dfrac{r \lambda^2}{2} & \text{if } \vert x \vert > r \lambda \end{cases}\]
This can be implemented as a Python (TensorFlow) function as follows.
import numpy as np
import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras import layers

@tf.function
def MCP(weight, lambda_, r):
    penalty1 = lambda_ * tf.abs(weight) - tf.math.square(weight) / (2. * r)
    penalty2 = tf.math.square(lambda_) * r / 2.
    return tf.reduce_sum(penalty1 * tf.cast(tf.abs(weight) <= r * lambda_, tf.float32) + penalty2 * tf.cast(tf.abs(weight) > r * lambda_, tf.float32))
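As a quick sanity check (a small sketch; the test values below are arbitrary), the penalty should behave like the Lasso near zero and saturate at $r\lambda^2/2$ once $\vert x \vert > r\lambda$:
print(MCP(tf.constant(0.5), 3., 1.))   # 3*0.5 - 0.5**2/2 = 1.375 (near-zero region)
print(MCP(tf.constant(10.0), 3., 1.))  # 1*3**2/2 = 4.5 (flat region)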
We can compare the shape of the MCP function with a Lasso penalty that uses the same $\lambda$ value with the following code.
import matplotlib.pyplot as plt

x_grid = np.linspace(-5, 5, 1000)
plt.figure(figsize=(6, 4))
plt.plot(x_grid, [MCP(tf.cast(x, tf.float32), 3., 1.) for x in x_grid], label='MCP')
plt.plot(x_grid, 3. * np.abs(x_grid), label='lasso')
plt.legend()
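The plot shows the key difference from the Lasso: both penalties grow linearly near zero, but MCP flattens out. Differentiating the definition above gives
\[\frac{\partial P_r(x;\lambda)}{\partial x} = \begin{cases} \operatorname{sign}(x)\left(\lambda - \dfrac{\vert x \vert}{r}\right) & \text{if } \vert x \vert \leq r \lambda \\ 0 & \text{if } \vert x \vert > r \lambda \end{cases}\]
so weights with $\vert x \vert > r\lambda$ receive no shrinkage at all, which avoids the bias that the Lasso puts on large coefficients.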
Regression with MCP
To check that the implemented MCP penalty actually performs variable selection, we run a simple regression.
custom MCP regression layer
We define a custom layer MCPLayer that performs simple linear regression, and use add_loss to add the MCP penalty to the loss.
class MCPLayer(K.layers.Layer):
    def __init__(self, h, output_dim, lambda_, r, **kwargs):
        super(MCPLayer, self).__init__(**kwargs)
        self.input_dim = h.shape[-1]
        self.output_dim = output_dim
        self.lambda_ = lambda_
        self.r = r
        self.MCP = MCP
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value=w_init(shape=(self.input_dim, self.output_dim),
                                                  dtype='float32'),
                             trainable=True)

    def call(self, x):
        h = tf.matmul(x, self.w)
        self.add_loss(self.MCP(self.w, self.lambda_, self.r))  # register the MCP penalty as a layer loss
        return h
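Because the penalty is registered through add_loss inside call, it shows up in model.losses whenever the layer is used in a model. A minimal sketch of the mechanism (the tiny 3-feature model and its values are assumptions, separate from the experiment below):
toy_input = layers.Input((3,))
toy_output = MCPLayer(toy_input, 1, 1., 2.)(toy_input)
toy_model = K.models.Model(toy_input, toy_output)
_ = toy_model(np.random.normal(size=(2, 3)).astype('float32'))  # run once eagerly
print(toy_model.losses)  # one-element list: the MCP penalty of the layer's current weights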
experiment setting
output_dim = 1
p = 100
n = 50

# MCP hyperparameters
lambda_ = 5.
r = 2.

input_layer = layers.Input((p,))
custom_layer = MCPLayer(input_layer, output_dim, lambda_, r)
outputs = custom_layer(input_layer)
model = K.models.Model(input_layer, outputs)
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 100)] 0
_________________________________________________________________
mcp_layer (MCPLayer) (None, 1) 100
=================================================================
Total params: 100
Trainable params: 100
Non-trainable params: 0
_________________________________________________________________
beta = np.zeros((p, 1))
beta[:4, 0] = np.array([1, 2, 3, -4])  # only the first 4 regression coefficients are nonzero!
X = np.random.normal(size=(n, p))
y = X @ beta + np.random.normal(size=(n, 1))
training
optimizer = K.optimizers.SGD(0.0007)
for i in range(10000):
    with tf.GradientTape() as tape:
        yhat = model(X)
        loss = tf.reduce_sum(tf.math.square(yhat - y))
        loss += sum(model.losses)  # add the MCP penalty registered by MCPLayer
    grad = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grad, model.trainable_weights))
    if i % 100:
        diff = tf.reduce_sum(tf.math.square(model.weights[0] - beta))  # stopping rule based on the distance to the true coefficients
        print(diff)
        if diff < 1:
            break
tf.Tensor(24.235529, shape=(), dtype=float32)
tf.Tensor(22.534918, shape=(), dtype=float32)
tf.Tensor(21.274326, shape=(), dtype=float32)
tf.Tensor(20.300333, shape=(), dtype=float32)
tf.Tensor(19.521292, shape=(), dtype=float32)
tf.Tensor(18.88015, shape=(), dtype=float32)
tf.Tensor(18.340311, shape=(), dtype=float32)
tf.Tensor(17.877214, shape=(), dtype=float32)
tf.Tensor(17.473879, shape=(), dtype=float32)
tf.Tensor(17.118221, shape=(), dtype=float32)
tf.Tensor(16.801067, shape=(), dtype=float32)
tf.Tensor(16.515059, shape=(), dtype=float32)
tf.Tensor(16.254675, shape=(), dtype=float32)
tf.Tensor(16.01548, shape=(), dtype=float32)
tf.Tensor(15.794283, shape=(), dtype=float32)
tf.Tensor(15.58833, shape=(), dtype=float32)
tf.Tensor(15.395755, shape=(), dtype=float32)
tf.Tensor(15.214806, shape=(), dtype=float32)
tf.Tensor(15.044276, shape=(), dtype=float32)
tf.Tensor(14.882601, shape=(), dtype=float32)
tf.Tensor(14.7288685, shape=(), dtype=float32)
tf.Tensor(14.582401, shape=(), dtype=float32)
tf.Tensor(14.442043, shape=(), dtype=float32)
tf.Tensor(14.307769, shape=(), dtype=float32)
tf.Tensor(14.178376, shape=(), dtype=float32)
...
tf.Tensor(1.007096, shape=(), dtype=float32)
tf.Tensor(1.004746, shape=(), dtype=float32)
tf.Tensor(1.003016, shape=(), dtype=float32)
tf.Tensor(1.0012707, shape=(), dtype=float32)
tf.Tensor(0.99927545, shape=(), dtype=float32)
Checking the sparsity of the regression coefficients
plt.figure(figsize=(10, 5))
plt.bar(np.arange(p), model.weights[0].numpy()[:, 0])
The first four regression coefficients are very close to the true values 1, 2, 3, and -4, while the remaining coefficients have shrunk to nearly zero.
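One rough way to quantify this (a minimal sketch; the 0.1 cutoff below is an arbitrary assumption, not part of the experiment):
w_hat = model.weights[0].numpy()[:, 0]
print(np.where(np.abs(w_hat) > 0.1)[0])              # indices of the selected variables
print(np.sum(np.abs(w_hat) <= 0.1), 'coefficients are (near) zero')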
Neural Network with MCP
custom MCP layer
We define a custom layer that computes the MCP penalty.
class MCP(layers.Layer):
    def __init__(self, lambda_, r):
        super(MCP, self).__init__()
        self.lambda_ = lambda_
        self.r = r

    def call(self, weight):
        penalty1 = self.lambda_ * tf.abs(weight) - tf.math.square(weight) / (2. * self.r)
        penalty2 = tf.math.square(self.lambda_) * self.r / 2.
        return tf.reduce_sum(penalty1 * tf.cast(tf.abs(weight) <= self.r * self.lambda_, tf.float32) +
                             penalty2 * tf.cast(tf.abs(weight) > self.r * self.lambda_, tf.float32))
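Note that this class shadows the MCP function defined earlier; calling it on a weight tensor returns the same scalar penalty. A minimal usage sketch (the tensor values are made up for illustration):
mcp_example = MCP(10., 1.)
print(mcp_example(tf.constant([0.5, 20.0])))  # 10*0.5 - 0.5**2/2 = 4.875 plus 10**2*1/2 = 50 -> 54.875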
Let's reuse the custom layer that was defined in the earlier post on building a fully connected layer with repeated weights!
class CustomLayer(layers.Layer):
    def __init__(self, output_dim, **kwargs):
        super(CustomLayer, self).__init__(**kwargs)
        self.output_dim = output_dim

    def build(self, input_shape):  # the previous layer's output does not need to be passed to __init__
        self.w = self.add_weight(shape=(input_shape[-1], 1),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(),
                                 initializer="random_normal",
                                 trainable=True)

    def call(self, x):
        w_repeated = tf.repeat(self.w, self.output_dim, axis=-1)
        b_repeated = tf.repeat(self.b, self.output_dim)
        h = tf.matmul(x, w_repeated) + b_repeated  # h = xW + b
        # h = tf.nn.relu(h)  # nonlinear activation
        return h
experiment setting
p = 30
n = 10000
output_dim = 2

# MCP hyperparameters
lambda_ = 10.
r = 1.

beta = np.random.uniform(low=-2., high=2., size=(p, output_dim))
X = np.random.normal(size=(n, p))
y = X @ beta + np.random.normal(size=(n, output_dim))
Model without the penalty
input_layer = layers.Input((p,))
dense1 = layers.Dense(10, activation='linear')
h = dense1(input_layer)
custom_layer = CustomLayer(output_dim)
outputs = custom_layer(h)
model = K.models.Model(input_layer, outputs)
model.summary()

optimizer = K.optimizers.SGD(0.01)
for i in range(100):
    with tf.GradientTape() as tape:
        yhat = model(X)
        loss = tf.reduce_mean(tf.losses.mean_squared_error(y, yhat))
        loss += sum(model.losses)  # no penalty has been registered, so this adds nothing
    grad = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grad, model.trainable_weights))
    if i % 10:
        print(i, loss)

nopenalty = custom_layer.weights[0]
1 tf.Tensor(113.83301, shape=(), dtype=float32)
2 tf.Tensor(110.04178, shape=(), dtype=float32)
3 tf.Tensor(104.79753, shape=(), dtype=float32)
4 tf.Tensor(96.88495, shape=(), dtype=float32)
5 tf.Tensor(85.199524, shape=(), dtype=float32)
6 tf.Tensor(69.447716, shape=(), dtype=float32)
7 tf.Tensor(51.3426, shape=(), dtype=float32)
8 tf.Tensor(34.696712, shape=(), dtype=float32)
9 tf.Tensor(22.660175, shape=(), dtype=float32)
11 tf.Tensor(10.600689, shape=(), dtype=float32)
12 tf.Tensor(7.650837, shape=(), dtype=float32)
13 tf.Tensor(5.674828, shape=(), dtype=float32)
14 tf.Tensor(4.306686, shape=(), dtype=float32)
15 tf.Tensor(3.332453, shape=(), dtype=float32)
16 tf.Tensor(2.622191, shape=(), dtype=float32)
17 tf.Tensor(2.0938485, shape=(), dtype=float32)
18 tf.Tensor(1.6937089, shape=(), dtype=float32)
19 tf.Tensor(1.3855927, shape=(), dtype=float32)
21 tf.Tensor(0.9534905, shape=(), dtype=float32)
22 tf.Tensor(0.79988265, shape=(), dtype=float32)
23 tf.Tensor(0.6750202, shape=(), dtype=float32)
24 tf.Tensor(0.5725103, shape=(), dtype=float32)
25 tf.Tensor(0.48763466, shape=(), dtype=float32)
26 tf.Tensor(0.41685584, shape=(), dtype=float32)
27 tf.Tensor(0.35747638, shape=(), dtype=float32)
...
95 tf.Tensor(0.00026946433, shape=(), dtype=float32)
96 tf.Tensor(0.00024825905, shape=(), dtype=float32)
97 tf.Tensor(0.00022875278, shape=(), dtype=float32)
98 tf.Tensor(0.00021083745, shape=(), dtype=float32)
99 tf.Tensor(0.00019436897, shape=(), dtype=float32)
Model with the penalty
input_layer = layers.Input((p,))
dense1 = layers.Dense(10, activation='linear')
h = dense1(input_layer)
custom_layer = CustomLayer(output_dim)
outputs = custom_layer(h)
model = K.models.Model(input_layer, outputs)
model.summary()

# add the MCP penalty; the zero-argument lambda lets Keras re-evaluate it with the current weights
mcp = MCP(lambda_, r)
model.add_loss(lambda: mcp(custom_layer.weights[0]))

optimizer = K.optimizers.SGD(0.01)
for i in range(100):
    with tf.GradientTape() as tape:
        yhat = model(X)
        loss = tf.reduce_mean(tf.losses.mean_squared_error(y, yhat))
        loss += sum(model.losses)  # MCP penalty on the custom layer's weights
    grad = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grad, model.trainable_weights))
    if i % 10:
        print(i, loss)

withpenalty = custom_layer.weights[0]
1 tf.Tensor(116.90747, shape=(), dtype=float32)
2 tf.Tensor(115.836555, shape=(), dtype=float32)
3 tf.Tensor(113.85548, shape=(), dtype=float32)
4 tf.Tensor(110.767944, shape=(), dtype=float32)
5 tf.Tensor(106.348946, shape=(), dtype=float32)
6 tf.Tensor(100.15064, shape=(), dtype=float32)
7 tf.Tensor(90.49426, shape=(), dtype=float32)
8 tf.Tensor(77.75273, shape=(), dtype=float32)
9 tf.Tensor(63.680996, shape=(), dtype=float32)
11 tf.Tensor(40.37275, shape=(), dtype=float32)
12 tf.Tensor(33.966606, shape=(), dtype=float32)
13 tf.Tensor(29.702038, shape=(), dtype=float32)
14 tf.Tensor(26.547482, shape=(), dtype=float32)
15 tf.Tensor(24.385715, shape=(), dtype=float32)
16 tf.Tensor(22.927456, shape=(), dtype=float32)
17 tf.Tensor(21.26757, shape=(), dtype=float32)
18 tf.Tensor(20.356861, shape=(), dtype=float32)
19 tf.Tensor(19.428196, shape=(), dtype=float32)
21 tf.Tensor(18.216291, shape=(), dtype=float32)
22 tf.Tensor(17.15894, shape=(), dtype=float32)
23 tf.Tensor(17.279783, shape=(), dtype=float32)
24 tf.Tensor(16.044693, shape=(), dtype=float32)
25 tf.Tensor(16.963415, shape=(), dtype=float32)
26 tf.Tensor(15.528401, shape=(), dtype=float32)
27 tf.Tensor(16.40659, shape=(), dtype=float32)
...
95 tf.Tensor(11.041722, shape=(), dtype=float32)
96 tf.Tensor(10.999712, shape=(), dtype=float32)
97 tf.Tensor(11.1277485, shape=(), dtype=float32)
98 tf.Tensor(10.787752, shape=(), dtype=float32)
99 tf.Tensor(11.136098, shape=(), dtype=float32)
Comparing the custom layer weights
plt.figure(figsize=(10, 5))
plt.bar(np.arange(10), nopenalty.numpy()[:, 0], label='original')
plt.bar(np.arange(10), withpenalty.numpy()[:, 0], label='MCP')
plt.legend()
The custom layer weights of the model trained with the MCP penalty are clearly much sparser than those of the model trained without a penalty.
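As in the regression example, this can be quantified by counting near-zero weights (a minimal sketch; the 0.1 cutoff is again an arbitrary assumption):
print(np.sum(np.abs(nopenalty.numpy()) <= 0.1), 'near-zero weights without penalty')
print(np.sum(np.abs(withpenalty.numpy()) <= 0.1), 'near-zero weights with MCP')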