소스코드 원본 링크 : https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
Tensor
# -*- coding: utf-8 -*-
import torch
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)
learning_rate = 1e-6
for t in range(500):
# Forward pass: compute predicted y
# 순전파에서 h1 = x * w1 행렬식을 계산해준다. 사이즈를 잘 고려한다.
h = x.mm(w1)
# h 가 계산되면 .clamp(min=0) 를 통해 Relu 를 적용시켜준다. (not sigmoid)
h_relu = h.clamp(min=0)
# y = h_relu * w2
y_pred = h_relu.mm(w2)
# Compute and print loss
# 평균제곱오차(MSE)를 사용했다.
# .item()을 통해 텐서에서 일반 자료형의 숫자를 뽑아내준다.
loss = (y_pred - y).pow(2).sum().item()
print(t, loss)
# Backprop to compute gradients of w1 and w2 with respect to loss
# autograd를 사용하지 않고 직접 구현한 역전파 계산 과정이다.
# 앞에 2.0이 곱해지는 이유는 평균제곱오차에서의 편미분계산 때문이다.
# 지금 이 신경망은 복잡하지않기 때문에 신경망 역전파 행렬을 직접 다뤘지만 신경망이
# 복잡해지면 분명히 Pytorch의 강력한 기능인 autograd의 필요성을 느끼게 될 것이다.
grad_y_pred = 2.0 * (y_pred - y)
grad_w2 = h_relu.t().mm(grad_y_pred)
grad_h_relu = grad_y_pred.mm(w2.t())
grad_h = grad_h_relu.clone()
grad_h[h < 0] = 0
grad_w1 = x.t().mm(grad_h)
# Update weights using gradient descent
# 가중치 업그레이드
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2
Autograd
# -*- coding: utf-8 -*-
import torch
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6
for t in range(500):
# Forward pass: compute predicted y using operations on Tensors; these
# are exactly the same operations we used to compute the forward pass using
# Tensors, but we do not need to keep references to intermediate values since
# we are not implementing the backward pass by hand.
y_pred = x.mm(w1).clamp(min=0).mm(w2)
# Compute and print loss using operations on Tensors.
# Now loss is a Tensor of shape (1,)
# loss.item() gets the a scalar value held in the loss.
loss = (y_pred - y).pow(2).sum()
print(t, loss.item())
# Use autograd to compute the backward pass. This call will compute the
# gradient of loss with respect to all Tensors with requires_grad=True.
# After this call w1.grad and w2.grad will be Tensors holding the gradient
# of the loss with respect to w1 and w2 respectively.
# 위의 코드와의 차이점이다.autograd를 이용하여 gradient 값을 쉽게 얻을 수 있다.
loss.backward()
# Manually update weights using gradient descent. Wrap in torch.no_grad()
# because weights have requires_grad=True, but we don't need to track this
# in autograd.
# An alternative way is to operate on weight.data and weight.grad.data.
# Recall that tensor.data gives a tensor that shares the storage with
# tensor, but doesn't track history.
# You can also use torch.optim.SGD to achieve this.
with torch.no_grad():
w1 -= learning_rate * w1.grad
w2 -= learning_rate * w2.grad
# Manually zero the gradients after updating weights
w1.grad.zero_()
w2.grad.zero_()
사용자 정의 Autograd
# -*- coding: utf-8 -*-
import torch
class MyReLU(torch.autograd.Function):
"""
We can implement our own custom autograd Functions by subclassing
torch.autograd.Function and implementing the forward and backward passes
which operate on Tensors.
"""
@staticmethod
def forward(ctx, input):
"""
In the forward pass we receive a Tensor containing the input and return
a Tensor containing the output. ctx is a context object that can be used
to stash information for backward computation. You can cache arbitrary
objects for use in the backward pass using the ctx.save_for_backward method.
"""
# .save_for_backward는 역전파에서 사용할 어떤 값들을 위한 저장 메소드라고 생각하면 된다.
# 입력에 대한 relu 연산을 한 후 반환해준다.
# 순전파의 목적은 입력 텐서를 relu를 연산한 출력 텐서로 반환해주는 것이다.
ctx.save_for_backward(input)
return input.clamp(min=0)
@staticmethod
def backward(ctx, grad_output):
"""
In the backward pass we receive a Tensor containing the gradient of the loss
with respect to the output, and we need to compute the gradient of the loss
with respect to the input.
"""
# save_for_backward에서 저장했던 값들을 이용하여 입력 변화도를 계산한다.
# 역전파의 목적은 출력 변화도를 통해 입력 변화도를 계산하는 것이다.
input, = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input[input < 0] = 0
return grad_input
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6
for t in range(500):
# To apply our Function, we use Function.apply method. We alias this as 'relu'.
relu = MyReLU.apply
# Forward pass: compute predicted y using operations; we compute
# ReLU using our custom autograd operation.
y_pred = relu(x.mm(w1)).mm(w2)
# Compute and print loss
loss = (y_pred - y).pow(2).sum()
print(t, loss.item())
# Use autograd to compute the backward pass.
loss.backward()
# Update weights using gradient descent
with torch.no_grad():
w1 -= learning_rate * w1.grad
w2 -= learning_rate * w2.grad
# Manually zero the gradients after updating weights
w1.grad.zero_()
w2.grad.zero_()
NN Module
# -*- coding: utf-8 -*-
import torch
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# nn.Sequantial 을 사용하여 모델이 순차적인 절차를 밝도록 도와준다. Linear 모듈은 입출력 텐서를 다룬다.
# 1. D_in에 대한 출력 H
# 2. H의 relu --> H_relu
# 3. H_relu에 대한 출력 D_out
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
# 손실을 직접 계산하지 않고 nn 패키지에 있는 MSELoss 모듈을 사용한다.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
for t in range(500):
# Forward pass: compute predicted y by passing x to the model. Module objects
# override the __call__ operator so you can call them like functions. When
# doing so you pass a Tensor of input data to the Module and it produces
# a Tensor of output data.
y_pred = model(x)
# Compute and print loss. We pass Tensors containing the predicted and true
# values of y, and the loss function returns a Tensor containing the
# loss.
loss = loss_fn(y_pred, y)
print(t, loss.item())
# Zero the gradients before running the backward pass.
model.zero_grad()
# Backward pass: compute gradient of the loss with respect to all the learnable
# parameters of the model. Internally, the parameters of each Module are stored
# in Tensors with requires_grad=True, so this call will compute gradients for
# all learnable parameters in the model.
loss.backward()
# Update the weights using gradient descent. Each parameter is a Tensor, so
# we can access its gradients like we did before.
with torch.no_grad():
for param in model.parameters():
param -= learning_rate * param.grad
Optim
# -*- coding: utf-8 -*-
import torch
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
# optimizer로 Adam을 선택했다. model.parameters()는 모델의 가중치와 같은 파라미터들에 대한 정보를 가지고 있다.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
# Forward pass: compute predicted y by passing x to the model.
y_pred = model(x)
# Compute and print loss.
loss = loss_fn(y_pred, y)
print(t, loss.item())
# Before the backward pass, use the optimizer object to zero all of the
# gradients for the variables it will update (which are the learnable
# weights of the model). This is because by default, gradients are
# accumulated in buffers( i.e, not overwritten) whenever .backward()
# is called. Checkout docs of torch.autograd.backward for more details.
optimizer.zero_grad()
# Backward pass: compute gradient of the loss with respect to model
# parameters
loss.backward()
# Calling the step function on an Optimizer makes an update to its
# parameters
# optimizer를 업데이트 시켜준다. ( 파라미터에 대한 정보를 갖고 있으므로 파라미터를 업데이트 시켜준다.)
optimizer.step()
사용자 정의 nn 모듈
# -*- coding: utf-8 -*-
import torch
class TwoLayerNet(torch.nn.Module):
def __init__(self, D_in, H, D_out):
"""
In the constructor we instantiate two nn.Linear modules and assign them as
member variables.
"""
super(TwoLayerNet, self).__init__()
self.linear1 = torch.nn.Linear(D_in, H)
self.linear2 = torch.nn.Linear(H, D_out)
def forward(self, x):
"""
In the forward function we accept a Tensor of input data and we must return
a Tensor of output data. We can use Modules defined in the constructor as
well as arbitrary operators on Tensors.
"""
h_relu = self.linear1(x).clamp(min=0)
y_pred = self.linear2(h_relu)
return y_pred
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# 손실 계산을 MSELoss를 사용하고 optimizer는 SGD를 사용했다.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
# Forward pass: Compute predicted y by passing x to the model
y_pred = model(x)
# Compute and print loss
loss = criterion(y_pred, y)
print(t, loss.item())
# Zero gradients, perform a backward pass, and update the weights.
optimizer.zero_grad()
loss.backward()
optimizer.step()
Numpy와 Tensor가 갖고 있는 장단점이 있겠지만 Tensor를 사용하는 가장 큰 이유는 GPU를 활용할 수 있기 때문이다. 코드 실행 후 데이터 선처리 및 후처리에서 Numpy, Pandas 등의 다양하고 적절한 패키지와 모듈을 사용하면 우리가 원하는 데이터를 뽑아낼 수 있다. 하지만 형변환의 과정에서 오류가 발생할 수 있기 때문에 항상 주의하여야 한다. Tensor를 다루는 프레임워크는 많을것이다. 지금까지 다뤄본 내용에 의하면 tensorflow와 가장 큰 차이점은 아마 autograd가 아닐까 생각한다. vanishing gradient 문제를 해결하기 위해 역전파 알고리즘에 대해 배울때 정말 어려웠다. 역전파도 사실 chain rule을 이용한 가단한 편미분 계산이지만 복잡한 신경망을 구성할 때 이것을 완벽하게 구성할 수 있을까라는 의구심이 들었는데 autograd를 어떻게 하면 잘 사용할 수 있을지는 모르겠지만 매우 강력한 기능인것만은 확실하다.