Original source code link: https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

Tensor

# -*- coding: utf-8 -*-

import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    # In the forward pass, compute the matrix product h = x.mm(w1); keep the tensor shapes in mind.
    h = x.mm(w1)
    # Once h is computed, apply ReLU with .clamp(min=0) (not sigmoid).
    h_relu = h.clamp(min=0)
    # y = h_relu * w2
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    # Mean squared error (MSE) is used as the loss.
    # .item() extracts a plain Python number from the tensor.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    # This is the backward pass implemented by hand, without autograd.
    # The factor 2.0 comes from the partial derivative of the squared-error loss.
    # Because this network is simple we can handle the backprop matrices directly,
    # but once the network gets more complex you will feel the need for autograd,
    # one of PyTorch's most powerful features.
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    # Update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
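
As a quick sanity check (my own addition, not part of the original tutorial), the hand-written formulas above can be compared against autograd on a tiny network. The factor 2.0 is just the derivative of the squared-error loss: d/dŷ Σ(ŷ − y)² = 2(ŷ − y).

# Minimal sketch: compare the manual gradients with autograd on a small example.
import torch

torch.manual_seed(0)
x = torch.randn(4, 3)
y = torch.randn(4, 2)
w1 = torch.randn(3, 5, requires_grad=True)
w2 = torch.randn(5, 2, requires_grad=True)

h = x.mm(w1)
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w2)
loss = (y_pred - y).pow(2).sum()
loss.backward()

with torch.no_grad():
    grad_y_pred = 2.0 * (y_pred - y)      # same formulas as in the loop above
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h = grad_y_pred.mm(w2.t())
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

print(torch.allclose(grad_w1, w1.grad))   # True
print(torch.allclose(grad_w2, w2.grad))   # True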

Autograd

# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor holding a single value;
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    # This is the difference from the code above: autograd computes the gradients for us.
    
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
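
A small side note (my addition): the reason the gradients have to be zeroed manually is that backward() accumulates into .grad instead of overwriting it.

# Minimal sketch: .grad accumulates across backward() calls.
import torch

w = torch.tensor(3.0, requires_grad=True)
(w * 2).backward()
print(w.grad)        # tensor(2.)
(w * 2).backward()
print(w.grad)        # tensor(4.)  <- accumulated, not replaced
w.grad.zero_()
(w * 2).backward()
print(w.grad)        # tensor(2.)  after zeroing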

Custom Autograd Function

# -*- coding: utf-8 -*-
import torch

class MyReLU(torch.autograd.Function):
    """
 We can implement our own custom autograd Functions by subclassing
 torch.autograd.Function and implementing the forward and backward passes
 which operate on Tensors.
 """

    @staticmethod
    def forward(ctx, input):
        """
 In the forward pass we receive a Tensor containing the input and return
 a Tensor containing the output. ctx is a context object that can be used
 to stash information for backward computation. You can cache arbitrary
 objects for use in the backward pass using the ctx.save_for_backward method.
 """
		# .save_for_backward는 역전파에서 사용할 어떤 값들을 위한 저장 메소드라고 생각하면 된다.
		# 입력에 대한 relu 연산을 한 후 반환해준다.
		# 순전파의 목적은 입력 텐서를 relu를 연산한 출력 텐서로 반환해주는 것이다.
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
 In the backward pass we receive a Tensor containing the gradient of the loss
 with respect to the output, and we need to compute the gradient of the loss
 with respect to the input.
 """
		# save_for_backward에서 저장했던 값들을 이용하여 입력 변화도를 계산한다.
		# 역전파의 목적은 출력 변화도를 통해 입력 변화도를 계산하는 것이다. 
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
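
A check worth knowing about (my addition): torch.autograd.gradcheck numerically verifies that a custom Function's backward matches its forward. Double-precision inputs are recommended for the comparison.

# Minimal sketch: verify MyReLU's backward with gradcheck.
inp = torch.randn(8, 5, dtype=torch.double, requires_grad=True)
print(torch.autograd.gradcheck(MyReLU.apply, (inp,)))  # True (random inputs rarely land exactly on the kink at 0)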

nn Module

# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.

# nn.Sequential applies its submodules in order; each Linear module maps an input tensor
# to an output tensor using its own weight and bias.
# 1. Linear: D_in -> H
# 2. ReLU: H -> H_relu
# 3. Linear: H_relu -> D_out

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
# Instead of computing the loss by hand, use the MSELoss module from the nn package.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
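
As an aside (my addition), the parameters updated in the loop above can be inspected directly; their shapes follow the (D_in, H, D_out) sizes, with nn.Linear storing its weight as (out_features, in_features).

# Minimal sketch: list the learnable parameters of the Sequential model.
for name, param in model.named_parameters():
    print(name, tuple(param.shape))
# 0.weight (100, 1000)
# 0.bias (100,)
# 2.weight (10, 100)
# 2.bias (10,)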

Optim

# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
# Adam is chosen as the optimizer. model.parameters() holds the model's learnable
# parameters (such as the weights), so the optimizer knows which tensors to update.
 
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check out the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    # Step the optimizer: since it holds references to the parameters, this updates them.
    
    optimizer.step()
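
One thing worth noting (my addition): switching optimizers only changes the constructor line; the zero_grad() / backward() / step() loop stays exactly the same.

# Minimal sketch: alternative optimizers from torch.optim.
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
# optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4)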

Custom nn Module

# -*- coding: utf-8 -*-
import torch

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
 In the constructor we instantiate two nn.Linear modules and assign them as
 member variables.
 """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
 In the forward function we accept a Tensor of input data and we must return
 a Tensor of output data. We can use Modules defined in the constructor as
 well as arbitrary operators on Tensors.
 """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.

# MSELoss is used for the loss and SGD for the optimizer.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
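
A quick note (my addition): because TwoLayerNet subclasses nn.Module, it can be inspected and used exactly like the nn.Sequential model from earlier.

# Minimal sketch: the custom module prints its submodules and maps (N, D_in) -> (N, D_out).
print(model)                               # TwoLayerNet with linear1 and linear2
print(model(torch.randn(1, D_in)).shape)   # torch.Size([1, 10])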

NumPy and Tensors each have their pros and cons, but the biggest reason to use Tensors is that they can run on a GPU. After running the code, packages such as NumPy and Pandas are well suited to pre- and post-processing the data and extracting what we need, but type conversions between them can raise errors, so care is always needed. There are many frameworks that work with tensors; based on what has been covered so far, the biggest difference from TensorFlow is probably autograd. Learning the backpropagation algorithm while trying to understand the vanishing gradient problem was genuinely difficult. Backprop is really just a chain-rule computation of partial derivatives, but I doubted whether I could implement it flawlessly for a complex network. I am not yet sure how to get the most out of autograd, but it is clearly a very powerful feature.
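
As a concrete illustration of the conversion pitfalls mentioned above (my addition): NumPy defaults to float64 while most PyTorch models expect float32, and GPU tensors must be moved back to the CPU before calling .numpy().

# Minimal sketch: converting between NumPy arrays and Tensors.
import numpy as np
import torch

a = np.random.randn(3, 4)              # float64 by default
t = torch.from_numpy(a)                # shares memory with a, dtype=torch.float64
t32 = t.float()                        # explicit cast to float32 before feeding a model

if torch.cuda.is_available():          # move to the GPU only if one is present
    t32 = t32.to("cuda")
back = t32.cpu().numpy()               # GPU tensors must come back to the CPU before .numpy()
print(t.dtype, t32.dtype, back.dtype)  # torch.float64 torch.float32 float32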