# Source code for deepod.models.tabular.repen

# -*- coding: utf-8 -*-
"""
Representation learning-based unsupervised/weakly-supervised anomaly detection.
PyTorch implementation.
This script is partially adapted from the official Keras implementation:
https://github.com/GuansongPang/deep-outlier-detection
@Author: Hongzuo Xu <hongzuoxu@126.com, xuhongzuo13@nudt.edu.cn>
"""

from deepod.core.base_model import BaseDeepAD
from deepod.core.networks.base_networks import MLPnet
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
from sklearn.utils.random import sample_without_replacement
from sklearn.neighbors import KDTree
import numpy as np


class REPEN(BaseDeepAD):
    """
    Learning Representations of Ultrahigh-dimensional Data for Random
    Distance-based Outlier Detection (KDD'18)
    :cite:`pang2018repen`

    An MLP is trained with a triplet hinge loss so that, in the learned
    representation, an inlier is closer to another inlier than to an
    outlier candidate. Outlier candidates are chosen from initial scores
    produced by ``repen_init_score_calculator``; the same scorer is applied
    to the learned representations to obtain the final anomaly scores.

    Args:
        epochs, batch_size, lr, epoch_steps, prt_steps, device, verbose,
        random_state:
            Forwarded unchanged to ``BaseDeepAD.__init__``. ``batch_size``
            is also used for the triplet and inference loaders, ``device``
            is where the network and batches are placed, and the network is
            printed when ``verbose >= 2``.
        init_score_ensemble_size (int):
            ``ensemble_size`` passed to ``repen_init_score_calculator``.
        init_score_subsample_size (int):
            ``subsample_size`` passed to ``repen_init_score_calculator``.
        rep_dim (int):
            Output dimensionality of the MLP (``n_output``).
        hidden_dims:
            Hidden layer specification passed to ``MLPnet`` as ``n_hidden``.
        act:
            Activation passed to ``MLPnet`` as ``activation``.
        bias (bool):
            ``bias`` flag passed to ``MLPnet``.
    """
    def __init__(self, epochs=100, batch_size=64, lr=1e-3,
                 init_score_ensemble_size=50, init_score_subsample_size=8,
                 rep_dim=128, hidden_dims='100,50', act='LeakyReLU', bias=False,
                 epoch_steps=-1, prt_steps=10, device='cuda',
                 verbose=2, random_state=42):
        super().__init__(
            model_name='REPEN', data_type='tabular', epochs=epochs, batch_size=batch_size, lr=lr,
            network='MLP',
            epoch_steps=epoch_steps, prt_steps=prt_steps, device=device,
            verbose=verbose, random_state=random_state
        )

        self.init_score_ensemble_size = init_score_ensemble_size
        self.init_score_subsample_size = init_score_subsample_size
        self.rep_dim = rep_dim
        self.hidden_dims = hidden_dims
        self.act = act
        self.bias = bias

    def training_prepare(self, X, y):
        """
        Build the triplet loader, the representation network and the loss.

        Args:
            X: Training data; it is scored by ``repen_init_score_calculator``
                and sampled by ``REPENLoader``.
            y: Unused by this method.

        Returns:
            tuple: ``(train_loader, net, criterion)``.
        """
        network_params = {
            'n_features': self.n_features,
            'n_hidden': self.hidden_dims,
            'n_output': self.rep_dim,
            'activation': self.act,
            'bias': self.bias
        }
        net = MLPnet(**network_params).to(self.device)

        # initial outlier scores decide which samples act as inliers/outliers
        # when triplets are drawn
        init_scores = repen_init_score_calculator(x_train=X,
                                                  ensemble_size=self.init_score_ensemble_size,
                                                  subsample_size=self.init_score_subsample_size)
        train_loader = REPENLoader(X, batch_size=self.batch_size, init_scores=init_scores)
        criterion = REPENLoss()

        if self.verbose >= 2:
            print(net)

        return train_loader, net, criterion

    def inference_prepare(self, X):
        """
        Wrap ``X`` in an ordered, non-dropping ``DataLoader`` and switch the
        criterion to per-sample (``'none'``) reduction.

        Returns:
            DataLoader: Loader over ``X`` in its original order.
        """
        test_loader = DataLoader(X, batch_size=self.batch_size,
                                 drop_last=False, shuffle=False)
        # self.criterion is not assigned in this class; presumably set by
        # BaseDeepAD during fitting -- confirm against the base class
        self.criterion.reduction = 'none'
        return test_loader

    def training_forward(self, batch_x, net, criterion):
        """
        Embed one ``(example, positive, negative)`` triplet batch and return
        the criterion's loss on the embeddings.
        """
        exp, pos, neg = (part.float().to(self.device) for part in batch_x)
        exp, pos, neg = net(exp), net(pos), net(neg)
        loss = criterion(exp, pos, neg)
        return loss

    def inference_forward(self, batch_x, net, criterion):
        """
        Embed one inference batch.

        Returns:
            tuple: ``(batch_z, s)`` where ``batch_z`` is the representation
            and ``s`` is an all-zero placeholder with one entry per sample;
            the actual scores are computed in ``decision_function_update``.
        """
        batch_x = batch_x.float().to(self.device)
        batch_z = net(batch_x)
        s = torch.zeros(batch_z.shape[0])  # for consistency
        return batch_z, s

    def decision_function_update(self, z, scores):
        """
        Replace the placeholder ``scores`` with distance-based scores computed
        on the learned representations ``z``.

        Returns:
            tuple: ``(z, scores)`` with ``scores`` flattened to one dimension.
        """
        scores = repen_init_score_calculator(z,
                                             ensemble_size=self.init_score_ensemble_size,
                                             subsample_size=self.init_score_subsample_size).flatten()
        return z, scores


class REPENLoader:
    """
    Triplets loader

    Iterating yields ``steps_per_epoch`` batches of
    ``(examples, positives, negatives)`` tensors. Examples and positives are
    rows of ``X`` indexed by ``inlier_ids``; negatives are rows indexed by
    ``outlier_ids`` (or rows of ``prior_knowledge`` when it is supplied to
    ``triplet_batch_generation``).

    Args:
        X (np.ndarray): 2-D data array the triplets are drawn from.
        batch_size (int): Requested batch size; capped at ``len(X)``.
        init_scores (np.ndarray): One outlier score per row of ``X``
            (1-D or a single column).
        steps_per_epoch (int, optional): Batches per iteration pass.
            Defaults to ``int(len(X) / batch_size)``.
    """
    def __init__(self, X, batch_size, init_scores, steps_per_epoch=None):
        self.X = X
        self.batch_size = min(batch_size, len(X))
        self.init_scores = init_scores
        self.inlier_ids, self.outlier_ids = self.cutoff_unsorted(init_scores)
        self.steps_per_epoch = steps_per_epoch if steps_per_epoch is not None \
            else int(len(X) / self.batch_size)
        self.counter = 0

    def __iter__(self):
        # restart the step counter so the loader can be iterated every epoch
        self.counter = 0
        return self

    def __next__(self):
        # stop before sampling so no triplet batch is generated and discarded
        if self.counter >= self.steps_per_epoch:
            raise StopIteration
        self.counter += 1

        examples, positives, negatives = self.triplet_batch_generation()
        return torch.from_numpy(examples), \
            torch.from_numpy(positives), \
            torch.from_numpy(negatives)

    def triplet_batch_generation(self, prior_knowledge=None):
        """
        Sample one batch of triplets.

        Examples are drawn from the inliers with probability increasing as
        their initial score decreases; positives are drawn uniformly from the
        remaining inliers; negatives are drawn from the outlier candidates
        with probability proportional to their initial score. When
        ``prior_knowledge`` is given, every even-indexed negative is instead
        a uniformly drawn row of ``prior_knowledge``.

        Args:
            prior_knowledge (np.ndarray, optional): 2-D array of rows to use
                as negatives at even batch positions.

        Returns:
            tuple: ``(examples, positives, negatives)`` arrays, each with
            ``batch_size`` rows.

        Raises:
            ValueError: If fewer than two inliers are available, since a
                positive distinct from the example could never be drawn.
        """
        X = self.X
        inlier_ids = self.inlier_ids
        outlier_ids = self.outlier_ids
        batch_size = self.batch_size

        n_inliers = len(inlier_ids)
        n_outliers = len(outlier_ids)
        if n_inliers < 2:
            raise ValueError(
                'at least two inliers are required to build triplets, got %d' % n_inliers
            )

        inlier_scores = np.ravel(self.init_scores[inlier_ids])
        outlier_scores = np.ravel(self.init_scores[outlier_ids])

        # lower initial score -> larger weight of being picked as an example
        transforms = np.sum(inlier_scores) - inlier_scores
        positive_weights = transforms / np.sum(transforms)
        negative_weights = outlier_scores / np.sum(outlier_scores)

        examples_ids = np.zeros(batch_size, dtype=int)
        positives_ids = np.zeros(batch_size, dtype=int)
        negatives_ids = np.zeros(batch_size, dtype=int)
        for i in range(batch_size):
            # scalar draws: storing size-1 arrays into scalar slots is
            # deprecated in recent NumPy releases
            sid = np.random.choice(n_inliers, p=positive_weights)
            examples_ids[i] = inlier_ids[sid]

            # the positive must be a different inlier than the example
            sid2 = np.random.choice(n_inliers)
            while sid2 == sid:
                sid2 = np.random.choice(n_inliers)
            positives_ids[i] = inlier_ids[sid2]

            if prior_knowledge is not None and i % 2 == 0:
                # index into prior_knowledge, not into X
                negatives_ids[i] = np.random.choice(prior_knowledge.shape[0])
            else:
                oid = np.random.choice(n_outliers, p=negative_weights)
                negatives_ids[i] = outlier_ids[oid]

        examples = X[examples_ids, :]
        positives = X[positives_ids, :]
        if prior_knowledge is not None:
            negatives = np.zeros([batch_size, X.shape[1]])
            negatives[1::2] = X[negatives_ids[1::2], :]
            negatives[::2] = prior_knowledge[negatives_ids[::2], :]
        else:
            negatives = X[negatives_ids, :]
        return examples, positives, negatives

    @staticmethod
    def cutoff_unsorted(values, th=1.7321):
        """
        Split sample indices into inliers and outlier candidates.

        The cut-off is ``mean + th * std`` of ``values``. If no value exceeds
        it, the cut-off falls back to the 11th largest value so that at most
        the top-10 scores are returned as outliers; with fewer than 11 values
        the smallest value is used as the cut-off.

        Args:
            values (np.ndarray): Scores, 1-D or a single column.
            th (float): Number of standard deviations above the mean.

        Returns:
            tuple: ``(inlier_ind, outlier_ind)`` index arrays along the first
            axis of ``values``.
        """
        values = np.asarray(values)
        th = np.mean(values) + th * np.std(values)  # 1.7321
        if th >= np.max(values):  # return the top-10 outlier scores
            # sort the flattened scores: np.sort on an (n, 1) column works
            # along the last axis and would leave the order untouched
            ranked = np.sort(values, axis=None)
            th = ranked[max(len(ranked) - 11, 0)]
        outlier_ind = np.where(values > th)[0]
        inlier_ind = np.where(values <= th)[0]
        return inlier_ind, outlier_ind


class REPENLoss(torch.nn.Module):
    """
    Triplet hinge loss on squared Euclidean distances.

    For each triplet the loss is
    ``relu(margin - (d(example, negative) - d(example, positive)))`` where
    ``d`` is the squared Euclidean distance summed over the last dimension.

    Args:
        reduction (str): ``'mean'``, ``'sum'`` or ``'none'``. The attribute
            is read on every forward pass, so it may be changed after
            construction.
        margin (float): Required gap between the negative and the positive
            distance. Defaults to ``1000.``.
    """
    def __init__(self, reduction='mean', margin=1000.):
        super().__init__()
        self.reduction = reduction
        self.margin = margin

    def forward(self, example, positive, negative):
        """
        Compute the loss for a batch of triplets.

        Returns:
            torch.Tensor: A scalar for ``'mean'``/``'sum'``, or one loss value
            per triplet for ``'none'``.

        Raises:
            ValueError: If ``self.reduction`` is not a supported mode.
        """
        positive_distances = torch.sum(torch.square(example - positive), dim=-1)
        negative_distances = torch.sum(torch.square(example - negative), dim=-1)
        loss = F.relu(self.margin - (negative_distances - positive_distances))

        reduction = self.reduction
        if reduction == 'mean':
            return torch.mean(loss)
        if reduction == 'sum':
            return torch.sum(loss)
        if reduction == 'none':
            return loss
        # fail loudly instead of silently returning None
        raise ValueError(
            "unsupported reduction %r; expected 'mean', 'sum' or 'none'" % (reduction,)
        )


def repen_init_score_calculator(x_train, ensemble_size=50, subsample_size=8):
    """
    the outlier scoring method, a bagging ensemble of Sp. See the following reference for detail.
    Pang, Guansong, Kai Ming Ting, and David Albrecht.
    "LeSiNN: Detecting anomalies by identifying least similar nearest neighbours."
    In ICDMW15. IEEE.

    Each ensemble member draws ``subsample_size`` rows of ``x_train`` without
    replacement and measures every row's Euclidean distance to its nearest
    neighbour in that subsample; the score is the average of these distances
    over all members.

    Args:
        x_train (np.ndarray): 2-D data, or 3-D data of which only the last
            step along the second axis is used.
        ensemble_size (int): Number of random subsamples.
        subsample_size (int): Rows per subsample; capped at the number of
            rows in ``x_train``.

    Returns:
        np.ndarray: Scores of shape ``(n_samples, 1)``.
    """
    # this is for sub-sequences derived from time-series data
    if len(x_train.shape) == 3:
        x_train = x_train[:, -1, :]

    n_samples = x_train.shape[0]
    # sampling without replacement cannot draw more rows than exist, which
    # would otherwise fail when scoring fewer rows than subsample_size
    subsample_size = min(subsample_size, n_samples)

    scores = np.zeros([n_samples, 1])

    ensemble_seeds = np.random.randint(0, np.iinfo(np.int32).max, ensemble_size)

    for seed in ensemble_seeds:
        rs = np.random.RandomState(seed)
        sid = sample_without_replacement(n_population=n_samples,
                                         n_samples=subsample_size, random_state=rs)
        subsample = x_train[sid]

        kdt = KDTree(subsample, metric='euclidean')
        dists, _ = kdt.query(x_train, k=1)
        scores += dists
    return scores / ensemble_size