Skip to content

uqregressors.utils.torch_sklearn_utils

torch_sklearn_utils

A collection of scikit-learn utility functions refactored to work with PyTorch tensors.

The key functions are
  • TorchStandardScaler (class)
  • TorchKFold (class)
  • train_test_split (function)

Warning

TorchKFold returns the indices of each K-Fold, while train_test_split returns the values in each split.

TorchKFold

A class meant to split the data into K-folds for conformalization or cross validation.

Parameters:

Name Type Description Default
n_splits int

The number of folds for data splitting.

5
shuffle bool

Whether to shuffle the data before splitting.

False
random_state int or None

Controls shuffling for reproducibility.

None
Source code in uqregressors\utils\torch_sklearn_utils.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
class TorchKFold:
    """
    Split data into K consecutive folds for conformalization or cross validation.

    Every fold serves once as the validation set while the remaining folds form
    the training set. The first ``n_samples % n_splits`` folds receive one extra
    sample, so each sample is validated exactly once.

    Args: 
        n_splits (int): The number of folds for data splitting. Must be at least 2.
        shuffle (bool): Whether to shuffle the data before splitting. 
        random_state (int or None): Controls shuffling for reproducibility.

    Raises:
        ValueError: If n_splits is smaller than 2.
    """
    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        if n_splits < 2:
            raise ValueError("n_splits must be at least 2.")
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X):
        """
        Yield train/validation indices for each fold.

        Args:
            X (torch.Tensor, np.ndarray, or list): Input data with shape (n_samples, ...)

        Yields:
            (tuple[torch.LongTensor, torch.LongTensor]): train_idx, val_idx; the indices of the training and validation sets for each of the splits. 

        Raises:
            ValueError: If n_splits is greater than the number of samples, which
                would otherwise produce empty validation folds.
        """
        if isinstance(X, torch.Tensor):
            n_samples = X.shape[0]
        else:
            n_samples = len(np.asarray(X))

        if self.n_splits > n_samples:
            raise ValueError(
                f"Cannot have number of splits n_splits={self.n_splits} greater "
                f"than the number of samples: n_samples={n_samples}."
            )

        # Explicit int64 so the yielded tensors are LongTensors on every platform.
        indices = np.arange(n_samples, dtype=np.int64)

        if self.shuffle:
            rng = np.random.default_rng(self.random_state)
            rng.shuffle(indices)

        # Spread the remainder over the leading folds, one extra sample each.
        fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=np.int64)
        fold_sizes[:n_samples % self.n_splits] += 1

        start = 0
        for fold_size in fold_sizes:
            stop = start + fold_size
            # Copy the slice: torch.from_numpy shares memory with its source, and
            # a caller editing val_idx in place must not corrupt later folds.
            val_idx = indices[start:stop].copy()
            train_idx = np.concatenate([indices[:start], indices[stop:]])
            start = stop

            yield torch.from_numpy(train_idx), torch.from_numpy(val_idx)

split(X)

Yield train/test indices for each fold.

Parameters:

Name Type Description Default
X torch.Tensor, np.ndarray, or list

Input data with shape (n_samples, ...)

required

Yields:

Type Description
tuple[LongTensor, LongTensor]

train_idx, val_idx; the indices of the training and validation sets for each of the splits.

Source code in uqregressors\utils\torch_sklearn_utils.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def split(self, X):
    """
    Yield train/validation indices for each fold.

    The first ``n_samples % n_splits`` folds receive one extra sample, so each
    sample appears in exactly one validation fold.

    Args:
        X (torch.Tensor, np.ndarray, or list): Input data with shape (n_samples, ...)

    Yields:
        (tuple[torch.LongTensor, torch.LongTensor]): train_idx, val_idx; the indices of the training and validation sets for each of the splits. 

    Raises:
        ValueError: If n_splits is greater than the number of samples, which
            would otherwise produce empty validation folds.
    """
    if isinstance(X, torch.Tensor):
        n_samples = X.shape[0]
    else:
        n_samples = len(np.asarray(X))

    if self.n_splits > n_samples:
        raise ValueError(
            f"Cannot have number of splits n_splits={self.n_splits} greater "
            f"than the number of samples: n_samples={n_samples}."
        )

    # Explicit int64 so the yielded tensors are LongTensors on every platform.
    indices = np.arange(n_samples, dtype=np.int64)

    if self.shuffle:
        rng = np.random.default_rng(self.random_state)
        rng.shuffle(indices)

    # Spread the remainder over the leading folds, one extra sample each.
    fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=np.int64)
    fold_sizes[:n_samples % self.n_splits] += 1

    start = 0
    for fold_size in fold_sizes:
        stop = start + fold_size
        # Copy the slice: torch.from_numpy shares memory with its source, and
        # a caller editing val_idx in place must not corrupt later folds.
        val_idx = indices[start:stop].copy()
        train_idx = np.concatenate([indices[:start], indices[stop:]])
        start = stop

        yield torch.from_numpy(train_idx), torch.from_numpy(val_idx)

TorchStandardScaler

Standardizes data to zero mean and unit variance.

Attributes:

Name Type Description
mean_ Tensor or None

The mean of the data, subtracted from the data during scaling.

std_ Tensor or None

The standard deviation of the data, by which the data is divided during scaling.

Source code in uqregressors\utils\torch_sklearn_utils.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class TorchStandardScaler:
    """
    Standardizes data by subtracting the mean and dividing by the standard deviation.

    Attributes: 
        mean_ (torch.Tensor or None): Mean over dim 0 computed by fit; None until fit is called. 
        std_ (torch.Tensor or None): Population standard deviation over dim 0 computed by fit,
            with values below 1e-8 replaced by 1.0; None until fit is called.
    """
    def __init__(self):
        # Both statistics stay unset until fit() is called.
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """
        Computes the scaling statistics from X and stores them on the scaler. 

        Args: 
            X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

        Returns: 
            (TorchStandardScaler): this scaler, with mean_ and std_ populated. 
        """
        self.mean_ = X.mean(dim=0, keepdim=True)
        raw_std = X.std(dim=0, unbiased=False, keepdim=True)
        # Near-constant columns get a unit divisor so transform never divides by ~0.
        self.std_ = raw_std.masked_fill(raw_std < 1e-8, 1.0)
        return self

    def transform(self, X):
        """
        Scales X with the statistics stored by fit. 

        Args: 
            X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

        Returns: 
            (torch.Tensor): X with mean_ subtracted and then divided by std_.
        """
        centered = X - self.mean_
        return centered / self.std_

    def fit_transform(self, X): 
        """
        Fits the scaler on X and returns the scaled X.

        Args: 
            X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

        Returns: 
            (torch.Tensor): The scaled data
        """
        self.fit(X)
        scaled = self.transform(X)
        return scaled

    def inverse_transform(self, X_scaled):
        """
        Maps scaled data back to the original scale. 

        Args: 
            X_scaled (torch.Tensor): scaled data of shape (n_samples, n_features).

        Returns: 
            (torch.Tensor): X_scaled multiplied by std_ with mean_ added back. 
        """
        rescaled = X_scaled * self.std_
        return rescaled + self.mean_

fit(X)

Fits the standard scaler.

Parameters:

Name Type Description Default
X Tensor

data to be scaled of shape (n_samples, n_features).

required

Returns:

Type Description
TorchStandardScaler

the scaler with updated mean_ and std_ attributes.

Source code in uqregressors\utils\torch_sklearn_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def fit(self, X):
    """
    Computes the scaling statistics from X and stores them on the scaler. 

    Args: 
        X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

    Returns: 
        (TorchStandardScaler): this scaler, with mean_ and std_ populated. 
    """
    self.mean_ = X.mean(dim=0, keepdim=True)
    raw_std = X.std(dim=0, unbiased=False, keepdim=True)
    # Near-constant columns get a unit divisor so later scaling never divides by ~0.
    self.std_ = raw_std.masked_fill(raw_std < 1e-8, 1.0)
    return self

fit_transform(X)

Performs the fit and transforms the data. A combination of the fit and transform methods.

Parameters:

Name Type Description Default
X Tensor

data to be scaled of shape (n_samples, n_features).

required

Returns:

Type Description
Tensor

The scaled data

Source code in uqregressors\utils\torch_sklearn_utils.py
58
59
60
61
62
63
64
65
66
67
68
69
def fit_transform(self, X): 
    """
    Fits the scaler on X and returns the scaled X.

    Args: 
        X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

    Returns: 
        (torch.Tensor): The scaled data
    """
    self.fit(X)
    scaled = self.transform(X)
    return scaled

inverse_transform(X_scaled)

Transforms scaled data back to the original scale.

Parameters:

Name Type Description Default
X_scaled Tensor

scaled data of shape (n_samples, n_features).

required

Returns:

Type Description
Tensor

The unscaled data.

Source code in uqregressors\utils\torch_sklearn_utils.py
71
72
73
74
75
76
77
78
79
80
81
def inverse_transform(self, X_scaled):
    """
    Maps scaled data back to the original scale. 

    Args: 
        X_scaled (torch.Tensor): scaled data of shape (n_samples, n_features).

    Returns: 
        (torch.Tensor): X_scaled multiplied by std_ with mean_ added back. 
    """
    rescaled = X_scaled * self.std_
    return rescaled + self.mean_

transform(X)

Scales the data using the mean and standard deviation obtained with the fit method.

Parameters:

Name Type Description Default
X Tensor

data to be scaled of shape (n_samples, n_features).

required

Returns:

Type Description
Tensor

The scaled data

Source code in uqregressors\utils\torch_sklearn_utils.py
46
47
48
49
50
51
52
53
54
55
56
def transform(self, X):
    """
    Scales X with the statistics stored by fit. 

    Args: 
        X (torch.Tensor): data to be scaled of shape (n_samples, n_features).

    Returns: 
        (torch.Tensor): X with mean_ subtracted and then divided by std_.
    """
    centered = X - self.mean_
    return centered / self.std_

train_test_split(X, y, test_size=0.2, device='cpu', random_state=None, shuffle=True)

Split arrays or tensors into training and test sets.

Parameters:

Name Type Description Default
X array - like or Tensor

Features to be split.

required
y array - like or Tensor

Targets to be split.

required
test_size float

Proportion of the dataset to include in the test split (between 0 and 1).

0.2
random_state int or None

Controls the shuffling for reproducibility.

None
shuffle bool

Whether or not to shuffle the data before splitting.

True

Returns:

Type Description
Tuple[ndarray, ndarray, ndarray, ndarray]

X_train, X_test, y_train, y_test; torch tensors when the inputs are tensors, NumPy arrays otherwise

Source code in uqregressors\utils\torch_sklearn_utils.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def train_test_split(X, y, test_size=0.2, device="cpu", random_state=None, shuffle=True):
    """
    Split arrays or tensors into training and test sets.

    X and y are split with the same indices, so sample i of X stays paired with
    sample i of y. Each output keeps the container type of its own input: a
    tensor input comes back as a tensor on ``device``, any other input comes
    back as a NumPy array.

    Args:
        X (array-like or torch.Tensor): Features to be split. 
        y (array-like or torch.Tensor): Targets to be split. 
        test_size (float): Proportion of the dataset to include in the test split (between 0 and 1).
            The test set receives int(n_samples * test_size) samples.
        device (str or torch.device): Device on which returned tensors are placed.
            Has no effect on outputs whose input was not a tensor.
        random_state (int or None): Controls the shuffling for reproducibility.
        shuffle (bool): Whether or not to shuffle the data before splitting.

    Returns:
        (Tuple): X_train, X_test, y_train, y_test; same type as inputs

    Raises:
        ValueError: If X and y have a different number of samples, or if
            test_size is not within [0, 1].
    """
    x_is_tensor = isinstance(X, torch.Tensor)
    y_is_tensor = isinstance(y, torch.Tensor)

    # Convert to numpy for easy indexing. detach() is required because
    # Tensor.numpy() refuses tensors that require grad.
    X_np = X.detach().cpu().numpy() if x_is_tensor else np.asarray(X)
    y_np = y.detach().cpu().numpy() if y_is_tensor else np.asarray(y)

    # Check dimensions
    if X_np.shape[0] != y_np.shape[0]:
        raise ValueError(f"X and y must have the same number of samples. Got {X_np.shape[0]} and {y_np.shape[0]}.")

    # Written so that NaN is rejected as well.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1. Got {test_size}.")

    n_samples = X_np.shape[0]
    n_test = int(n_samples * test_size)

    indices = np.arange(n_samples)
    if shuffle:
        # default_rng(None) draws fresh entropy, so no separate branch is needed.
        np.random.default_rng(random_state).shuffle(indices)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train, X_test = X_np[train_indices], X_np[test_indices]
    y_train, y_test = y_np[train_indices], y_np[test_indices]

    # Restore each output independently; deciding from X alone would break
    # mixed tensor/array inputs.
    if x_is_tensor:
        X_train = torch.tensor(X_train, dtype=X.dtype, device=device)
        X_test = torch.tensor(X_test, dtype=X.dtype, device=device)
    if y_is_tensor:
        y_train = torch.tensor(y_train, dtype=y.dtype, device=device)
        y_test = torch.tensor(y_test, dtype=y.dtype, device=device)

    return X_train, X_test, y_train, y_test