Skip to content

slise.data

This script contains functions for modifying data, mainly normalisation and PCA.

DataScaling

Bases: NamedTuple

Container class for scaling information

Source code in slise/data.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
class DataScaling(NamedTuple):
    """
    Immutable record of the values used when normalising a dataset.

    The methods simply forward the stored centers, scales and column mask to the
    module-level helpers, so new data and fitted models can be transformed consistently.
    """

    # Center and scale for the x matrix.
    x_center: np.ndarray
    x_scale: np.ndarray
    # Center and scale for the y vector.
    y_center: float
    y_scale: float
    # Column mask forwarded to `scale_same` and `unscale_model`.
    columns: np.ndarray

    def scale_x(self, x: np.ndarray, remove_columns: bool = True) -> np.ndarray:
        """Apply the stored x scaling to a new matrix / vector.
        See [slise.data.scale_same][].

        Args:
            x (np.ndarray): New x matrix / vector.
            remove_columns (bool, optional): Forwarded to `scale_same` together with the stored column mask. Defaults to True.

        Returns:
            np.ndarray: Scaled matrix / vector.
        """
        return scale_same(
            x,
            center=self.x_center,
            scale=self.x_scale,
            constant_colums=self.columns,
            remove_columns=remove_columns,
        )

    def scale_y(self, y: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
        """Apply the stored y scaling to a new vector / scalar.
        See [slise.data.scale_same][].

        Args:
            y (Union[float, np.ndarray]): New y vector / scalar.

        Returns:
            Union[float, np.ndarray]: Scaled y vector / scalar.
        """
        return scale_same(y, center=self.y_center, scale=self.y_scale)

    def unscale_model(self, model: np.ndarray) -> np.ndarray:
        """Convert a linear model for scaled data into one for unscaled data.
        See [slise.data.unscale_model][].

        Args:
            model (np.ndarray): Linear model operating on scaled data.

        Returns:
            np.ndarray: Linear model operating on unscaled data.
        """
        # Inside a method body the bare name resolves to the module-level function.
        return unscale_model(
            model,
            x_center=self.x_center,
            x_scale=self.x_scale,
            y_center=self.y_center,
            y_scale=self.y_scale,
            columns=self.columns,
        )

scale_x(x, remove_columns=True)

Scale a x matrix / vector using the stored scaling information. See slise.data.scale_same.

Parameters:

Name Type Description Default
x ndarray

New x matrix / vector.

required
remove_columns bool

Remove columns according to the stored information. Defaults to True.

True

Returns:

Type Description
ndarray

np.ndarray: Scaled matrix / vector.

Source code in slise/data.py
199
200
201
202
203
204
205
206
207
208
209
210
def scale_x(self, x: np.ndarray, remove_columns: bool = True) -> np.ndarray:
    """Apply the stored x scaling to a new matrix / vector.
    See [slise.data.scale_same][].

    Args:
        x (np.ndarray): New x matrix / vector.
        remove_columns (bool, optional): Forwarded to `scale_same` together with the stored column mask. Defaults to True.

    Returns:
        np.ndarray: Scaled matrix / vector.
    """
    return scale_same(
        x,
        center=self.x_center,
        scale=self.x_scale,
        constant_colums=self.columns,
        remove_columns=remove_columns,
    )

scale_y(y)

Scale a y vector / scalar using the stored scaling information. See slise.data.scale_same.

Parameters:

Name Type Description Default
y ndarray

New y vector / scalar.

required

Returns:

Type Description
Union[float, ndarray]

np.ndarray: Scaled y vector / scalar.

Source code in slise/data.py
212
213
214
215
216
217
218
219
220
221
222
def scale_y(self, y: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    """Apply the stored y scaling to a new vector / scalar.
    See [slise.data.scale_same][].

    Args:
        y (Union[float, np.ndarray]): New y vector / scalar.

    Returns:
        Union[float, np.ndarray]: Scaled y vector / scalar.
    """
    return scale_same(y, center=self.y_center, scale=self.y_scale)

unscale_model(model)

Unscale a linear model. See slise.data.unscale_model.

Parameters:

Name Type Description Default
model ndarray

Linear model operating on scaled data.

required

Returns:

Type Description
ndarray

np.ndarray: Linear model operating on unscaled data.

Source code in slise/data.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
def unscale_model(self, model: np.ndarray) -> np.ndarray:
    """Convert a linear model for scaled data into one for unscaled data.
    See [slise.data.unscale_model][].

    Args:
        model (np.ndarray): Linear model operating on scaled data.

    Returns:
        np.ndarray: Linear model operating on unscaled data.
    """
    # Forward every stored value by name to the module-level function.
    return unscale_model(
        model,
        x_center=self.x_center,
        x_scale=self.x_scale,
        y_center=self.y_center,
        y_scale=self.y_scale,
        columns=self.columns,
    )

add_intercept_column(X)

Add a constant column of ones to the matrix.

Parameters:

Name Type Description Default
X ndarray

Matrix or vector.

required

Returns:

Type Description
ndarray

np.ndarray: Matrix / vector where the first column / value is one.

Source code in slise/data.py
10
11
12
13
14
15
16
17
18
19
20
21
def add_intercept_column(X: np.ndarray) -> np.ndarray:
    """Prepend an intercept term (a constant one) to a vector or matrix.

    Args:
        X (np.ndarray): Matrix or vector.

    Returns:
        np.ndarray: New array with a leading 1.0 (vector) or a leading column of ones (matrix).
    """
    if X.ndim != 1:
        # Matrix: one extra column of ones, one row per row of X.
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((ones, X), axis=1)
    # Vector: a single leading one.
    return np.concatenate(([1.0], X))

remove_intercept_column(X)

Remove the first column. Used to revert slise.data.add_intercept_column.

Parameters:

Name Type Description Default
X ndarray

Matrix or vector.

required

Returns:

Type Description
ndarray

np.ndarray: Matrix / vector without the first column / value.

Source code in slise/data.py
24
25
26
27
28
29
30
31
32
33
34
35
36
def remove_intercept_column(X: np.ndarray) -> np.ndarray:
    """Drop the leading value (vector) or leading column (matrix).
    Used to revert [slise.data.add_intercept_column][].

    Args:
        X (np.ndarray): Matrix or vector.

    Returns:
        np.ndarray: Matrix / vector without the first column / value.
    """
    is_vector = X.ndim == 1
    return X[1:] if is_vector else X[:, 1:]

remove_constant_columns(X, epsilon=None)

Remove columns that are constant from a matrix. Used to revert slise.data.add_constant_columns.

Parameters:

Name Type Description Default
X ndarray

Data matrix.

required
epsilon Optional[float]

Threshold for treating a column as constant (std <= epsilon). Defaults to machine epsilon.

None

Returns:

Type Description
Tuple[ndarray, ndarray]

Tuple[np.ndarray, np.ndarray]: A tuple of the reduced matrix and a mask showing which columns were retained.

Source code in slise/data.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def remove_constant_columns(
    X: np.ndarray, epsilon: Optional[float] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """Remove columns that are constant from a matrix.
    Used to revert [slise.data.add_constant_columns][].

    Args:
        X (np.ndarray): Data matrix.
        epsilon (Optional[float], optional): Threshold for treating a column as constant (std <= epsilon). Defaults to machine epsilon.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple of the reduced matrix and a mask showing which columns were retained.
    """
    if epsilon is None:
        # np.finfo only accepts inexact (float / complex) dtypes, so integer or
        # boolean matrices fall back to the float64 machine epsilon (np.std
        # computes in floating point for those inputs anyway).
        dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
        epsilon = np.finfo(dtype).eps
    # A column is kept only if its standard deviation exceeds the threshold.
    mask = np.std(X, 0) > epsilon
    return X[:, mask], mask

add_constant_columns(X, mask, intercept=False)

Add (back) constant columns to a matrix.

Parameters:

Name Type Description Default
X ndarray

Data matrix.

required
mask Optional[ndarray]

A boolean array showing which columns are already in the matrix.

required
intercept bool

Does X have an intercept (added to it after constant columns were removed). Defaults to False.

False

Returns:

Type Description
ndarray

np.ndarray: A matrix with new columns filled with zeros.

Source code in slise/data.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def add_constant_columns(
    X: np.ndarray, mask: Optional[np.ndarray], intercept: bool = False
) -> np.ndarray:
    """Add (back) constant columns to a matrix.

    Args:
        X (np.ndarray): Data matrix.
        mask (Optional[np.ndarray]): A boolean array showing which columns are already in the matrix.
        intercept (bool, optional): Does X have an intercept (added to it after constant columns were removed). Defaults to False.

    Returns:
        np.ndarray: A matrix with new columns filled with zeros (X itself if mask is None).
    """
    if mask is None:
        return X
    # The intercept is always present, so it gets a leading True in the mask.
    full_mask = np.concatenate(([True], mask)) if intercept else mask
    width = len(full_mask)
    if len(X.shape) >= 2:
        expanded = np.zeros((X.shape[0], width), X.dtype)
        expanded[:, full_mask] = X
    else:
        expanded = np.zeros(width, X.dtype)
        expanded[full_mask] = X
    return expanded

normalise_robust(x, epsilon=None)

A robust version of normalisation that uses median and mad (median absolute deviation). Any zeros in the scale are replaced by ones to avoid division by zero.

Parameters:

Name Type Description Default
x ndarray

Vector or tensor to normalise.

required
epsilon Optional[float]

Threshold for the scale being zero. Defaults to machine epsilon.

None

Returns:

Type Description
Tuple[ndarray, Union[float, ndarray], Union[float, ndarray]]

Tuple[np.ndarray, Union[float, np.ndarray], Union[float, np.ndarray]]: Tuple of normalised x, center and scale.

Source code in slise/data.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def normalise_robust(
    x: np.ndarray, epsilon: Optional[float] = None
) -> Tuple[np.ndarray, Union[float, np.ndarray], Union[float, np.ndarray]]:
    """A robust version of normalisation that uses median and mad (median absolute deviation).
        Any scale at or below epsilon is replaced by one to avoid division by zero.

    Args:
        x (np.ndarray): Vector or tensor to normalise.
        epsilon (Optional[float], optional): Threshold for the scale being zero. Defaults to machine epsilon.

    Returns:
        Tuple[np.ndarray, Union[float, np.ndarray], Union[float, np.ndarray]]: Tuple of normalised x, center and scale.
    """
    if epsilon is None:
        # np.finfo rejects integer / boolean dtypes; the centred data is
        # floating point in that case, so use the float64 machine epsilon.
        dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
        epsilon = np.finfo(dtype).eps
    if len(x.shape) < 2:
        # Vector: a single center and scale.
        center = np.median(x)
        x = x - center
        scale = np.median(np.abs(x))
        if scale <= epsilon:
            scale = 1.0
        return x / scale, center, scale
    else:
        # Matrix / tensor: center and scale are computed along the first axis.
        center = np.median(x, 0)
        x = x - center[None, :]
        scale = np.median(np.abs(x), 0)
        scale[scale <= epsilon] = 1.0
        return x / scale[None, :], center, scale

scale_same(x, center, scale, constant_colums=None, remove_columns=True)

Scale a matrix or vector the same way as another.

Parameters:

Name Type Description Default
x ndarray

Matrix or vector to scale.

required
center Union[float, ndarray]

The center used for the previous scaling.

required
scale Union[float, ndarray]

The scale used for the previous scaling.

required
constant_colums Optional[ndarray]

Boolean mask of the columns to keep, i.e. True marks the non-constant columns. Defaults to None.

None
remove_columns bool

Should constant columns be removed. Defaults to True.

True

Returns:

Type Description
ndarray

np.ndarray: The scaled matrix/vector.

Source code in slise/data.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def scale_same(
    x: Union[np.ndarray, float],
    center: Union[float, np.ndarray],
    scale: Union[float, np.ndarray],
    constant_colums: Optional[np.ndarray] = None,
    remove_columns: bool = True,
) -> np.ndarray:
    """Scale a matrix or vector the same way as another.

    Args:
        x (Union[np.ndarray, float]): Matrix, vector or scalar to scale.
        center (Union[float, np.ndarray]): The center used for the previous scaling.
        scale (Union[float, np.ndarray]): The scale used for the previous scaling.
        constant_colums (Optional[np.ndarray], optional): Boolean mask selecting the columns that are scaled (True = keep). Defaults to None.
        remove_columns (bool, optional): If True the unselected columns are dropped, otherwise they are kept and set to zero. Defaults to True.

    Returns:
        np.ndarray: The scaled matrix/vector.
    """
    # np.ndim also handles plain Python numbers (including ints), which have no `.shape`.
    is_flat = np.ndim(x) < 2
    if constant_colums is not None and not remove_columns:
        # Keep the full width: unselected columns stay zero, the rest are scaled.
        # The output must be floating point, otherwise integer input would
        # truncate the scaled values on assignment.
        dtype = np.asarray(x).dtype
        if not np.issubdtype(dtype, np.inexact):
            dtype = np.float64
        y = np.zeros_like(x, dtype=dtype)
        if is_flat:
            y[constant_colums] = (x[constant_colums] - center) / scale
        else:
            y[:, constant_colums] = (
                x[:, constant_colums] - center[None, :]
            ) / scale[None, :]
        return y
    if is_flat:
        if constant_colums is not None:
            x = x[constant_colums]
        return (x - center) / scale
    if constant_colums is not None:
        x = x[:, constant_colums]
    return (x - center[None, :]) / scale[None, :]

unscale_model(model, x_center, x_scale, y_center=0.0, y_scale=1.0, columns=None)

Scale a linear model such that it matches unnormalised data.

Parameters:

Name Type Description Default
model ndarray

The model for normalised data.

required
x_center ndarray

The center used for normalising X.

required
x_scale ndarray

The scale used for normalising X.

required
y_center float

The center used for normalising y. Defaults to 0.0.

0.0
y_scale float

The scale used for normalising y. Defaults to 1.0.

1.0
columns Optional[ndarray]

Mask of retained columns (see remove_constant_columns). Defaults to None.

None

Returns:

Type Description
ndarray

np.ndarray: The unscaled model.

Source code in slise/data.py
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
def unscale_model(
    model: np.ndarray,
    x_center: np.ndarray,
    x_scale: np.ndarray,
    y_center: float = 0.0,
    y_scale: float = 1.0,
    columns: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Convert a linear model fitted on normalised data so that it applies to unnormalised data.

    Args:
        model (np.ndarray): The model for normalised data. If it has the same length as x_center, a zero intercept is prepended.
        x_center (np.ndarray): The center used for normalising X.
        x_scale (np.ndarray): The scale used for normalising X.
        y_center (float, optional): The center used for normalising y. Defaults to 0.0.
        y_scale (float, optional): The scale used for normalising y. Defaults to 1.0.
        columns (Optional[np.ndarray], optional): Boolean mask of the columns the model has coefficients for; the others get zero coefficients (see add_constant_columns). Defaults to None.

    Returns:
        np.ndarray: The unscaled model (intercept first); the input model is not modified.
    """
    has_intercept = len(model) != len(x_center)
    if has_intercept:
        coefficients = model.copy()
    else:
        coefficients = np.concatenate((np.zeros(1, x_center.dtype), model))
    # The intercept must be computed from the still-scaled slopes.
    slopes = coefficients[1:]
    offset = np.sum(slopes * x_center / x_scale)
    coefficients[0] = (coefficients[0] - offset) * y_scale + y_center
    coefficients[1:] = slopes / x_scale * y_scale
    if columns is None:
        return coefficients
    return add_constant_columns(coefficients, columns, True)

pca_simple(x, dimensions=10, tolerance=1e-10)

Fit and use PCA for dimensionality reduction.

Parameters:

Name Type Description Default
x ndarray

Matrix to reduce.

required
dimensions int

The number of dimensions to return. Defaults to 10.

10
tolerance float

Threshold for variance being zero. Defaults to 1e-10.

1e-10

Returns:

Type Description
Tuple[ndarray, ndarray]

Tuple[np.ndarray, np.ndarray]: Tuple of the reduced matrix and PCA rotation matrix.

Source code in slise/data.py
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
def pca_simple(
    x: np.ndarray, dimensions: int = 10, tolerance: float = 1e-10
) -> Tuple[np.ndarray, np.ndarray]:
    """Reduce the dimensionality of a matrix with a PCA computed through SVD.

    Args:
        x (np.ndarray): Matrix to reduce. A one-dimensional input is returned unchanged with rotation 1.0.
        dimensions (int, optional): The number of dimensions to return. Defaults to 10.
        tolerance (float, optional): Singular values not larger than the largest one times this tolerance are dropped. Defaults to 1e-10.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Tuple of the reduced matrix and PCA rotation matrix.
    """
    if len(x.shape) == 1:
        return x, 1.0
    limit = min(dimensions, *x.shape)
    u, s, v = np.linalg.svd(x, full_matrices=False, compute_uv=True, hermitian=False)
    # Count the leading singular values that are significant, keeping at least one.
    leading = s[: min(limit, len(s))]
    significant = np.sum(leading > s[0] * tolerance)
    keep = max(1, significant)
    reduced = np.dot(u[:, :keep], np.diag(s[:keep]))
    return reduced, v[:keep, :]

pca_rotate(x, v)

Use a trained PCA for dimensionality reduction. See slise.data.pca_simple for how to obtain a rotation matrix.

Parameters:

Name Type Description Default
x ndarray

Matrix to reduce.

required
v ndarray

PCA rotation matrix.

required

Returns:

Type Description
ndarray

np.ndarray: The reduced matrix.

Source code in slise/data.py
265
266
267
268
269
270
271
272
273
274
275
276
def pca_rotate(x: np.ndarray, v: np.ndarray) -> np.ndarray:
    """Project data into PCA space with an already fitted rotation.
    See [slise.data.pca_simple][] for how to obtain a rotation matrix.

    Args:
        x (np.ndarray): Matrix to reduce.
        v (np.ndarray): PCA rotation matrix.

    Returns:
        np.ndarray: The reduced matrix.
    """
    rotation = v.T
    reduced = x @ rotation
    return reduced

pca_invert(x, v)

Revert a PCA dimensionality reduction. See slise.data.pca_simple for how to obtain a rotation matrix.

Parameters:

Name Type Description Default
x ndarray

Matrix to expand.

required
v ndarray

PCA rotation matrix.

required

Returns:

Type Description
ndarray

np.ndarray: The expanded matrix.

Source code in slise/data.py
279
280
281
282
283
284
285
286
287
288
289
290
def pca_invert(x: np.ndarray, v: np.ndarray) -> np.ndarray:
    """Map data from PCA space back to the original space.
    See [slise.data.pca_simple][] for how to obtain a rotation matrix.

    Args:
        x (np.ndarray): Matrix to expand.
        v (np.ndarray): PCA rotation matrix.

    Returns:
        np.ndarray: The expanded matrix.
    """
    expanded = x @ v
    return expanded

pca_rotate_model(model, v)

Transform a linear model to work in PCA reduced space. See slise.data.pca_simple for how to obtain a rotation matrix.

Parameters:

Name Type Description Default
model ndarray

Linear model coefficients.

required
v ndarray

PCA rotation matrix.

required

Returns:

Type Description
ndarray

np.ndarray: The transformed linear model.

Source code in slise/data.py
293
294
295
296
297
298
299
300
301
302
303
304
305
306
def pca_rotate_model(model: np.ndarray, v: np.ndarray) -> np.ndarray:
    """Express a linear model in PCA reduced space.
    See [slise.data.pca_simple][] for how to obtain a rotation matrix.

    Args:
        model (np.ndarray): Linear model coefficients. If it is longer than the number of columns of v, the first value is treated as an intercept and kept as is.
        v (np.ndarray): PCA rotation matrix.

    Returns:
        np.ndarray: The transformed linear model.
    """
    has_intercept = len(model) > v.shape[1]
    if not has_intercept:
        return v @ model
    intercept, coefficients = model[:1], model[1:]
    return np.concatenate((intercept, v @ coefficients))

pca_invert_model(model, v)

Transform a linear model from PCA space to "normal" space. See slise.data.pca_simple for how to obtain a rotation matrix.

Parameters:

Name Type Description Default
model ndarray

Linear model coefficients (in PCA space).

required
v ndarray

PCA rotation matrix.

required

Returns:

Type Description
ndarray

np.ndarray: The transformed linear model.

Source code in slise/data.py
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def pca_invert_model(model: np.ndarray, v: np.ndarray) -> np.ndarray:
    """Express a linear model from PCA space in "normal" space.
    See [slise.data.pca_simple][] for how to obtain a rotation matrix.

    Args:
        model (np.ndarray): Linear model coefficients (in PCA space). If it is longer than the number of rows of v, the first value is treated as an intercept and kept as is.
        v (np.ndarray): PCA rotation matrix.

    Returns:
        np.ndarray: The transformed linear model.
    """
    has_intercept = len(model) > v.shape[0]
    if not has_intercept:
        return v.T @ model
    intercept, coefficients = model[:1], model[1:]
    return np.concatenate((intercept, v.T @ coefficients))