mcikmeans

`MCIKMeans`

Implementation of K-Means with Minimization of Cluster Impurity (MCI-Kmeans), as described in [1].

This algorithm implements a semi-supervised version of K-Means that aims to minimize the intra-cluster dispersion while also minimizing the impurity of each cluster.

[1] Masud, Mohammad M., et al. "A practical approach to classify evolving data streams: Training with limited amount of labeled data." 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

Parameters:

Name	Type	Description	Default
`n_clusters`	`int`	Number of clusters to generate	`8`
`max_iter`	`int`	Maximum number of iterations of the M-Step	`300`
`conditional_mode_max_iter`	`int`	Maximum number of iterations of the E-Step	`300`
`random_state`	`int`	Seed for the random number generation. Makes the algorithm deterministic if a number is provided.	`None`

Attributes:

Name	Type	Description
`clusters`	`list`	List containing the clusters, indexed by cluster id
`cluster_centers_`	`ndarray`	Array containing the coordinates of the cluster centers
`labels_`	`ndarray`	Labels of each point

Source code in streamndr/utils/mcikmeans.py

class MCIKMeans():
    """Implementation of K-Means with Minimization of Cluster Impurity (MCI-Kmeans), as described in [1].

    This algorithm implements a semi-supervised version of K-Means that aims to minimize the intra-cluster dispersion
    while also minimizing the impurity of each cluster.

    [1] Masud, Mohammad M., et al. "A practical approach to classify evolving data streams: Training with limited amount of labeled data." 
    2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to generate
    max_iter : int
        Maximum number of iterations of the M-Step
    conditional_mode_max_iter : int
        Maximum number of iterations of the E-Step
    random_state : int
        Seed for the random number generation. Makes the algorithm deterministic if a number is provided.
        Note that the seed is applied to the global ``random`` module when the estimator is constructed.

    Attributes
    ----------
    clusters : list
        List containing the clusters, indexed by cluster id
    cluster_centers_ : numpy.ndarray
        Array containing the coordinates of the cluster centers
    labels_ : numpy.ndarray
        Labels of each point
    """
    def __init__(self,
                 n_clusters=8,
                 max_iter=300,
                 conditional_mode_max_iter=300,
                 random_state=None):

        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.conditional_mode_max_iter = conditional_mode_max_iter
        self.random_state = random_state

        if random_state is not None:
            random.seed(random_state)

        self.clusters = []

    def fit(self, X, y):
        """Compute MCI-Kmeans clustering.

        Parameters
        ----------
        X : numpy.ndarray
            Samples
        y : list of int
            Labels of the samples, expects -1 if the label is not known

        Returns
        -------
        MCIKmeans
            Fitted estimator
        """
        y = np.array(y)

        # Start from a clean slate so that calling fit() twice does not stack
        # the new clusters on top of the ones from the previous fit.
        self.clusters = []

        samples_per_class = {}
        number_of_centroids = {}
        unlabeled_samples = []

        nb_labeled_samples = len(X[y != -1])
        for label in np.unique(y):
            if label == -1:
                unlabeled_samples = X[y == -1]
                continue

            samples_per_class[label] = X[y == label]

            # Each class gets a share of the centroids proportional to its share of the labeled samples
            weight = self.n_clusters * len(samples_per_class[label]) / nb_labeled_samples
            number_of_centroids[label] = round(weight)

        if number_of_centroids:
            # Centroids lost to rounding. Computed from the rounded counts themselves so that
            # floating-point drift cannot add a spurious extra centroid.
            remaining = self.n_clusters - sum(number_of_centroids.values())
            while remaining > 0:
                # Give one more centroid to the label that currently has the fewest
                key_with_min_value = min(number_of_centroids, key=number_of_centroids.get)
                number_of_centroids[key_with_min_value] += 1
                remaining -= 1

        centroids = []
        for label, class_samples in samples_per_class.items():
            label_centroids = list(self._init_centroids(class_samples, number_of_centroids[label]))

            # The quota is per label, so the shortfall is measured on this label's centroids only
            shortfall = number_of_centroids[label] - len(label_centroids)
            if shortfall > 0 and len(unlabeled_samples) > 0:
                # Work on a list copy: rows can be popped from it without touching the samples
                filling_samples = list(copy.deepcopy(unlabeled_samples))

                while shortfall > 0 and filling_samples:
                    label_centroids.append(filling_samples.pop(random.randrange(len(filling_samples))))
                    shortfall -= 1

            centroids.extend(label_centroids)

        for i, centroid in enumerate(centroids):
            self.clusters.append(ImpurityBasedCluster(i, centroid))

        iterations = 0
        changing = True

        while changing and iterations < self.max_iter:
            changing = self._iterative_conditional_mode(samples_per_class, unlabeled_samples)

            for cluster in self.clusters:
                if cluster.n > 0:
                    cluster.update_properties()

            iterations += 1

        self.cluster_centers_ = np.array([cluster.centroid for cluster in self.clusters])
        self.labels_ = self.predict(X)

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : numpy.ndarray
            Samples to predict

        Returns
        -------
        numpy.ndarray
            Index of the cluster each sample belongs to
        """
        labels, _ = get_closest_clusters(X, [cluster.centroid for cluster in self.clusters])

        return labels

    def fit_predict(self, X, y):
        """Compute cluster centers and predict cluster index for each sample. Convenience method; returns the
        ``labels_`` computed by fit(X, y).

        Parameters
        ----------
        X : numpy.ndarray
            Samples
        y : list of int
            Labels of the samples, expects -1 if the label is not known

        Returns
        -------
        numpy.ndarray
            Index of the cluster each sample belongs to
        """
        return self.fit(X, y).labels_

    def _init_centroids(self, samples, numbers_of_centroids):
        """Pick the initial centroids among the given samples.

        Parameters
        ----------
        samples : numpy.ndarray
            Candidate samples
        numbers_of_centroids : int
            Number of centroids wanted

        Returns
        -------
        list
            All the samples when there are not more samples than requested centroids, otherwise
            ``numbers_of_centroids`` distinct samples drawn at random
        """
        if len(samples) <= numbers_of_centroids:
            return list(samples)

        # Draw without replacement from a private copy of the samples
        candidates = copy.deepcopy(samples).tolist()

        return [candidates.pop(random.randrange(len(candidates))) for _ in range(numbers_of_centroids)]

    def _iterative_conditional_mode(self, samples_per_class, unlabeled_samples):
        """Assign every sample, in random order, to the cluster minimizing its (impurity-weighted) distance.

        Parameters
        ----------
        samples_per_class : dict
            Labeled samples, keyed by their label
        unlabeled_samples : numpy.ndarray or list
            Samples without a label

        Returns
        -------
        bool
            True if at least one sample was assigned to a cluster whose label differs from the cluster id the
            sample previously held
        """
        _labeled_samples = []
        for key, value in samples_per_class.items():
            _labeled_samples.extend([ShortMemInstance(x, None, key) for x in value])

        _unlabeled_samples = [ShortMemInstance(x, None, -1) for x in unlabeled_samples]

        iterations = 0
        changed = True
        no_change = True

        while iterations < self.conditional_mode_max_iter and changed:
            total_nb_samples = len(_labeled_samples) + len(_unlabeled_samples)

            iterations += 1
            changed = False

            for _ in range(total_nb_samples):
                # Decide from which pool to draw. The test must look at the *working* pools: once the
                # unlabeled pool is drained, drawing from it would call random.randrange(0) and raise.
                if _labeled_samples and (not _unlabeled_samples or bool(random.getrandbits(1))):
                    sample = _labeled_samples.pop(random.randrange(len(_labeled_samples)))
                else:
                    sample = _unlabeled_samples.pop(random.randrange(len(_unlabeled_samples)))

                # The timestamp field is used here to remember the id of the cluster holding the sample
                previous_cluster_id = sample.timestamp

                if previous_cluster_id is not None:
                    self.clusters[previous_cluster_id].remove_sample(sample)
                    sample.timestamp = None

                distances = np.linalg.norm(np.array([cluster.centroid for cluster in self.clusters]) - sample.point, axis=1)

                if sample.y_true != -1:
                    # Labeled samples are penalized for joining impure clusters holding other labels
                    entropies = np.array([cluster.entropy for cluster in self.clusters])
                    dissimilarities = np.array([cluster.dissimilarity_count(sample) for cluster in self.clusters])
                    distances = distances * (1 + entropies * dissimilarities)

                chosen_cluster = np.argmin(distances)

                self.clusters[chosen_cluster].add_sample(sample)
                sample.timestamp = chosen_cluster

                self.clusters[chosen_cluster].update_entropy()

                if self.clusters[chosen_cluster].label != previous_cluster_id:
                    changed = True
                    no_change = False

        return not no_change

`fit(X, y)`

Compute MCI-Kmeans clustering.

Parameters:

Name	Type	Description	Default
`X`	`ndarray`	Samples	required
`y`	`list of int`	Labels of the samples, expects -1 if the label is not known	required

Returns:

Type	Description
`MCIKmeans`	Fitted estimator

Source code in streamndr/utils/mcikmeans.py

def fit(self, X, y):
    """Compute MCI-Kmeans clustering.

    Parameters
    ----------
    X : numpy.ndarray
        Samples
    y : list of int
        Labels of the samples, expects -1 if the label is not known

    Returns
    -------
    MCIKmeans
        Fitted estimator
    """
    y = np.array(y)

    # Start from a clean slate so that calling fit() twice does not stack
    # the new clusters on top of the ones from the previous fit.
    self.clusters = []

    samples_per_class = {}
    number_of_centroids = {}
    unlabeled_samples = []

    nb_labeled_samples = len(X[y != -1])
    for label in np.unique(y):
        if label == -1:
            unlabeled_samples = X[y == -1]
            continue

        samples_per_class[label] = X[y == label]

        # Each class gets a share of the centroids proportional to its share of the labeled samples
        weight = self.n_clusters * len(samples_per_class[label]) / nb_labeled_samples
        number_of_centroids[label] = round(weight)

    if number_of_centroids:
        # Centroids lost to rounding. Computed from the rounded counts themselves so that
        # floating-point drift cannot add a spurious extra centroid.
        remaining = self.n_clusters - sum(number_of_centroids.values())
        while remaining > 0:
            # Give one more centroid to the label that currently has the fewest
            key_with_min_value = min(number_of_centroids, key=number_of_centroids.get)
            number_of_centroids[key_with_min_value] += 1
            remaining -= 1

    centroids = []
    for label, class_samples in samples_per_class.items():
        label_centroids = list(self._init_centroids(class_samples, number_of_centroids[label]))

        # The quota is per label, so the shortfall is measured on this label's centroids only
        shortfall = number_of_centroids[label] - len(label_centroids)
        if shortfall > 0 and len(unlabeled_samples) > 0:
            # Work on a list copy: rows can be popped from it without touching the samples
            filling_samples = list(copy.deepcopy(unlabeled_samples))

            while shortfall > 0 and filling_samples:
                label_centroids.append(filling_samples.pop(random.randrange(len(filling_samples))))
                shortfall -= 1

        centroids.extend(label_centroids)

    for i, centroid in enumerate(centroids):
        self.clusters.append(ImpurityBasedCluster(i, centroid))

    iterations = 0
    changing = True

    while changing and iterations < self.max_iter:
        changing = self._iterative_conditional_mode(samples_per_class, unlabeled_samples)

        for cluster in self.clusters:
            if cluster.n > 0:
                cluster.update_properties()

        iterations += 1

    self.cluster_centers_ = np.array([cluster.centroid for cluster in self.clusters])
    self.labels_ = self.predict(X)

    return self

`fit_predict(X, y)`

Compute cluster centers and predict cluster index for each sample. Convenience method; equivalent to calling fit(X, y) and returning the resulting `labels_`.

Parameters:

Name	Type	Description	Default
`X`	`ndarray`	Samples	required
`y`	`list of int`	Labels of the samples, expects -1 if the label is not known	required

Returns:

Type	Description
`ndarray`	Index of the cluster each sample belongs to

Source code in streamndr/utils/mcikmeans.py

def fit_predict(self, X, y):
    """Fit the estimator on (X, y) and return the cluster index of each sample of X.

    Convenience method: runs fit(X, y) and hands back the ``labels_`` of the fitted estimator.

    Parameters
    ----------
    X : numpy.ndarray
        Samples
    y : list of int
        Labels of the samples, expects -1 if the label is not known

    Returns
    -------
    numpy.ndarray
        Index of the cluster each sample belongs to
    """
    fitted = self.fit(X, y)
    return fitted.labels_

`predict(X)`

Predict the closest cluster each sample in X belongs to.

Parameters:

Name	Type	Description	Default
`X`	`ndarray`	Samples to predict	required

Returns:

Type	Description
`ndarray`	Index of the cluster each sample belongs to

Source code in streamndr/utils/mcikmeans.py

def predict(self, X):
    """Return, for each sample in X, the index of its closest cluster.

    Parameters
    ----------
    X : numpy.ndarray
        Samples to predict

    Returns
    -------
    numpy.ndarray
        Index of the cluster each sample belongs to
    """
    # Only the first element returned by get_closest_clusters is used; the second is ignored
    centroids = [cluster.centroid for cluster in self.clusters]
    closest, _ignored = get_closest_clusters(X, centroids)

    return closest

`MicroCluster`

Bases: object

A representation of a cluster with compressed information.

Parameters:

Name	Type	Description	Default
`label`	`int`	Label associated with this microcluster	required
`instances`	`ndarray`	Instances in this microcluster, preferably these would not be stored if not needed using keep_instances=False. Will be converted to Python list for append performance.	`None`
`timestamp`	`int`	Timestamp this microcluster was last updated, used for forgetting mechanisms	`0`
`keep_instances`	`bool`	Whether or not to store the instances within the microcluster. Should preferably set to false, but some implementations require access to the instances	`True`

Attributes:

Name	Type	Description
`n`	`int`	Number of instances stored in this microcluster
`linear_sum`	`ndarray`	Linear sum of the points belonging to this microcluster
`squared_sum`	`ndarray`	Sum of the squared l2 norms of all samples belonging to this microcluster
`centroid`	`ndarray`	Centroid coordinates of the microcluster
`max_distance`	`ndarray`	Maximum distance between a point belonging to the microcluster and its centroid
`mean_distance`	`ndarray`	Mean distance of the distances between the cluster's points and its centroid

Source code in streamndr/utils/data_structure.py

class MicroCluster(object):
    """A representation of a cluster with compressed information.

    Parameters
    ----------
    label : int
        Label associated with this microcluster
    instances : numpy.ndarray
        Instances in this microcluster, preferably these would not be stored if not needed using keep_instances=False. Will be converted to Python list for append performance.
    timestamp : int
        Timestamp this microcluster was last updated, used for forgetting mechanisms  
    keep_instances : bool
        Whether or not to store the instances within the microcluster. Should preferably set to false, but some implementations require
        access to the instances

    Attributes
    ----------
    n : int
        Number of instances stored in this microcluster
    linear_sum : numpy.ndarray
        Linear sum of the points belonging to this microcluster
    squared_sum : float
        Sum of the squared l2 norms of all samples belonging to this microcluster
    centroid : numpy.ndarray
        Centroid coordinates of the microcluster, None while the microcluster is empty
    radius : float
        Radius of the microcluster as computed by get_radius(), 0 while the microcluster is empty
    max_distance : float
        Maximum distance between a point belonging to the microcluster and its centroid
    mean_distance : float
        Mean distance of the distances between the cluster's points and its centroid
    """

    def __init__(self,
                 label,  # the class the microcluster belongs to
                 instances=None,
                 timestamp=0, 
                 keep_instances=True #Required True for MINAS
                 ):

        # TODO: remove instances entirely so it doesn't need to be stored in memory; Might not be possible because of _best_threshold used by MINAS which needs instances
        super().__init__()
        self.label = label

        if instances is not None:
            self.instances = instances.tolist()
            self.n = len(instances)
            self.linear_sum = instances.sum(axis=0)

            # Sum of the squared l2 norms of all samples belonging to a microcluster:
            self.squared_sum = np.square(np.linalg.norm(self.instances, axis=1)).sum()
            self.centroid = self.linear_sum / self.n

            distances = self.distance_to_centroid(instances)
            self.max_distance = np.max(distances)
            self.mean_distance = np.mean(distances)
            self.update_properties()

        else:
            self.instances = None
            self.n = 0
            self.linear_sum = 0
            self.squared_sum = 0
            self.max_distance = 0
            self.mean_distance = 0
            # Define these too, so that an empty microcluster can still be printed
            # instead of raising AttributeError.
            self.centroid = None
            self.radius = 0

        self.timestamp = timestamp

        if not keep_instances:
            self.instances = None

    def __str__(self):
        """Returns string representation of a microcluster.

        Returns
        -------
        str
            String representation of microcluster
        """

        return f"""Target class {self.label}
                # of instances: {self.n}
                Linear sum: {self.linear_sum}
                Squared sum: {self.squared_sum}
                Centroid: {self.centroid}
                Radius: {self.radius}
                Timestamp of last change: {self.timestamp}"""

    def small_str(self):
        """Returns a short string representation of a microcluster.

        Returns
        -------
        str
            Small string representation of microcluster
        """

        return f"""Target class {self.label}
                # of instances: {self.n}
                Timestamp of last change: {self.timestamp}"""

    def get_radius(self):
        """Returns radius of the microcluster.

        Returns
        -------
        float
            Radius of the microcluster
        """

        factor = 1.5
        # from BIRCH Wikipedia
        diff = (self.squared_sum / self.n) - np.dot(self.centroid, self.centroid)
        if diff > 1e-15:
            return factor * np.sqrt(diff)
        else:  # in this case diff should be zero, but sometimes it's an infinitesimal difference
            return 0

    def distance_to_centroid(self, X):
        """Returns distance from X to centroid of this cluster.

        Parameters
        ----------
        X : numpy.ndarray or list
            Point or multiple points

        Returns
        -------
        numpy.ndarray
            Distance from X to the microcluster's centroid
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if len(X.shape) == 1:  # X is only one point
            return np.linalg.norm(X - self.centroid)
        else:  # X contains several points
            return np.linalg.norm(X - self.centroid, axis=1)

    def encompasses(self, X):
        """Checks if point X is inside this microcluster. The point X is considered within the microcluster if the distance 
        between the point and the microcluster's centroid is less than the radius of the microcluster.

        Parameters
        ----------
        X : numpy.ndarray
            One point

        Returns
        -------
        bool
            If the point distance to centroid is contained within the microcluster or not
        """

        return np.less(self.distance_to_centroid(X), self.radius)

    def find_closest_cluster(self, clusters):
        """Finds closest microcluster to this one among passed microclusters.

        Parameters
        ----------
        clusters : list of MicroCluster

        Returns
        -------
        MicroCluster
            Closest microcluster
        """

        return min(clusters, key=lambda cl: cl.distance_to_centroid(self.centroid))

    def update_cluster(self, X, timestamp, update_summary):
        """Adds point received in parameter to the cluster and update cluster's centroid if wanted.

        Parameters
        ----------
        X : numpy.ndarray
            One point
        timestamp : int
            Timestamp when this point was added to this microcluster
        update_summary : bool
            Whether or not to update the microcluster properties with this new point
        """

        assert len(X.shape) == 1  # it's just one point
        self.timestamp = timestamp

        if self.instances is not None:
            self.instances.append(X)

        if update_summary:
            # An empty microcluster has no centroid yet: its first point becomes the centroid,
            # hence a distance of 0. The distance is measured against the centroid *before* the update.
            distance = 0.0 if self.n == 0 else self.distance_to_centroid(X)
            self.mean_distance = (self.n * self.mean_distance + distance) / (self.n + 1)
            self.n += 1
            # Plain additions also work when the sums still hold their scalar initial value 0
            self.linear_sum = self.linear_sum + X
            self.squared_sum = self.squared_sum + np.square(X).sum()
            self.update_properties()

    def update_properties(self):
        """Updates centroid and radius based on current cluster properties.

        The radius and the maximum distance are only refreshed when the instances are stored.
        """
        self.centroid = self.linear_sum / self.n

        if self.instances is not None:
            self.radius = self.get_radius()
            # max_distance only ever grows
            farthest = np.max(self.distance_to_centroid(self.instances))
            if farthest > self.max_distance:
                self.max_distance = farthest

    def is_cohesive(self, clusters):
        """Verifies if this cluster is cohesive for novelty detection purposes.
        A new micro-cluster is cohesive if its silhouette coefficient is larger than 0.
        'b' represents the Euclidean distance between the centroid of the new micro-cluster and the centroid of its
        closest micro-cluster, and 'a' represents the standard deviation of the distances between the examples of the
        new micro-cluster and the centroid of the new micro-cluster.

        Parameters
        ----------
        clusters : List of MicroCluster
            Existing known micro-clusters

        Returns
        -------
        bool
            If the cluster is cohesive (silhouette>0) or not
        """
        b = self.distance_to_centroid(self.find_closest_cluster(clusters).centroid)
        a = np.std(self.distance_to_centroid(self.instances))
        # silhouette = (b - a) / max(a, b). Both a and b are non-negative, so the silhouette is
        # positive exactly when b > a; comparing directly avoids the 0/0 division when a == b == 0.
        return bool(b > a)

    def is_representative(self, min_examples):
        """Verifies if this cluster is representative for novelty detection purposes.
        A new micro-cluster is representative if it contains a minimal number of examples,
        where this number is a user-defined parameter.

        Parameters
        ----------
        min_examples : int
            The number of samples the microcluster needs to have to be considered representative.

        Returns
        -------
        bool
            If the cluster is representative or not
        """
        return self.n >= min_examples

`__str__()`

Returns string representation of a microcluster.

Returns:

Type	Description
`str`	String representation of microcluster

Source code in streamndr/utils/data_structure.py

def __str__(self):
    """Build the full multi-line description of the microcluster.

    Returns
    -------
    str
        String representation of microcluster
    """
    # Every line after the first is indented by twelve spaces
    separator = "\n" + " " * 12
    rows = [
        f"Target class {self.label}",
        f"# of instances: {self.n}",
        f"Linear sum: {self.linear_sum}",
        f"Squared sum: {self.squared_sum}",
        f"Centroid: {self.centroid}",
        f"Radius: {self.radius}",
        f"Timestamp of last change: {self.timestamp}",
    ]

    return separator.join(rows)

`distance_to_centroid(X)`

Returns distance from X to centroid of this cluster.

Parameters:

Name	Type	Description	Default
`X`	`ndarray or list`	Point or multiple points	required

Returns:

Type	Description
`ndarray`	Distance from X to the microcluster's centroid

Source code in streamndr/utils/data_structure.py

def distance_to_centroid(self, X):
    """Compute the Euclidean distance between X and the centroid of this cluster.

    Parameters
    ----------
    X : numpy.ndarray or list
        Point or multiple points

    Returns
    -------
    numpy.ndarray
        Distance from X to the microcluster's centroid (one value per point when X holds several points)
    """
    points = X if isinstance(X, np.ndarray) else np.array(X)

    # A 1-D input is a single point: take the norm of the whole difference.
    # Otherwise compute one norm per row.
    axis = None if len(points.shape) == 1 else 1

    return np.linalg.norm(points - self.centroid, axis=axis)

`encompasses(X)`

Checks if point X is inside this microcluster. The point X is considered within the microcluster if the distance between the point and the microcluster's centroid is less than the radius of the microcluster.

Parameters:

Name	Type	Description	Default
`X`	`ndarray`	One point	required

Returns:

Type	Description
`bool`	If the point distance to centroid is contained within the microcluster or not

Source code in streamndr/utils/data_structure.py

def encompasses(self, X):
    """Tell whether point X lies inside this microcluster, i.e. whether its distance to the
    microcluster's centroid is strictly less than the microcluster's radius.

    Parameters
    ----------
    X : numpy.ndarray
        One point

    Returns
    -------
    bool
        If the point distance to centroid is contained within the microcluster or not
    """
    distance = self.distance_to_centroid(X)
    inside = np.less(distance, self.radius)

    return inside

`find_closest_cluster(clusters)`

Finds closest microcluster to this one among passed microclusters.

Parameters:

Name	Type	Description	Default
`clusters`	`list of MicroCluster`		required

Returns:

Type	Description
`MicroCluster`	Closest microcluster

Source code in streamndr/utils/data_structure.py

def find_closest_cluster(self, clusters):
    """Return, among the passed microclusters, the one whose centroid is closest to this one's.

    Parameters
    ----------
    clusters : list of MicroCluster

    Returns
    -------
    MicroCluster
        Closest microcluster
    """
    def distance_from(candidate):
        # Distance between the candidate's centroid and this microcluster's centroid
        return candidate.distance_to_centroid(self.centroid)

    return min(clusters, key=distance_from)

`get_radius()`

Returns radius of the microcluster.

Returns:

Type	Description
`float`	Radius of the microcluster

Source code in streamndr/utils/data_structure.py

def get_radius(self):
    """Compute the radius of the microcluster from its summary statistics.

    Returns
    -------
    float
        Radius of the microcluster
    """
    # from BIRCH Wikipedia: mean squared norm minus squared norm of the centroid
    spread = (self.squared_sum / self.n) - np.dot(self.centroid, self.centroid)

    # The spread should be exactly zero here, but it is sometimes an infinitesimal difference
    if not spread > 1e-15:
        return 0

    scale = 1.5
    return scale * np.sqrt(spread)

`is_cohesive(clusters)`

Verifies if this cluster is cohesive for novelty detection purposes. A new micro-cluster is cohesive if its silhouette coefficient is larger than 0. 'b' represents the Euclidean distance between the centroid of the new micro-cluster and the centroid of its closest micro-cluster, and 'a' represents the standard deviation of the distances between the examples of the new micro-cluster and the centroid of the new micro-cluster.

Parameters:

Name	Type	Description	Default
`clusters`	`List of MicroCluster`	Existing known micro-clusters	required

Returns:

Type	Description
`bool`	If the cluster is cohesive (silhouette>0) or not

Source code in streamndr/utils/data_structure.py

def is_cohesive(self, clusters):
    """Verifies if this cluster is cohesive for novelty detection purposes.
    A new micro-cluster is cohesive if its silhouette coefficient is larger than 0.
    'b' represents the Euclidean distance between the centroid of the new micro-cluster and the centroid of its
    closest micro-cluster, and 'a' represents the standard deviation of the distances between the examples of the
    new micro-cluster and the centroid of the new micro-cluster.

    Parameters
    ----------
    clusters : List of MicroCluster
        Existing known micro-clusters

    Returns
    -------
    bool
        If the cluster is cohesive (silhouette>0) or not
    """
    b = self.distance_to_centroid(self.find_closest_cluster(clusters).centroid)
    a = np.std(self.distance_to_centroid(self.instances))
    # silhouette = (b - a) / max(a, b). Both a and b are non-negative, so the silhouette is
    # positive exactly when b > a; comparing directly avoids the 0/0 division when a == b == 0.
    return bool(b > a)

`is_representative(min_examples)`

Verifies if this cluster is representative for novelty detection purposes. A new micro-cluster is representative if it contains a minimal number of examples, where this number is a user-defined parameter.

Parameters:

Name	Type	Description	Default
`min_examples`	`int`	The number of samples the microcluster needs to have to be considered representative.	required

Returns:

Type	Description
`bool`	If the cluster is representative or not

Source code in streamndr/utils/data_structure.py

def is_representative(self, min_examples):
    """Tell whether this cluster is representative for novelty detection purposes.
    A new micro-cluster is representative when it holds at least a user-defined
    minimal number of examples.

    Parameters
    ----------
    min_examples : int
        The number of samples the microcluster needs to have to be considered representative.

    Returns
    -------
    bool
        If the cluster is representative or not
    """
    has_enough_examples = self.n >= min_examples

    return has_enough_examples

`small_str()`

Returns string representation of a microcluster.

Returns:

Type	Description
`str`	Small string representation of microcluster

Source code in streamndr/utils/data_structure.py

def small_str(self):
    """Build a short description of the microcluster: label, number of instances and timestamp.

    Returns
    -------
    str
        Small string representation of microcluster
    """
    # Every line after the first is indented by twelve spaces
    separator = "\n" + " " * 12
    rows = [
        f"Target class {self.label}",
        f"# of instances: {self.n}",
        f"Timestamp of last change: {self.timestamp}",
    ]

    return separator.join(rows)

`update_cluster(X, timestamp, update_summary)`

Adds point received in parameter to the cluster and update cluster's centroid if wanted.

Parameters:

Name	Type	Description	Default
`X`	`ndarray`	One point	required
`timestamp`	`int`	Timestamp when this point was added to this microcluster	required
`update_summary`	`bool`	Whether or not to update the microcluster properties with this new point	required

Source code in streamndr/utils/data_structure.py

def update_cluster(self, X, timestamp, update_summary):
    """Adds point received in parameter to the cluster and update cluster's centroid if wanted.

    Parameters
    ----------
    X : numpy.ndarray
        One point
    timestamp : int
        Timestamp when this point was added to this microcluster
    update_summary : bool
        Whether or not to update the microcluster properties with this new point
    """

    assert len(X.shape) == 1  # it's just one point
    self.timestamp = timestamp

    if self.instances is not None:
        self.instances.append(X)

    if update_summary:
        # An empty microcluster has no centroid yet: its first point becomes the centroid,
        # hence a distance of 0. The distance is measured against the centroid *before* the update.
        distance = 0.0 if self.n == 0 else self.distance_to_centroid(X)
        self.mean_distance = (self.n * self.mean_distance + distance) / (self.n + 1)
        self.n += 1
        # Plain additions also work when the sums still hold a scalar 0 (empty microcluster)
        self.linear_sum = self.linear_sum + X
        self.squared_sum = self.squared_sum + np.square(X).sum()
        self.update_properties()

`update_properties()`

Updates centroid and radius based on current cluster properties.

Source code in streamndr/utils/data_structure.py

def update_properties(self):
    """Recompute the centroid and, when the instances are stored, the radius and the maximum distance."""
    self.centroid = self.linear_sum / self.n

    if self.instances is None:
        return

    self.radius = self.get_radius()

    # max_distance only ever grows: keep the stored value unless a farther point exists
    farthest = np.max(self.distance_to_centroid(self.instances))
    if farthest > self.max_distance:
        self.max_distance = farthest

`ShortMemInstance`

Instance of a point associated with a timestamp. Used for the buffer memory which stores the unknown samples.

Attributes:

Name	Type	Description
`point`	`ndarray`	The coordinates of the point
`timestamp`	`int`	The timestamp the point was added/treated
`y_true`	`int`	The true value of the class

Source code in streamndr/utils/data_structure.py

class ShortMemInstance:
    """Instance of a point associated with a timestamp. Used for the buffer memory which stores the unknown samples.

    Attributes
    ----------
    point : numpy.ndarray
        The coordinates of the point
    timestamp : int
        The timestamp the point was added/treated
    y_true : int
        The true value of the class
    """
    def __init__(self, point, timestamp, y_true=None):
        self.point = point
        self.timestamp = timestamp
        self.y_true = y_true

    def __eq__(self, other):
        """Compare the coordinates of this instance with an array of coordinates.
        The timestamp and the true label are not considered.

        Parameters
        ----------
        other : numpy.ndarray
            Coordinates to compare to

        Returns
        -------
        bool
            True if all the coordinates are equal. For any other type of operand, NotImplemented is
            returned so that Python falls back to its default comparison (identity).
        """
        if isinstance(other, np.ndarray):
            return bool(np.all(self.point == other))

        # Returning NotImplemented (instead of implicitly returning None) keeps `a == a` true
        # and makes `==` always evaluate to a real boolean.
        return NotImplemented

`__eq__(other)`

Elements are equal if they have the same values for all variables. This currently does not consider the timestamp.

Parameters:

Name	Type	Description	Default
`other`	`ShortMemInstance`	Other instance to compare to	required

Returns:

Type	Description
`bool`	Whether the instances are equal or not

Source code in streamndr/utils/data_structure.py

def __eq__(self, other):
    """Compare the coordinates of this instance with an array of coordinates.
    The timestamp and the true label are not considered.

    Parameters
    ----------
    other : numpy.ndarray
        Coordinates to compare to

    Returns
    -------
    bool
        True if all the coordinates are equal. For any other type of operand, NotImplemented is
        returned so that Python falls back to its default comparison (identity).
    """
    if isinstance(other, np.ndarray):
        return bool(np.all(self.point == other))

    # Returning NotImplemented (instead of implicitly returning None) keeps `a == a` true
    # and makes `==` always evaluate to a real boolean.
    return NotImplemented

mcikmeans

MCIKMeans

fit(X, y)

fit_predict(X, y)

predict(X)

MicroCluster

__str__()

distance_to_centroid(X)

encompasses(X)

find_closest_cluster(clusters)

get_radius()

is_cohesive(clusters)

is_representative(min_examples)

small_str()

update_cluster(X, timestamp, update_summary)

update_properties()

ShortMemInstance

__eq__(other)

`MCIKMeans`

`fit(X, y)`

`fit_predict(X, y)`

`predict(X)`

`MicroCluster`

`__str__()`

`distance_to_centroid(X)`

`encompasses(X)`

`find_closest_cluster(clusters)`

`get_radius()`

`is_cohesive(clusters)`

`is_representative(min_examples)`

`small_str()`

`update_cluster(X, timestamp, update_summary)`

`update_properties()`

`ShortMemInstance`

`__eq__(other)`