# anyway/backend/src/utils/cluster_manager.py

import logging
from typing import Literal
import numpy as np
from sklearn.cluster import DBSCAN
from pydantic import BaseModel
from OSMPythonTools.overpass import Overpass, overpassQueryBuilder
from OSMPythonTools.cachingStrategy import CachingStrategy, JSON
from ..structs.landmark import Landmark
from ..utils.get_time_separation import get_distance
from ..constants import OSM_CACHE_DIR


class Cluster(BaseModel):
    """
    A class representing an interesting area for shopping or sightseeing.

    It can represent either a general area or a specific route with start and end points.
    The importance represents the number of shops found in this cluster.

    Attributes:
        type : either a 'street' or an 'area' (representing a denser field of shops).
        importance : size of the cluster (number of points).
        centroid : center of the cluster.
        start : if the type is a street it goes from here...
        end : ...to here
    """
    type: Literal['street', 'area']
    importance: int
    centroid: tuple

    # start: Optional[list] = None  # for later use if we want to have streets as well
    # end: Optional[list] = None
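
# A minimal sketch of what a Cluster instance looks like in practice; the
# values below are illustrative assumptions, not data from a real query:
#   Cluster(type='area', importance=24, centroid=(48.8566, 2.3522))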


class ClusterManager:

    logger = logging.getLogger(__name__)

    # NOTE: all points are in (lat, lon) format
    valid: bool     # Ensure the manager is valid (i.e. there are some clusters to be found)
    all_points: list
    cluster_points: list
    cluster_labels: list
    cluster_type: Literal['sightseeing', 'shopping']

    def __init__(self, bbox: tuple, cluster_type: Literal['sightseeing', 'shopping']) -> None:
        """
        Upon initialization, generate the point cloud used for cluster detection.
        The points represent bag/clothes shops and general boutiques.

        If the first step is successful, it applies the DBSCAN clustering algorithm with different
        parameters depending on the size of the city (number of points).
        It filters out noise points and keeps only the largest clusters.

        A successful initialization updates:
            - `self.cluster_points`: The points belonging to clusters.
            - `self.cluster_labels`: The labels for the points in clusters.

        The method also calls `filter_clusters()` to retain only the largest clusters.

        Args:
            bbox: The bounding box coordinates (around:radius, center_lat, center_lon).
        """
        # Initialize overpass and cache
        self.overpass = Overpass()
        CachingStrategy.use(JSON, cacheDir=OSM_CACHE_DIR)

        self.cluster_type = cluster_type
        if cluster_type == 'shopping':
            elem_type = ['node']
            sel = ['"shop"~"^(bag|boutique|clothes)$"']
            out = 'skel'
        else:
            elem_type = ['way']
            sel = ['"historic"="building"']
            out = 'center'

        # Initialize the points for cluster detection
        query = overpassQueryBuilder(
            bbox = bbox,
            elementType = elem_type,
            selector = sel,
            includeCenter = True,
            out = out
        )
        try:
            result = self.overpass.query(query)
        except Exception as e:
            self.logger.error(f"Error fetching landmarks: {e}")
            # Without a query result there is nothing to cluster; mark the manager invalid.
            self.valid = False
            return

        if len(result.elements()) == 0:
            self.valid = False

        else:
            points = []
            for elem in result.elements():
                coords = tuple((elem.lat(), elem.lon()))
                if coords[0] is None:
                    # Ways have no lat/lon of their own; fall back to their center.
                    coords = tuple((elem.centerLat(), elem.centerLon()))
                points.append(coords)

            self.all_points = np.array(points)
            self.valid = True
            # Apply DBSCAN to find clusters. Choose different settings for different cities.
            # NOTE: eps is in degrees; 0.001° of latitude is roughly 110 m.
            if self.cluster_type == 'shopping' and len(self.all_points) > 200:
                dbscan = DBSCAN(eps=0.00118, min_samples=15, algorithm='kd_tree')  # for large cities
            elif self.cluster_type == 'sightseeing':
                dbscan = DBSCAN(eps=0.0025, min_samples=15, algorithm='kd_tree')   # for historic neighborhoods
            else:
                dbscan = DBSCAN(eps=0.00075, min_samples=10, algorithm='kd_tree')  # for small cities

            labels = dbscan.fit_predict(self.all_points)

            # Separate clustered points (label >= 0) from noise points (label == -1)
            self.cluster_points = self.all_points[labels != -1]
            self.cluster_labels = labels[labels != -1]

            # filter the clusters to keep only the largest ones
            self.filter_clusters()
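
    # A minimal, self-contained illustration (toy data, not part of the pipeline)
    # of the DBSCAN output consumed above: non-negative labels index clusters,
    # -1 marks noise points.
    #
    #   >>> import numpy as np
    #   >>> from sklearn.cluster import DBSCAN
    #   >>> pts = np.array([(0.0, 0.0), (0.0, 0.001), (1.0, 1.0)])
    #   >>> DBSCAN(eps=0.002, min_samples=2).fit_predict(pts)
    #   array([ 0,  0, -1])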

    def generate_clusters(self) -> list[Landmark]:
        """
        Generate a list of landmarks based on identified clusters.

        This method iterates over the different clusters, calculates the centroid
        (as the mean of the points within each cluster), and assigns an importance
        based on the size of the cluster.

        The locations are collected as `Cluster` objects, each with:
            - `type`: Set to 'area'.
            - `centroid`: The calculated centroid of the cluster.
            - `importance`: The number of points in the cluster, weighted by cluster type.

        Returns:
            list[Landmark]: One landmark per cluster, built by `create_landmark()`.
        """
        if not self.valid:
            return []  # Return an empty list if no clusters were found
        locations = []

        # Loop through the different clusters
        for label in set(self.cluster_labels):

            # Extract points belonging to the current cluster
            current_cluster = self.cluster_points[self.cluster_labels == label]

            # Calculate the centroid as the mean of the points
            centroid = np.mean(current_cluster, axis=0)

            # Weight the score by cluster type: sightseeing clusters count more per point.
            if self.cluster_type == 'shopping':
                score = len(current_cluster)*2
            else:
                score = len(current_cluster)*8

            locations.append(Cluster(
                type='area',
                centroid=centroid,
                importance=score
            ))

        # Transform the locations into landmarks and return the list
        cluster_landmarks = []
        for cluster in locations:
            cluster_landmarks.append(self.create_landmark(cluster))

        return cluster_landmarks

    def create_landmark(self, cluster: Cluster) -> Landmark:
        """
        Create a Landmark object based on the given cluster.

        This method queries the Overpass API for nearby neighborhoods and shopping malls
        within a 1000 m radius around the cluster centroid. It selects the closest
        result and creates a landmark with the associated details such as name, type, and OSM ID.

        Parameters:
            cluster (Cluster): A Cluster object containing
                the centroid and importance of the area.

        Returns:
            Landmark: A Landmark object containing details such as the name, type,
                location, attractiveness, and OSM details.
        """
        # Define the bounding box for a given radius around the coordinates
        lat, lon = cluster.centroid
        bbox = ("around:1000", str(lat), str(lon))
        # Query neighborhoods and shopping malls
        selectors = ['"place"~"^(suburb|neighborhood|neighbourhood|quarter|city_block)$"']

        if self.cluster_type == 'shopping':
            selectors.append('"shop"="mall"')
            new_name = 'Shopping Area'
            t = 40
        else:
            new_name = 'Neighborhood'
            t = 15

        # Fallback values in case no named element is found nearby
        min_dist = float('inf')
        new_name_en = None
        osm_id = 0
        osm_type = 'node'
        for sel in selectors:
            query = overpassQueryBuilder(
                bbox = bbox,
                elementType = ['node', 'way', 'relation'],
                selector = sel,
                includeCenter = True,
                out = 'center'
            )

            try:
                result = self.overpass.query(query)
            except Exception as e:
                self.logger.error(f"Error fetching landmarks: {e}")
                continue

            for elem in result.elements():

                location = (elem.centerLat(), elem.centerLon())

                # Skip if the element has no name; fall back to lat/lon when no center exists
                if elem.tag('name') is None:
                    continue
                if location[0] is None:
                    location = (elem.lat(), elem.lon())
                if location[0] is None:
                    continue

                d = get_distance(cluster.centroid, location)
                if d < min_dist:
                    min_dist = d
                    new_name = elem.tag('name')
                    osm_type = elem.type()  # Add type: 'node', 'way' or 'relation'
                    osm_id = elem.id()      # Add OSM id

                    # Add english name if it exists
                    try:
                        new_name_en = elem.tag('name:en')
                    except Exception:
                        pass
        return Landmark(
            name=new_name,
            type=self.cluster_type,
            location=cluster.centroid,  # TODO: use the fact that we can also recognize streets.
            attractiveness=cluster.importance,
            n_tags=0,
            osm_id=osm_id,
            osm_type=osm_type,
            name_en=new_name_en,
            duration=t
        )

    def filter_clusters(self):
        """
        Filter clusters to retain only the 5 largest clusters by point count.

        This method calculates the size of each cluster and filters out all but the
        5 largest clusters. It then updates the cluster points and labels to reflect
        only those from the top 5 clusters.
        """
        if len(self.cluster_labels) == 0:
            # Every point was classified as noise; np.vstack on an empty list
            # would raise a ValueError below, so mark the manager invalid instead.
            self.valid = False
            return

        label_counts = np.bincount(self.cluster_labels)

        # Get the labels of the 5 largest clusters
        top_5_labels = np.argsort(label_counts)[-5:]

        # Filter points to keep only those in the top 5 clusters
        filtered_cluster_points = []
        filtered_cluster_labels = []
        for label in top_5_labels:
            filtered_cluster_points.append(self.cluster_points[self.cluster_labels == label])
            filtered_cluster_labels.append(np.full((label_counts[label],), label))  # Replicate the label

        # Update the cluster points and labels with the filtered data
        self.cluster_points = np.vstack(filtered_cluster_points)
        self.cluster_labels = np.concatenate(filtered_cluster_labels)
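

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the module's public API. The bbox follows
# the (around:radius, center_lat, center_lon) format documented in __init__;
# the coordinates below (central Paris) are an illustrative assumption.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Hypothetical point cloud: a 2 km radius around an assumed city center
    bbox = ("around:2000", "48.8566", "2.3522")

    manager = ClusterManager(bbox, cluster_type='shopping')
    if manager.valid:
        for landmark in manager.generate_clusters():
            print(landmark.name, landmark.location, landmark.attractiveness)
    else:
        print("No clusters found in the given bounding box.")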