better naming and MM

2024-12-16 17:56:53 +01:00
parent ddd2e91328
commit 9b61471c94
4 changed files with 119 additions and 105 deletions
--- a/backend/src/utils/cluster_manager.py
+++ b/backend/src/utils/cluster_manager.py
@@ -0,0 +1,282 @@
+import logging
+from typing import Literal
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+from pydantic import BaseModel
+from OSMPythonTools.overpass import Overpass, overpassQueryBuilder
+from OSMPythonTools.cachingStrategy import CachingStrategy, JSON
+
+from ..structs.landmark import Landmark
+from ..utils.get_time_separation import get_distance
+from ..constants import OSM_CACHE_DIR
+
+
+class Cluster(BaseModel):
+    """
+    A class representing an interesting area for shopping or sightseeing.
+
+    For now only general areas are produced; representing a specific route
+    (a street with a start and an end point) is planned but not implemented,
+    which is why the `start` and `end` fields below are still commented out.
+
+    Attributes:
+        type :       either a 'street' or 'area' (representing a denser field of points).
+        importance : score of the cluster, derived from the number of points it contains.
+        centroid :   center of the cluster, as a (lat, lon) pair.
+    """
+    type: Literal['street', 'area']
+    importance: int
+    centroid: tuple
+    # start: Optional[list] = None      # for later use if we want to have streets as well
+    # end: Optional[list] = None
+
+
+class ClusterManager:
+    """
+    Detect dense groups of OSM elements and expose them as landmarks.
+
+    On construction the manager queries Overpass for shops ('shopping') or
+    historic buildings ('sightseeing') inside the requested area, runs DBSCAN
+    on their coordinates and keeps only the 5 largest clusters.
+    `generate_clusters()` then turns every remaining cluster into a `Landmark`.
+    """
+
+    logger = logging.getLogger(__name__)
+
+    # NOTE: all points are in (lat, lon) format
+    valid: bool                     # True only if at least one cluster was found
+    all_points: np.ndarray          # every usable point returned by the query, one (lat, lon) row each
+    cluster_points: np.ndarray      # rows of all_points that belong to a retained cluster
+    cluster_labels: np.ndarray      # DBSCAN label of each row in cluster_points
+    cluster_type: Literal['sightseeing', 'shopping']
+
+    def __init__(self, bbox: tuple, cluster_type: Literal['sightseeing', 'shopping']) -> None:
+        """
+        Build the point cloud for the requested cluster type and detect clusters in it.
+
+        The points are bag/clothes shops and boutiques for 'shopping', and
+        historic buildings for 'sightseeing'. If any points are found, DBSCAN is
+        applied with parameters that depend on the cluster type and on the
+        number of points. Noise points are dropped and `filter_clusters()` keeps
+        only the largest clusters.
+
+        After initialization:
+            - `self.valid` is True only if at least one cluster was found.
+            - `self.all_points` holds every point returned by the query.
+            - `self.cluster_points` holds the points belonging to clusters.
+            - `self.cluster_labels` holds the labels of those points.
+
+        If the query fails, returns nothing, or yields only noise, the manager
+        is left in an empty state with `self.valid` set to False.
+
+        Args:
+            bbox: The bounding box coordinates (around:radius, center_lat, center_lon).
+            cluster_type: Either 'shopping' or 'sightseeing'.
+        """
+
+        # Initialize overpass and cache
+        self.overpass = Overpass()
+        CachingStrategy.use(JSON, cacheDir=OSM_CACHE_DIR)
+
+        self.cluster_type = cluster_type
+
+        # Start from an empty, invalid state so that every attribute exists
+        # whichever way the initialization ends.
+        self.valid = False
+        self.all_points = np.empty((0, 2))
+        self.cluster_points = np.empty((0, 2))
+        self.cluster_labels = np.empty((0,), dtype=int)
+
+        if cluster_type == 'shopping':
+            elem_type = ['node']
+            sel = ['"shop"~"^(bag|boutique|clothes)$"']
+            out = 'skel'
+        else:
+            elem_type = ['way']
+            sel = ['"historic"="building"']
+            out = 'center'
+
+        # Initialize the points for cluster detection
+        query = overpassQueryBuilder(
+            bbox=bbox,
+            elementType=elem_type,
+            selector=sel,
+            includeCenter=True,
+            out=out
+        )
+
+        try:
+            result = self.overpass.query(query)
+        except Exception as e:
+            # Without a result there is nothing to cluster: stay invalid instead
+            # of falling through to code that needs `result`.
+            self.logger.error("Error fetching landmarks: %s", e)
+            return
+
+        # Elements without any usable coordinates are skipped, they would
+        # otherwise put None values into the array handed to DBSCAN.
+        points = []
+        for elem in result.elements():
+            coords = self._coordinates(elem)
+            if coords is not None:
+                points.append(coords)
+
+        if not points:
+            return
+
+        self.all_points = np.array(points)
+
+        # Apply DBSCAN to find clusters. Choose different settings for different cities.
+        # eps is compared against distances between raw (lat, lon) rows.
+        if self.cluster_type == 'shopping' and len(self.all_points) > 200:
+            dbscan = DBSCAN(eps=0.00118, min_samples=15, algorithm='kd_tree')  # for large cities
+        elif self.cluster_type == 'sightseeing':
+            dbscan = DBSCAN(eps=0.0025, min_samples=15, algorithm='kd_tree')  # for historic neighborhoods
+        else:
+            dbscan = DBSCAN(eps=0.00075, min_samples=10, algorithm='kd_tree')  # for small cities
+
+        labels = dbscan.fit_predict(self.all_points)
+
+        # Separate clustered points from noise points (label -1)
+        clustered = labels != -1
+        self.cluster_points = self.all_points[clustered]
+        self.cluster_labels = labels[clustered]
+
+        # Every point was classified as noise: no cluster to report.
+        if len(self.cluster_labels) == 0:
+            return
+
+        self.valid = True
+
+        # filter the clusters to keep only the largest ones
+        self.filter_clusters()
+
+
+    @staticmethod
+    def _coordinates(elem):
+        """Return the (lat, lon) of an OSM element, falling back to its center, or None if it has neither."""
+        lat, lon = elem.lat(), elem.lon()
+        if lat is None or lon is None:
+            lat, lon = elem.centerLat(), elem.centerLon()
+        if lat is None or lon is None:
+            return None
+        return (lat, lon)
+
+
+    def generate_clusters(self) -> list[Landmark]:
+        """
+        Generate a list of landmarks based on the identified clusters.
+
+        For every cluster the centroid is computed as the mean of its points and
+        an importance is derived from its size: twice the number of points for
+        'shopping', four times for 'sightseeing'. Each cluster is described by a
+        `Cluster` of type 'area' and converted with `create_landmark()`.
+
+        Returns:
+            The landmarks, one per cluster. Empty if the manager is not valid.
+        """
+
+        if not self.valid:
+            return []       # Return empty list if no clusters were found
+
+        weight = 2 if self.cluster_type == 'shopping' else 4
+        cluster_landmarks = []
+
+        # loop through the different clusters
+        for label in set(self.cluster_labels):
+
+            # Extract points belonging to the current cluster
+            current_cluster = self.cluster_points[self.cluster_labels == label]
+
+            # Calculate the centroid as the mean of the points. It is handed over
+            # as a plain tuple of floats, matching the `tuple` field of Cluster.
+            centroid = tuple(float(c) for c in np.mean(current_cluster, axis=0))
+
+            cluster = Cluster(
+                type='area',
+                centroid=centroid,
+                importance=len(current_cluster) * weight
+            )
+
+            # Transform the location into a landmark
+            cluster_landmarks.append(self.create_landmark(cluster))
+
+        return cluster_landmarks
+
+
+    def create_landmark(self, cluster: Cluster) -> Landmark:
+        """
+        Create a Landmark object based on the given cluster.
+
+        This method queries the Overpass API for neighborhoods (and, for
+        'shopping', also shopping malls) within a 1000m radius around the
+        cluster centroid. The closest named result provides the name, English
+        name, OSM type and OSM id of the landmark. If nothing suitable is found,
+        the landmark gets a generic name ('Shopping Area' or 'Neighborhood'),
+        OSM id 0 and OSM type 'node'.
+
+        Args:
+            cluster: A Cluster object containing the centroid and importance of the area.
+
+        Returns:
+            Landmark: A Landmark located at the cluster centroid, with the cluster
+            importance as attractiveness.
+        """
+
+        # Define the bounding box for a given radius around the coordinates
+        lat, lon = cluster.centroid
+        bbox = ("around:1000", str(lat), str(lon))
+
+        # Query neighborhoods and shopping malls
+        selectors = ['"place"~"^(suburb|neighborhood|neighbourhood|quarter|city_block)$"']
+
+        if self.cluster_type == 'shopping':
+            selectors.append('"shop"="mall"')
+            new_name = 'Shopping Area'
+            t = 40
+        else:
+            new_name = 'Neighborhood'
+            t = 15
+
+        min_dist = float('inf')
+        new_name_en = None
+        osm_id = 0
+        osm_type = 'node'
+
+        for sel in selectors:
+            query = overpassQueryBuilder(
+                bbox=bbox,
+                elementType=['node', 'way', 'relation'],
+                selector=sel,
+                includeCenter=True,
+                out='center'
+            )
+
+            try:
+                result = self.overpass.query(query)
+            except Exception as e:
+                self.logger.error("Error fetching landmarks: %s", e)
+                continue
+
+            for elem in result.elements():
+                location = self._coordinates(elem)
+                if location is None:
+                    continue
+
+                # An element without a name cannot name the landmark; skipping it
+                # keeps the generic default instead of replacing it with None.
+                name = elem.tag('name')
+                if name is None:
+                    continue
+
+                d = get_distance(cluster.centroid, location)
+                if d < min_dist:
+                    min_dist = d
+                    new_name = name
+                    osm_type = elem.type()      # Add type: 'way' or 'relation'
+                    osm_id = elem.id()          # Add OSM id
+                    new_name_en = elem.tag('name:en')   # English name if it exists
+
+        return Landmark(
+            name=new_name,
+            type=self.cluster_type,
+            location=cluster.centroid,              # TODO: use the fact the we can also recognize streets.
+            attractiveness=cluster.importance,
+            n_tags=0,
+            osm_id=osm_id,
+            osm_type=osm_type,
+            name_en=new_name_en,
+            duration=t
+        )
+
+
+    def filter_clusters(self):
+        """
+        Filter clusters to retain only the 5 largest clusters by point count.
+
+        This method calculates the size of each cluster and drops all but the
+        5 largest ones. It then updates `self.cluster_points` and
+        `self.cluster_labels` to reflect only those clusters. When there are no
+        clustered points at all, both are left untouched.
+        """
+        # Nothing to filter; np.vstack below would fail on an empty list.
+        if len(self.cluster_labels) == 0:
+            return
+
+        label_counts = np.bincount(self.cluster_labels)
+
+        # Get the labels of the 5 largest clusters (argsort is ascending)
+        top_5_labels = np.argsort(label_counts)[-5:]
+
+        # Keep only the points in the top 5 clusters
+        filtered_cluster_points = []
+        filtered_cluster_labels = []
+
+        for label in top_5_labels:
+            count = label_counts[label]
+            if count == 0:
+                continue        # label value not used by any point
+            filtered_cluster_points.append(self.cluster_points[self.cluster_labels == label])
+            filtered_cluster_labels.append(np.full((count,), label))  # Replicate the label
+
+        # update the cluster points and labels with the filtered data
+        self.cluster_points = np.vstack(filtered_cluster_points)
+        self.cluster_labels = np.concatenate(filtered_cluster_labels)
+