
Accelerating Dimensionality Reduction Beyond PCA

Slide 1: Understanding PCA Time Complexity

Principal Component Analysis (PCA) has a computational complexity of O(nm^2 + m^3), where n is the number of samples and m the number of features: O(nm^2) to form the covariance matrix and O(m^3) for its eigendecomposition. The cubic term in m becomes the bottleneck for high-dimensional data and can make PCA impractical for datasets with thousands of features.

import numpy as np
from time import time

def measure_pca_time(n_samples, n_features):
    # Generate a random data matrix
    X = np.random.randn(n_samples, n_features)
    
    start = time()
    # Center the data, then compute the covariance matrix: O(nm^2)
    X = X - X.mean(axis=0)
    cov_matrix = np.dot(X.T, X) / n_samples
    
    # Compute the eigendecomposition: O(m^3)
    eigenvals, eigenvecs = np.linalg.eigh(cov_matrix)
    
    total_time = time() - start
    print(f"Time for {n_features}D: {total_time:.2f}s")

# Compare execution times
measure_pca_time(1000, 100)   # Lower dimension
measure_pca_time(1000, 1000)  # Higher dimension

Slide 2: Random Projection Theory

Random Projection leverages the Johnson-Lindenstrauss lemma: a set of n points in high-dimensional space can be projected into a space of dimension k = O(log(n)/ε^2) while preserving all pairwise distances to within a factor of (1 ± ε). Because the projection matrix is data-independent, this provides a computationally efficient alternative to PCA.

def random_projection_matrix(original_dim, target_dim):
    """
    Create a Gaussian random projection matrix with entries
    drawn from N(0, 1/k), where k is the target dimension
    """
    # np.random.normal expects the standard deviation, hence 1/sqrt(k)
    return np.random.normal(0, 1/np.sqrt(target_dim),
                            (target_dim, original_dim))

# Mathematical representation in LaTeX (not rendered)
'''
$$
\text{Projection Matrix } R \in \mathbb{R}^{k \times d}
\text{where } R_{ij} \sim \mathcal{N}(0, \frac{1}{k})
$$
'''
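
As a quick sanity check (a minimal sketch using the function above), pairwise distances should be approximately preserved after projection:

# Project 100 points from 10,000-D down to 1,000-D and compare
# the distance between two points before and after projection
X_demo = np.random.randn(100, 10000)
R_demo = random_projection_matrix(10000, 1000)
X_proj = X_demo @ R_demo.T

d_before = np.linalg.norm(X_demo[0] - X_demo[1])
d_after = np.linalg.norm(X_proj[0] - X_proj[1])
print(f"Distance before: {d_before:.2f}, after: {d_after:.2f}")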

Slide 3: Sparse Random Projection Implementation

Sparse Random Projection improves upon standard random projection by using a sparse matrix with values {-1, 0, 1}, reducing computation time and memory usage while maintaining similar theoretical guarantees for dimensionality reduction.

import numpy as np
from scipy import sparse

def sparse_random_projection_matrix(original_dim, target_dim, density=0.1):
    # Sample nnz random positions; duplicate (row, col) pairs are
    # summed by coo_matrix, so the realized density is approximate
    nnz = int(original_dim * target_dim * density)
    rows = np.random.randint(0, target_dim, nnz)
    cols = np.random.randint(0, original_dim, nnz)
    data = np.random.choice([-1, 1], nnz)
    
    # Build the sparse matrix and apply the 1/sqrt(k * density)
    # scaling that preserves expected squared distances
    R = sparse.coo_matrix((data, (rows, cols)), 
                         shape=(target_dim, original_dim))
    return (R * (1 / np.sqrt(target_dim * density))).tocsr()
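
To see the memory benefit, here is a rough footprint comparison against a dense Gaussian matrix of the same shape (a sketch; the exact savings depend on the density parameter):

R_sparse = sparse_random_projection_matrix(5000, 500, density=0.01)
R_dense = np.random.normal(0, 1/np.sqrt(500), (500, 5000))

# CSR storage: nonzero values plus their column and row-pointer indices
sparse_bytes = (R_sparse.data.nbytes + R_sparse.indices.nbytes
                + R_sparse.indptr.nbytes)
print(f"Sparse: {sparse_bytes / 1e6:.2f} MB, Dense: {R_dense.nbytes / 1e6:.2f} MB")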

Slide 4: Comparative Analysis Framework

We'll create a framework to compare PCA and Sparse Random Projection in terms of both computation time and preservation of data relationships. This framework demonstrates the efficiency gains while verifying that distances between points are still preserved.

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from time import time

class DimensionalityReductionComparator:
    def __init__(self, n_components):
        self.n_components = n_components
        
    def compare_methods(self, X):
        X_scaled = StandardScaler().fit_transform(X)
        
        # PCA timing
        start = time()
        pca = PCA(n_components=self.n_components)
        X_pca = pca.fit_transform(X_scaled)
        pca_time = time() - start
        
        # SRP timing
        start = time()
        R = sparse_random_projection_matrix(X.shape[1], 
                                          self.n_components)
        X_srp = np.asarray(X_scaled @ R.T)  # ensure a dense ndarray result
        srp_time = time() - start
        
        return {
            'pca_time': pca_time,
            'srp_time': srp_time,
            'X_pca': X_pca,
            'X_srp': X_srp
        }

Slide 5: Distance Preservation Metrics

A crucial aspect of dimensionality reduction is preserving relative distances between points. We'll implement metrics to measure how well both PCA and Sparse Random Projection maintain these relationships.

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

def compute_distance_correlation(X_original, X_reduced):
    # Pairwise distances in the original and reduced spaces
    D_original = euclidean_distances(X_original)
    D_reduced = euclidean_distances(X_reduced)
    
    # Compare only the upper triangle: the matrices are symmetric
    # and the zero diagonal would artificially inflate the correlation
    iu = np.triu_indices_from(D_original, k=1)
    corr = np.corrcoef(D_original[iu], D_reduced[iu])[0, 1]
    return corr

Slide 6: Real-world Example: Image Data

Applying both methods to a real image dataset demonstrates the comparison framework in practice. With only 64 input features the speed gap here is modest; the advantage of Sparse Random Projection grows as dimensionality increases.

from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# Load digits dataset (8x8 images)
digits = load_digits()
X = digits.data
y = digits.target

# Compare methods
comparator = DimensionalityReductionComparator(n_components=32)
results = comparator.compare_methods(X)

print(f"PCA Time: {results['pca_time']:.3f}s")
print(f"SRP Time: {results['srp_time']:.3f}s")

# Distance preservation
pca_corr = compute_distance_correlation(X, results['X_pca'])
srp_corr = compute_distance_correlation(X, results['X_srp'])
print(f"PCA Distance Correlation: {pca_corr:.3f}")
print(f"SRP Distance Correlation: {srp_corr:.3f}")

Slide 7: High-Dimensional Text Data Processing

Natural language processing often deals with high-dimensional sparse matrices from text vectorization. Here we'll demonstrate how Sparse Random Projection efficiently handles text feature matrices while maintaining semantic relationships.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load text data
newsgroups = fetch_20newsgroups(subset='train', 
                               categories=['comp.graphics', 'sci.med'])
vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(newsgroups.data).toarray()  # densified here for simplicity; a fully sparse variant follows below

# Apply dimensionality reduction
n_components = 100
R = sparse_random_projection_matrix(X_text.shape[1], n_components)
X_reduced = np.asarray(X_text @ R.T)  # ensure a dense ndarray result

# Compare sparsity: the projected output is effectively dense,
# even though the TF-IDF input is mostly zeros
original_sparsity = np.sum(X_text == 0) / X_text.size
reduced_sparsity = np.sum(X_reduced == 0) / X_reduced.size
print(f"Original sparsity: {original_sparsity:.3f}")
print(f"Reduced sparsity: {reduced_sparsity:.3f}")

Slide 8: Clustering Quality Comparison

We'll compare clustering quality between the original high-dimensional data and its reduced representations from both PCA and Sparse Random Projection, using the silhouette score as the quality metric.

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

def compare_clustering_quality(X_original, X_pca, X_srp, n_clusters=3):
    # Fixed seed and explicit n_init for reproducible clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    
    # Cluster and evaluate original data
    labels_orig = kmeans.fit_predict(X_original)
    score_orig = silhouette_score(X_original, labels_orig)
    
    # Cluster and evaluate PCA-reduced data
    labels_pca = kmeans.fit_predict(X_pca)
    score_pca = silhouette_score(X_pca, labels_pca)
    
    # Cluster and evaluate SRP-reduced data
    labels_srp = kmeans.fit_predict(X_srp)
    score_srp = silhouette_score(X_srp, labels_srp)
    
    return {
        'original_score': score_orig,
        'pca_score': score_pca,
        'srp_score': score_srp
    }
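
Applied to the digits results from Slide 6 (a usage sketch assuming X and results from that slide are still in scope; n_clusters=10 matches the ten digit classes):

scores = compare_clustering_quality(X, results['X_pca'],
                                    results['X_srp'], n_clusters=10)
print(f"Original silhouette: {scores['original_score']:.3f}")
print(f"PCA silhouette: {scores['pca_score']:.3f}")
print(f"SRP silhouette: {scores['srp_score']:.3f}")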

Slide 9: Memory Optimization Techniques

Implementing memory-efficient versions of dimensionality reduction becomes crucial when handling extremely large datasets that don't fit in memory. Here's an implementation using batch processing.

def batch_random_projection(X, batch_size, target_dim):
    n_samples = X.shape[0]
    R = sparse_random_projection_matrix(X.shape[1], target_dim)
    X_reduced = np.zeros((n_samples, target_dim))
    
    # Process in batches
    for i in range(0, n_samples, batch_size):
        end_idx = min(i + batch_size, n_samples)
        X_batch = X[i:end_idx]
        X_reduced[i:end_idx] = X_batch @ R.T
        
    return X_reduced

# Example usage (sizes kept moderate so the demo runs comfortably;
# in practice X would be memory-mapped or loaded one batch at a time)
n_samples, n_features = 10000, 5000
X_large = np.random.randn(n_samples, n_features)
X_reduced = batch_random_projection(X_large, 
                                  batch_size=1000, 
                                  target_dim=100)

Slide 10: Error Bounds Implementation

Implementing theoretical error bounds for Sparse Random Projection helps in determining the minimum number of dimensions needed to maintain a desired level of accuracy in distance preservation.

import numpy as np
from math import log

def calculate_min_dimensions(n_samples, error_tolerance=0.1):
    """
    Minimum target dimension k guaranteed by the
    Johnson-Lindenstrauss lemma (LaTeX representation):
    $$k \geq \frac{4 \log(n)}{\epsilon^2/2 - \epsilon^3/3}$$
    """
    epsilon = error_tolerance
    denominator = epsilon**2/2 - epsilon**3/3
    k = int(np.ceil(4 * log(n_samples) / denominator))
    
    return k

# Example usage
n_samples = 10000
min_dim = calculate_min_dimensions(n_samples)
print(f"Minimum dimensions needed: {min_dim}")

Slide 11: Performance Scaling Analysis

We run a scaling analysis to visualize how PCA and Sparse Random Projection runtimes change with increasing dimensionality, providing empirical evidence for the theoretical complexity claims.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from time import time

def scaling_analysis(max_dim=5000, step=500, n_samples=1000):
    dimensions = range(step, max_dim + step, step)
    pca_times = []
    srp_times = []
    
    for dim in dimensions:
        X = np.random.randn(n_samples, dim)
        target_dim = min(dim // 2, n_samples)  # PCA needs n_components <= min(n, m)
        
        # Time PCA
        start = time()
        pca = PCA(n_components=target_dim)
        pca.fit_transform(X)
        pca_times.append(time() - start)
        
        # Time SRP
        start = time()
        R = sparse_random_projection_matrix(dim, target_dim)
        _ = X @ R.T  # discard the result; we only measure the time
        srp_times.append(time() - start)
    
    return dimensions, pca_times, srp_times
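
A minimal usage sketch that plots the timings (max_dim lowered so the demo finishes quickly; this is what matplotlib is imported for):

dims, pca_times, srp_times = scaling_analysis(max_dim=2000)
plt.plot(dims, pca_times, label='PCA')
plt.plot(dims, srp_times, label='Sparse Random Projection')
plt.xlabel('Input dimensionality')
plt.ylabel('Time (s)')
plt.legend()
plt.show()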

Slide 12: Numerical Stability Analysis

We add checks for numerical stability in high dimensions, which is particularly important when dealing with sparse matrices and potential floating-point error in large-scale computations.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def stability_analysis(X, n_trials=10):
    """
    Analyze numerical stability of random projections by measuring
    how distance preservation varies across independent draws
    """
    n_samples, n_features = X.shape
    target_dim = n_features // 2
    distance_correlations = []
    
    # Original pairwise distances (computed once, upper triangle only)
    D_original = euclidean_distances(X)
    iu = np.triu_indices_from(D_original, k=1)
    
    for _ in range(n_trials):
        # Draw a fresh random projection each trial
        R = sparse_random_projection_matrix(n_features, target_dim)
        X_reduced = np.asarray(X @ R.T)
        
        # Correlation between original and projected distances
        D_reduced = euclidean_distances(X_reduced)
        corr = np.corrcoef(D_original[iu], D_reduced[iu])[0, 1]
        distance_correlations.append(corr)
    
    return {
        'mean_correlation': np.mean(distance_correlations),
        'std_correlation': np.std(distance_correlations),
        'min_correlation': np.min(distance_correlations),
        'max_correlation': np.max(distance_correlations)
    }
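
Run on the digits data from Slide 6 (a usage sketch assuming X is still in scope), the spread of correlations across trials shows how sensitive results are to the particular random draw:

stability = stability_analysis(X, n_trials=10)
print(f"Mean correlation: {stability['mean_correlation']:.3f} "
      f"(+/- {stability['std_correlation']:.3f})")
print(f"Range: [{stability['min_correlation']:.3f}, "
      f"{stability['max_correlation']:.3f}]")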

Slide 13: Real-time Processing Implementation

We implement a streaming version of Sparse Random Projection for real-time processing of high-dimensional data streams, which is crucial for online learning applications.

import numpy as np

class StreamingRandomProjection:
    def __init__(self, original_dim, target_dim):
        self.R = sparse_random_projection_matrix(original_dim, target_dim)
        self.target_dim = target_dim
        
    def transform_stream(self, X_stream, chunk_size=100):
        """
        Process streaming data in chunks
        """
        buffer = []
        chunk = []
        
        for x in X_stream:
            chunk.append(x)
            if len(chunk) >= chunk_size:
                # Project the full chunk at once
                X_chunk = np.array(chunk)
                projected_chunk = np.asarray(X_chunk @ self.R.T)
                buffer.extend(projected_chunk)
                chunk = []
        
        # Process any remaining partial chunk
        if chunk:
            X_chunk = np.array(chunk)
            projected_chunk = np.asarray(X_chunk @ self.R.T)
            buffer.extend(projected_chunk)
            
        return np.array(buffer)

# Example usage with data stream
stream_processor = StreamingRandomProjection(1000, 100)
data_stream = (np.random.randn(1000) for _ in range(1000))
projected_stream = stream_processor.transform_stream(data_stream)

Slide 14: Additional Resources