Update documentation structure and enhance .gitignore

- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files.
- Updated README links to reflect new documentation paths for better navigation.
- Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
2025-12-12 21:18:55 -08:00
parent 664707d912
commit fe0365757a
106 changed files with 4666 additions and 2294 deletions
--- a/scripts/analyze-files-to-prune.py
+++ b/scripts/analyze-files-to-prune.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+Analyze Files for Pruning
+Identifies files that could potentially be removed from the project.
+"""
+
+import os
+import hashlib
+from pathlib import Path
+from collections import defaultdict
+from datetime import datetime
+
+def analyze_project():
+    """Scan the current working directory for files that may be prunable.
+
+    Walks the tree rooted at ``'.'`` and classifies each file by name, size
+    and location. Directories named in ``skip_dirs`` are pruned from the walk
+    at any depth, so nothing beneath them is inspected.
+
+    Returns:
+        dict: Findings keyed by category:
+
+        - ``'temp_files'``: paths whose name ends with a temporary-file suffix.
+        - ``'backup_files'``: paths whose name ends with a backup suffix.
+        - ``'large_files'``: ``(path, size_in_bytes)`` tuples for files
+          larger than 5 MB.
+        - ``'old_status_files'``: ``.md`` files with ``docs`` and ``status``
+          (but not ``archive``) in their path and a completion keyword in
+          their name.
+        - ``'potentially_obsolete'``: ``.md`` files with ``docs`` in their
+          path and an obsolescence keyword in their name.
+        - ``'duplicates'`` and ``'build_artifacts'``: present for callers
+          but never populated by this function.
+
+        All paths are strings relative to the current working directory.
+    """
+    results = {
+        'temp_files': [],
+        'duplicates': defaultdict(list),
+        'large_files': [],
+        'old_status_files': [],
+        'backup_files': [],
+        'build_artifacts': [],
+        'potentially_obsolete': []
+    }
+
+    # Matched as name suffixes (str.endswith accepts a tuple). Substring
+    # matching would flag unrelated names such as 'catalog.login.md'
+    # ('.log') or 'recipe.bakery.txt' ('.bak').
+    temp_suffixes = ('.tmp', '.swp', '.swo', '~', '.DS_Store', '.log')
+    backup_suffixes = ('.backup', '.bak', '.old', '.orig')
+
+    # Compared against the upper-cased file name.
+    status_keywords = ('COMPLETE', 'COMPLETION', 'FINAL_STATUS', 'ALL_STEPS_COMPLETE')
+    obsolete_keywords = ('OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED')
+
+    large_file_threshold = 5 * 1024 * 1024  # 5MB
+
+    # Directories to skip
+    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}
+
+    for root_dir, dirs, files in os.walk('.'):
+        # Pruning in place stops os.walk from descending into these
+        # directories. No further per-file path check is needed; a substring
+        # test on the path would wrongly drop '.github/...', '.gitignore',
+        # 'rebuild.log' and similar.
+        dirs[:] = [d for d in dirs if d not in skip_dirs]
+
+        root_path = Path(root_dir)
+
+        for file in files:
+            file_path = root_path / file
+            path_str = str(file_path)
+
+            if file.endswith(temp_suffixes):
+                results['temp_files'].append(path_str)
+
+            if file.endswith(backup_suffixes):
+                results['backup_files'].append(path_str)
+
+            # stat() can fail for broken symlinks or files that vanish or
+            # are unreadable mid-walk; such files are simply not size-checked.
+            try:
+                size = file_path.stat().st_size
+            except OSError:
+                size = None
+            if size is not None and size > large_file_threshold:
+                results['large_files'].append((path_str, size))
+
+            # Documentation checks apply only to Markdown files whose path
+            # mentions 'docs'.
+            if 'docs' in path_str and file_path.suffix == '.md':
+                file_upper = file.upper()
+
+                # Completion/status reports that have not been archived yet.
+                if any(keyword in file_upper for keyword in status_keywords):
+                    if 'archive' not in path_str and 'status' in path_str:
+                        results['old_status_files'].append(path_str)
+
+                # Files whose name suggests they have been superseded.
+                if any(keyword in file_upper for keyword in obsolete_keywords):
+                    results['potentially_obsolete'].append(path_str)
+
+    return results
+
+def find_duplicate_content():
+    """Find text-like files under the current directory with identical bytes.
+
+    Only files ending in ``.md``, ``.json``, ``.yaml``, ``.yml`` or ``.txt``
+    are considered. Directories named in ``skip_dirs`` are pruned from the
+    walk at any depth. Files that cannot be opened or read are skipped.
+
+    Returns:
+        dict: Maps an MD5 hex digest to the sorted list of path strings
+        (relative to the current working directory) sharing that content.
+        Only digests shared by two or more files are included.
+    """
+    duplicates = defaultdict(list)
+
+    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}
+    candidate_suffixes = ('.md', '.json', '.yaml', '.yml', '.txt')
+    chunk_size = 64 * 1024
+
+    for root_dir, dirs, files in os.walk('.'):
+        # In-place pruning keeps os.walk out of the skipped directories; a
+        # substring test on the file path would also drop unrelated files
+        # such as '.github/ci.yml' or 'rebuild-notes.md'.
+        dirs[:] = [d for d in dirs if d not in skip_dirs]
+
+        for file in files:
+            if not file.endswith(candidate_suffixes):
+                continue
+
+            file_path = Path(root_dir) / file
+
+            # Hash in fixed-size chunks so large files are never held in
+            # memory in full. MD5 is used only as a content fingerprint.
+            digest = hashlib.md5()
+            try:
+                with open(file_path, 'rb') as handle:
+                    for chunk in iter(lambda: handle.read(chunk_size), b''):
+                        digest.update(chunk)
+            except OSError:
+                # Unreadable or vanished file: leave it out of the comparison.
+                continue
+
+            duplicates[digest.hexdigest()].append(str(file_path))
+
+    # Keep only real duplicate groups (2+ files); sort each group so the
+    # report does not depend on directory listing order.
+    return {h: sorted(paths) for h, paths in duplicates.items() if len(paths) > 1}
+
+def main():
+    """Run the pruning analysis and print a sectioned report to stdout.
+
+    Prints one section per finding category from ``analyze_project()``, then
+    the duplicate-content groups from ``find_duplicate_content()``, and
+    finally a summary with the total number of findings.
+    """
+    banner = "=" * 60
+    rule = "-" * 60
+
+    print(banner)
+    print("FILE PRUNING ANALYSIS")
+    print(banner)
+    print()
+
+    results = analyze_project()
+
+    # Section 1: temporary files (listing capped at the first 20, sorted).
+    temp_files = results['temp_files']
+    print("1. TEMPORARY FILES")
+    print(rule)
+    if not temp_files:
+        print("  No temporary files found")
+    else:
+        print(f"Found {len(temp_files)} temporary files:")
+        for path in sorted(temp_files)[:20]:
+            print(f"  - {path}")
+        remaining = len(temp_files) - 20
+        if remaining > 0:
+            print(f"  ... and {remaining} more")
+    print()
+
+    # Section 2: backup files (full listing).
+    backup_files = results['backup_files']
+    print("2. BACKUP FILES")
+    print(rule)
+    if not backup_files:
+        print("  No backup files found")
+    else:
+        print(f"Found {len(backup_files)} backup files:")
+        for path in sorted(backup_files):
+            print(f"  - {path}")
+    print()
+
+    # Section 3: large files, biggest first, top 10 only.
+    large_files = results['large_files']
+    print("3. LARGE FILES (>5MB)")
+    print(rule)
+    if not large_files:
+        print("  No unusually large files found")
+    else:
+        print(f"Found {len(large_files)} large files:")
+        by_size_desc = sorted(large_files, key=lambda entry: entry[1], reverse=True)
+        for path, size in by_size_desc[:10]:
+            size_mb = size / (1024 * 1024)
+            print(f"  - {path} ({size_mb:.2f} MB)")
+    print()
+
+    # Section 4: status/completion documents not yet archived.
+    status_files = results['old_status_files']
+    print("4. OLD STATUS/COMPLETE FILES (outside archive)")
+    print(rule)
+    if not status_files:
+        print("  No old status files found outside archive")
+    else:
+        print(f"Found {len(status_files)} status files that might be archived:")
+        for path in sorted(status_files):
+            print(f"  - {path}")
+    print()
+
+    # Section 5: documents whose name suggests they are obsolete.
+    obsolete_files = results['potentially_obsolete']
+    print("5. POTENTIALLY OBSOLETE FILES")
+    print(rule)
+    if not obsolete_files:
+        print("  No obviously obsolete files found")
+    else:
+        print(f"Found {len(obsolete_files)} potentially obsolete files:")
+        for path in sorted(obsolete_files):
+            print(f"  - {path}")
+    print()
+
+    # Section 6: groups of files with identical content (first 10 groups).
+    print("6. DUPLICATE CONTENT")
+    print(rule)
+    duplicates = find_duplicate_content()
+    if not duplicates:
+        print("  No duplicate content found")
+    else:
+        print(f"Found {len(duplicates)} groups of duplicate files:")
+        first_groups = list(duplicates.values())[:10]
+        for number, group in enumerate(first_groups, 1):
+            print(f"\n  Group {number} ({len(group)} files):")
+            for path in group:
+                print(f"    - {path}")
+        extra_groups = len(duplicates) - 10
+        if extra_groups > 0:
+            print(f"\n  ... and {extra_groups} more duplicate groups")
+    print()
+
+    # Summary: duplicates are reported separately and are not part of the total.
+    counted_categories = (
+        'temp_files',
+        'backup_files',
+        'large_files',
+        'old_status_files',
+        'potentially_obsolete',
+    )
+    total_findings = sum(len(results[category]) for category in counted_categories)
+
+    print(banner)
+    print("SUMMARY")
+    print(banner)
+    print(f"Total files that could be pruned: {total_findings}")
+    print(f"Duplicate file groups: {len(duplicates)}")
+    print()
+    print("Note: Review each category before deletion.")
+    print("Archive files are intentionally kept for historical reference.")
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
+