Update documentation structure and enhance .gitignore
- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files.
- Updated README links to reflect the new documentation paths for better navigation.
- Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
This commit is contained in:
204
scripts/analyze-files-to-prune.py
Normal file
204
scripts/analyze-files-to-prune.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze Files for Pruning
|
||||
Identifies files that could potentially be removed from the project.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
def analyze_project():
    """Scan the current working directory for files that may be prunable.

    Walks the tree rooted at ``.``, pruning version-control, dependency and
    build-output directories, and classifies every remaining file. A file
    may land in more than one category.

    Returns:
        dict: category name -> findings.
            ``temp_files``, ``backup_files``, ``old_status_files`` and
            ``potentially_obsolete`` are lists of path strings;
            ``large_files`` is a list of ``(path, size_in_bytes)`` tuples;
            ``duplicates`` (an empty ``defaultdict(list)``) and
            ``build_artifacts`` (an empty list) are present in the result
            but are not populated by this function.
    """
    results = {
        'temp_files': [],
        'duplicates': defaultdict(list),
        'large_files': [],
        'old_status_files': [],
        'backup_files': [],
        'build_artifacts': [],
        'potentially_obsolete': [],
    }

    # Matched against the END of the file name. Substring matching would
    # misclassify names such as "page.tmpl" (.tmp), "app.logic.js" (.log)
    # or "photo.original.png" (.orig).
    temp_suffixes = ('.tmp', '.swp', '.swo', '~', '.DS_Store', '.log')
    backup_suffixes = ('.backup', '.bak', '.old', '.orig')

    # Directories that are never descended into.
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    # Keywords are compared against the upper-cased file name.
    status_keywords = ('COMPLETE', 'COMPLETION', 'FINAL_STATUS', 'ALL_STEPS_COMPLETE')
    obsolete_keywords = ('OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED')

    large_file_threshold = 5 * 1024 * 1024  # 5MB

    for root_dir, dirs, files in os.walk('.'):
        # In-place pruning stops os.walk from entering skipped directories.
        # No further per-file path check is needed; a substring test on the
        # path would wrongly drop ".github/", ".gitignore", "rebuild/", etc.
        dirs[:] = [d for d in dirs if d not in skip_dirs]

        root_path = Path(root_dir)

        for file in files:
            file_path = root_path / file
            path_str = str(file_path)

            if file.endswith(temp_suffixes):
                results['temp_files'].append(path_str)

            if file.endswith(backup_suffixes):
                results['backup_files'].append(path_str)

            # Best effort: files that vanish or cannot be stat'ed (broken
            # symlinks, permission errors) are simply not size-checked.
            try:
                size = file_path.stat().st_size
            except OSError:
                size = None
            if size is not None and size > large_file_threshold:
                results['large_files'].append((path_str, size))

            # Documentation checks apply to Markdown files whose path
            # contains "docs".
            if 'docs' in path_str and file_path.suffix == '.md':
                name_upper = file.upper()

                # Status/completion reports that live under a "status" path
                # and have not been moved to an "archive" path yet.
                if (
                    any(keyword in name_upper for keyword in status_keywords)
                    and 'archive' not in path_str
                    and 'status' in path_str
                ):
                    results['old_status_files'].append(path_str)

                # Files whose name suggests they have been superseded.
                if any(keyword in name_upper for keyword in obsolete_keywords):
                    results['potentially_obsolete'].append(path_str)

    return results
|
||||
|
||||
def find_duplicate_content():
    """Group text-like files under the current directory by identical content.

    Only files ending in ``.md``, ``.json``, ``.yaml``, ``.yml`` or ``.txt``
    are considered, and version-control, dependency and build-output
    directories are not descended into. Files are compared by the MD5 digest
    of their raw bytes.

    Returns:
        dict: MD5 hex digest -> list of path strings, containing only the
        digests shared by two or more files.
    """
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}
    candidate_suffixes = ('.md', '.json', '.yaml', '.yml', '.txt')
    chunk_size = 64 * 1024

    paths_by_hash = defaultdict(list)

    for root_dir, dirs, files in os.walk('.'):
        # In-place pruning keeps os.walk out of skipped directories. No
        # substring test on the path is applied afterwards, because it would
        # wrongly exclude ".github/", "rebuild/", "coverage-notes.md", etc.
        dirs[:] = [d for d in dirs if d not in skip_dirs]

        for file in files:
            if not file.endswith(candidate_suffixes):
                continue

            file_path = Path(root_dir) / file

            # Hash in chunks so a large file is never held in memory whole.
            hasher = hashlib.md5()
            try:
                with open(file_path, 'rb') as f:
                    for chunk in iter(lambda: f.read(chunk_size), b''):
                        hasher.update(chunk)
            except OSError:
                # Best effort: unreadable files are left out of the report.
                continue

            paths_by_hash[hasher.hexdigest()].append(str(file_path))

    # Keep only digests that are shared by at least two files.
    return {h: paths for h, paths in paths_by_hash.items() if len(paths) > 1}
|
||||
|
||||
def _print_section_header(title):
    """Print a report section title followed by a 60-character rule."""
    print(title)
    print("-" * 60)


def _print_path_section(title, paths, description, empty_message, limit=None):
    """Print one report section that lists plain file paths.

    Args:
        title: Section heading.
        paths: List of path strings found for this category.
        description: Noun phrase used in the "Found N ..." line.
        empty_message: Line printed when ``paths`` is empty.
        limit: Maximum number of paths to list; ``None`` lists all of them.
    """
    _print_section_header(title)
    if paths:
        print(f"Found {len(paths)} {description}:")
        ordered = sorted(paths)
        shown = ordered if limit is None else ordered[:limit]
        for path in shown:
            print(f" - {path}")
        if len(ordered) > len(shown):
            print(f" ... and {len(ordered) - len(shown)} more")
    else:
        print(empty_message)
    print()


def main():
    """Run the pruning analysis and print a six-section report to stdout.

    Sections 1-5 come from ``analyze_project()`` and section 6 from
    ``find_duplicate_content()``. A summary with the number of distinct
    flagged files and the number of duplicate groups is printed last.
    """
    max_large_shown = 10
    max_duplicate_groups_shown = 10

    print("="*60)
    print("FILE PRUNING ANALYSIS")
    print("="*60)
    print()

    results = analyze_project()

    _print_path_section(
        "1. TEMPORARY FILES",
        results['temp_files'],
        "temporary files",
        " No temporary files found",
        limit=20,
    )

    _print_path_section(
        "2. BACKUP FILES",
        results['backup_files'],
        "backup files",
        " No backup files found",
    )

    # Large files carry their size, so they are printed separately,
    # biggest first.
    _print_section_header("3. LARGE FILES (>5MB)")
    large_files = results['large_files']
    if large_files:
        print(f"Found {len(large_files)} large files:")
        by_size = sorted(large_files, key=lambda x: x[1], reverse=True)
        for f, size in by_size[:max_large_shown]:
            size_mb = size / (1024 * 1024)
            print(f" - {f} ({size_mb:.2f} MB)")
        # Make the truncation visible, like the other capped sections do.
        if len(large_files) > max_large_shown:
            print(f" ... and {len(large_files) - max_large_shown} more")
    else:
        print(" No unusually large files found")
    print()

    _print_path_section(
        "4. OLD STATUS/COMPLETE FILES (outside archive)",
        results['old_status_files'],
        "status files that might be archived",
        " No old status files found outside archive",
    )

    _print_path_section(
        "5. POTENTIALLY OBSOLETE FILES",
        results['potentially_obsolete'],
        "potentially obsolete files",
        " No obviously obsolete files found",
    )

    _print_section_header("6. DUPLICATE CONTENT")
    duplicates = find_duplicate_content()
    if duplicates:
        print(f"Found {len(duplicates)} groups of duplicate files:")
        groups = list(duplicates.values())[:max_duplicate_groups_shown]
        for i, files in enumerate(groups, 1):
            print(f"\n Group {i} ({len(files)} files):")
            for f in files:
                print(f" - {f}")
        if len(duplicates) > max_duplicate_groups_shown:
            print(f"\n ... and {len(duplicates) - max_duplicate_groups_shown} more duplicate groups")
    else:
        print(" No duplicate content found")
    print()

    # Summary. A file can be flagged in several categories (for example a
    # large ".log" file), so count distinct paths rather than summing the
    # category sizes.
    flagged_paths = set(results['temp_files'])
    flagged_paths.update(results['backup_files'])
    flagged_paths.update(path for path, _size in results['large_files'])
    flagged_paths.update(results['old_status_files'])
    flagged_paths.update(results['potentially_obsolete'])
    total_findings = len(flagged_paths)

    print("="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total files that could be pruned: {total_findings}")
    print(f"Duplicate file groups: {len(duplicates)}")
    print()
    print("Note: Review each category before deletion.")
    print("Archive files are intentionally kept for historical reference.")
||||
|
||||
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
|
||||
Reference in New Issue
Block a user