KarelWintersky revised this Gist.
1 file changed, 150 insertions
zfs_recordsize_advisor6.py (file created)
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
#
# Recursively scans all files under the given path
# Groups files by the nearest power of two of their size
# Analyzes the file size distribution
# Recommends an optimal recordsize based on the most common file size
# The recommendation assumes that future files will be of a similar size
# HOWEVER:
# For mixed workloads (many small files plus a few large ones) it is usually better
# to keep the default value of 128K
#
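# Usage (the example path is illustrative; the positional argument is defined via argparse below):
#   python3 zfs_recordsize_advisor6.py /path/to/scan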
import argparse
import math
import os
import sys
import time
from collections import defaultdict

def format_size(size_bytes):
    """Formats a size in human-readable form (without a fractional part)"""
    if size_bytes == 0:
        return "0B"

    units = ["B", "KB", "MB", "GB", "TB"]
    unit_index = 0

    while size_bytes >= 1024 and unit_index < len(units) - 1:
        size_bytes /= 1024
        unit_index += 1

    return f"{int(size_bytes)}{units[unit_index]}"

def get_upper_power_of_two(size):
    """Returns the nearest power of two greater than or equal to the given size"""
    if size == 0:
        return 0
    return 2 ** math.ceil(math.log2(size))

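# Illustrative examples for get_upper_power_of_two (assumed file sizes, not from the gist):
#   a 5,000-byte file falls into the 8192 (8K) bin,
#   a 100 KiB (102,400-byte) file falls into the 131072 (128K) bin.
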
def scan_directory(path):
    """Walks `path` recursively and bins regular files by the next power of two of their size"""
    size_bins = defaultdict(int)
    total_files = 0
    total_size = 0
    last_update_time = 0
    start_time = time.time()

    for root, _, files in os.walk(path):
        for i, file in enumerate(files, 1):
            try:
                filepath = os.path.join(root, file)

                # Print progress every 0.1 seconds
                current_time = time.time()
                if current_time - last_update_time > 0.1 or i == len(files):
                    print(f"\rScanning: {filepath[:80]}... Files: {total_files} ", end="")
                    sys.stdout.flush()
                    last_update_time = current_time

                if not os.path.islink(filepath):
                    size = os.path.getsize(filepath)
                    bin_key = get_upper_power_of_two(size)
                    size_bins[bin_key] += 1
                    total_files += 1
                    total_size += size
            except (PermissionError, OSError):
                continue

    print("\nScan completed!")
    elapsed_time = time.time() - start_time
    print(f"Scanned {total_files} files in {elapsed_time:.1f} seconds")
    return size_bins, total_files, total_size

def recommend_recordsize(size_bins, total_files):
    """Picks the smallest valid ZFS recordsize that covers the most common size bin"""
    if not size_bins:
        return "128K"  # default value

    # Valid power-of-two recordsize values. OpenZFS caps recordsize at 16M,
    # and values above 128K require the large_blocks pool feature.
    recordsize_options = {
        512: "512B",
        1024: "1K",
        2048: "2K",
        4096: "4K",
        8192: "8K",
        16384: "16K",
        32768: "32K",
        65536: "64K",
        131072: "128K",
        262144: "256K",
        524288: "512K",
        1048576: "1M",
        2097152: "2M",
        4194304: "4M",
        8388608: "8M",
        16777216: "16M"
    }

    most_common_size = max(size_bins.items(), key=lambda x: x[1])[0]

    # Pick the nearest recordsize that is not smaller than most_common_size
    recommended = None
    for size in sorted(recordsize_options.keys()):
        if size >= most_common_size:
            recommended = size
            break

    # If no suitable size was found (very large files), use the maximum
    if recommended is None:
        recommended = max(recordsize_options.keys())

    return recordsize_options.get(recommended, "128K")

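# Illustrative resolution example for recommend_recordsize (hypothetical counts, not measured data):
#   size_bins = {4096: 10, 16384: 500, 1048576: 3}  ->  most common bin is 16384  ->  returns "16K"
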
def print_distribution(size_bins, total_files):
    """Prints the per-bin file counts and percentages"""
    print("\nFile size distribution (next power-of-two bins):")
    print("{:<20} {:<15} {}".format("Filesize up to", "Files count", "Percentage"))
    print("-" * 45)

    for size in sorted(size_bins.keys()):
        count = size_bins[size]
        percent = (count / total_files) * 100
        size_str = format_size(size)
        print("{:<20} {:<15} {:.1f}%".format(size_str, count, percent))

def main():
    parser = argparse.ArgumentParser(description="ZFS recordsize recommendation tool")
    parser.add_argument("path", help="Path to the directory to scan")
    args = parser.parse_args()

    print(f"Starting scan of {args.path}...\n")
    size_bins, total_files, total_size = scan_directory(args.path)

    if total_files == 0:
        print("No files found or permission denied.")
        return

    print_distribution(size_bins, total_files)

    recommended = recommend_recordsize(size_bins, total_files)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Total data size: {format_size(total_size)}")
    print(f"\nRecommended ZFS recordsize: {recommended}")

    if recommended == "128K":
        print("Note: This is the default ZFS recordsize which works well for most mixed workloads.")
    elif recommended.endswith(("B", "K")):
        print("Note: Small recordsize is good for many small files but may reduce throughput for large files.")
    elif recommended.endswith("M"):
        print("Note: Large recordsize improves throughput for large files but may waste space for small files.")

if __name__ == "__main__":
    main()

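The script only prints a recommendation; applying it is left to the operator. Below is a minimal sketch of wiring the result into `zfs set`, assuming the gist file is importable as a module, the `zfs` CLI is on PATH, and a hypothetical dataset named tank/data; note that recordsize only affects blocks written after the property is changed.

#!/usr/bin/env python3
# Minimal sketch, not part of the gist: applies the recommendation via the zfs CLI.
# Assumptions: the gist file is importable as zfs_recordsize_advisor6, the zfs
# command is on PATH, and "tank/data" is a hypothetical dataset name.
import subprocess

from zfs_recordsize_advisor6 import recommend_recordsize, scan_directory


def apply_recommendation(path, dataset):
    """Scan `path` and set the recommended recordsize on `dataset`."""
    size_bins, total_files, _ = scan_directory(path)
    recommended = recommend_recordsize(size_bins, total_files)
    # recordsize only affects blocks written after the property is changed
    subprocess.run(["zfs", "set", f"recordsize={recommended}", dataset], check=True)


if __name__ == "__main__":
    apply_recommendation("/tank/data", "tank/data")  # hypothetical mountpoint and dataset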