ZFS Recordsize Advisor

*zfs_recordsize_advisor6.py · 5.5 KiB · Python Raw

#!/usr/bin/env python3 # # Рекурсивно сканирует все файлы в указанном разделе # Группирует файлы по ближайшей степени двойки их размера # Анализирует распределение размеров файлов # Рекомендует оптимальный recordsize на основе наиболее распространённого размера файлов # Рекомендация основана на предположении, что будущие файлы будут похожего размера # ТЕМ НЕ МЕНЕЕ: # Для смешанных рабочих нагрузок (много маленьких и несколько больших файлов) обычно лучше # оставить значение по умолчанию 128K # import os import argparse from collections import defaultdict import math import sys import time def format_size(size_bytes): """Форматирует размер в удобочитаемом виде (без дробной части)""" if size_bytes == 0: return "0B" units = ["B", "KB", "MB", "GB", "TB"] unit_index = 0 while size_bytes >= 1024 and unit_index < len(units) - 1: size_bytes /= 1024 unit_index += 1 return f"{int(size_bytes)}{units[unit_index]}" def get_upper_power_of_two(size): """Возвращает ближайшую степень двойки вверх для заданного размера""" if size == 0: return 0 return 2 ** math.ceil(math.log2(size)) def scan_directory(path): size_bins = defaultdict(int) total_files = 0 total_size = 0 last_update_time = 0 start_time = time.time() for root, _, files in os.walk(path): for i, file in enumerate(files, 1): try: filepath = os.path.join(root, file) # Выводим прогресс каждые 0.1 секунды current_time = time.time() if current_time - last_update_time > 0.1 or i == len(files): print(f"\rScanning: {filepath[:80]}... 
Files: {total_files} ", end="") sys.stdout.flush() last_update_time = current_time if not os.path.islink(filepath): size = os.path.getsize(filepath) bin_key = get_upper_power_of_two(size) size_bins[bin_key] += 1 total_files += 1 total_size += size except (PermissionError, OSError): continue print("\nScan completed!") elapsed_time = time.time() - start_time print(f"Scanned {total_files} files in {elapsed_time:.1f} seconds") return size_bins, total_files, total_size def recommend_recordsize(size_bins, total_files): if not size_bins: return "128K" # default value recordsize_options = { 512: "512B", 1024: "1K", 2048: "2K", 4096: "4K", 8192: "8K", 16384: "16K", 32768: "32K", 65536: "64K", 131072: "128K", 262144: "256K", 524288: "512K", 1048576: "1M", 2097152: "2M", 4194304: "4M", 8388608: "8M", 16777216: "16M", 33554432: "32M" } most_common_size = max(size_bins.items(), key=lambda x: x[1])[0] # Выбираем ближайший recordsize, который не меньше most_common_size recommended = None for size in sorted(recordsize_options.keys()): if size >= most_common_size: recommended = size break # Если не нашли подходящий размер (очень большие файлы), берем максимальный if recommended is None: recommended = max(recordsize_options.keys()) return recordsize_options.get(recommended, "128K") def print_distribution(size_bins, total_files): print("\nFile size distribution (next power-of-two bins):") print("{:<20} {:<15} {}".format("Filesize below", "Files count", "Percentage")) print("-" * 30) for size in sorted(size_bins.keys()): count = size_bins[size] percent = (count / total_files) * 100 size_str = format_size(size) print("{:<20} {:<15} {:.1f}%".format(size_str, count, percent)) # Изменено на .1f def main(): parser = argparse.ArgumentParser(description="ZFS recordsize recommendation tool") parser.add_argument("path", help="Path to the directory to scan") args = parser.parse_args() print(f"Starting scan of {args.path}...\n") size_bins, total_files, total_size = scan_directory(args.path) if 
total_files == 0: print("No files found or permission denied.") return print_distribution(size_bins, total_files) recommended = recommend_recordsize(size_bins, total_files) print(f"\nTotal files scanned: {total_files}") print(f"Total data size: {format_size(total_size)}") print(f"\nRecommended ZFS recordsize: {recommended}") if recommended == "128K": print("Note: This is the default ZFS recordsize which works well for most mixed workloads.") elif recommended.endswith(("B", "K")): print("Note: Small recordsize is good for many small files but may reduce throughput for large files.") elif recommended.endswith(("M")): print("Note: Large recordsize improves throughput for large files but may waste space for small files.") if __name__ == "__main__": main()

1	#!/usr/bin/env python3
2	#
3	# Рекурсивно сканирует все файлы в указанном разделе
4	# Группирует файлы по ближайшей степени двойки их размера
5	# Анализирует распределение размеров файлов
6	# Рекомендует оптимальный recordsize на основе наиболее распространённого размера файлов
7	# Рекомендация основана на предположении, что будущие файлы будут похожего размера
8	# ТЕМ НЕ МЕНЕЕ:
9	# Для смешанных рабочих нагрузок (много маленьких и несколько больших файлов) обычно лучше
10	# оставить значение по умолчанию 128K
11	#
12	import os
13	import argparse
14	from collections import defaultdict
15	import math
16	import sys
17	import time
18
def format_size(size_bytes):
    """Render a byte count as a whole number followed by a unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached. The fractional part is then truncated, not
    rounded, so 1536 is rendered as "1KB".

    Args:
        size_bytes: Size in bytes.

    Returns:
        A string such as "0B", "512B", "4KB" or "3TB".
    """
    if size_bytes == 0:
        return "0B"

    value = size_bytes
    # Step through the smaller units; TB is the catch-all for anything that
    # is still >= 1024 after four divisions.
    for suffix in ("B", "KB", "MB", "GB"):
        if value < 1024:
            return f"{int(value)}{suffix}"
        value /= 1024
    return f"{int(value)}TB"
32
def get_upper_power_of_two(size):
    """Return the smallest power of two that is greater than or equal to *size*.

    Integer sizes are handled with exact integer arithmetic. Going through
    ``math.log2`` converts the value to a float first, which loses the low
    bits of large sizes and can return a power of two *below* the input
    (e.g. ``2**53 + 1`` would yield ``2**53``).

    Args:
        size: Non-negative size in bytes.

    Returns:
        0 when *size* is 0, otherwise the power of two that bounds *size*
        from above (a value that is already a power of two maps to itself).

    Raises:
        ValueError: If *size* is negative.
    """
    if size == 0:
        return 0
    if size < 0:
        raise ValueError("size must be non-negative")
    if isinstance(size, int):
        # (size - 1).bit_length() is the exponent of the next power of two
        # at or above size; for size == 1 it is 0, giving 1.
        return 1 << (size - 1).bit_length()
    # Non-integer input keeps the original floating-point computation.
    return 2 ** math.ceil(math.log2(size))
38
def scan_directory(path):
    """Walk *path* recursively and histogram file sizes by power-of-two bin.

    Symbolic links are skipped. Any ``OSError`` raised while handling a file
    (this includes ``PermissionError``) is swallowed so that one unreadable
    entry does not abort the scan. A single progress line is rewritten on
    stdout at most every 0.1 s and at the last file of each directory.

    Args:
        path: Directory to scan.

    Returns:
        A tuple ``(size_bins, total_files, total_size)``: ``size_bins`` maps
        each bin's upper bound in bytes (0 for empty files) to the number of
        files in that bin, ``total_files`` is the number of files counted and
        ``total_size`` is the sum of their sizes in bytes.
    """
    size_bins = defaultdict(int)
    total_files = 0
    total_size = 0
    last_update_time = 0
    start_time = time.time()

    # File names with undecodable bytes reach Python as lone surrogates, and
    # names may also contain characters stdout cannot encode. Printing either
    # raises UnicodeEncodeError, which is not an OSError and would abort the
    # scan, so the displayed path is round-tripped through stdout's encoding
    # with "replace" first.
    out_encoding = getattr(sys.stdout, "encoding", None) or "utf-8"

    for root, _, files in os.walk(path):
        file_count = len(files)
        for i, file in enumerate(files, 1):
            try:
                filepath = os.path.join(root, file)

                # Refresh the progress line every 0.1 seconds
                current_time = time.time()
                if current_time - last_update_time > 0.1 or i == file_count:
                    shown = (
                        filepath[:80]
                        .encode(out_encoding, "replace")
                        .decode(out_encoding, "replace")
                    )
                    print(f"\rScanning: {shown}... Files: {total_files} ", end="")
                    sys.stdout.flush()
                    last_update_time = current_time

                if not os.path.islink(filepath):
                    size = os.path.getsize(filepath)
                    bin_key = get_upper_power_of_two(size)
                    size_bins[bin_key] += 1
                    total_files += 1
                    total_size += size
            except OSError:
                # PermissionError is a subclass of OSError, so this covers
                # both cases the scan deliberately ignores.
                continue

    print("\nScan completed!")
    elapsed_time = time.time() - start_time
    print(f"Scanned {total_files} files in {elapsed_time:.1f} seconds")
    return size_bins, total_files, total_size
71
def recommend_recordsize(size_bins, total_files):
    """Pick a ZFS recordsize label from a file-size histogram.

    The most populated non-empty-file bin decides the recommendation: the
    smallest valid recordsize that is not below that bin's upper bound is
    returned, clamped to the largest valid recordsize.

    Args:
        size_bins: Mapping of bin upper bound in bytes (0 for empty files)
            to the number of files in that bin.
        total_files: Total number of files. Unused; kept so existing callers
            keep working.

    Returns:
        A recordsize label such as "4K", "128K" or "1M". "128K" (the ZFS
        default) is returned when there are no bins or only empty files.
    """
    default_label = "128K"

    # Valid recordsize values: powers of two from 512 bytes up to 16M, the
    # OpenZFS maximum. A 32M entry would be rejected by `zfs set recordsize`.
    recordsize_options = {
        512: "512B",
        1024: "1K",
        2048: "2K",
        4096: "4K",
        8192: "8K",
        16384: "16K",
        32768: "32K",
        65536: "64K",
        131072: "128K",
        262144: "256K",
        524288: "512K",
        1048576: "1M",
        2097152: "2M",
        4194304: "4M",
        8388608: "8M",
        16777216: "16M",
    }

    # Zero-byte files store no data blocks, so they must not drag the
    # recommendation down to the smallest recordsize.
    data_bins = {size: count for size, count in size_bins.items() if size > 0}
    if not data_bins:
        return default_label

    most_common_size = max(data_bins.items(), key=lambda item: item[1])[0]

    # Smallest recordsize that is not below the most common bin.
    for size in sorted(recordsize_options):
        if size >= most_common_size:
            return recordsize_options[size]

    # Most common bin is above the largest recordsize: use the maximum.
    return recordsize_options[max(recordsize_options)]
110
def print_distribution(size_bins, total_files):
    """Print the file-size histogram as a three-column table on stdout.

    Rows are ordered by ascending bin size. Each row shows the bin's upper
    bound formatted by ``format_size``, the number of files in the bin and
    that number as a percentage of *total_files* with one decimal place.

    Args:
        size_bins: Mapping of bin upper bound in bytes to file count.
        total_files: Total number of files; the percentage denominator.
    """
    print("\nFile size distribution (next power-of-two bins):")
    header = f"{'Filesize below':<20} {'Files count':<15} Percentage"
    print(header)
    print("-" * 30)

    for upper_bound, file_count in sorted(size_bins.items()):
        share = (file_count / total_files) * 100
        label = format_size(upper_bound)
        # One decimal place for the percentage column
        print(f"{label:<20} {file_count:<15} {share:.1f}%")
121
def main():
    """Command-line entry point: scan a directory and print a recommendation.

    Parses the single positional ``path`` argument, scans it, prints the
    size distribution, the totals and the recommended recordsize, then adds
    a note describing the trade-off of that recordsize relative to the 128K
    default.
    """
    parser = argparse.ArgumentParser(description="ZFS recordsize recommendation tool")
    parser.add_argument("path", help="Path to the directory to scan")
    args = parser.parse_args()

    print(f"Starting scan of {args.path}...\n")
    size_bins, total_files, total_size = scan_directory(args.path)

    if total_files == 0:
        print("No files found or permission denied.")
        return

    print_distribution(size_bins, total_files)

    recommended = recommend_recordsize(size_bins, total_files)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Total data size: {format_size(total_size)}")
    print(f"\nRecommended ZFS recordsize: {recommended}")

    # Choose the note by comparing the recommendation with the 128K default
    # numerically. Going by the label suffix alone would describe 256K and
    # 512K as "small" although they are larger than the default.
    multipliers = {"B": 1, "K": 1024, "M": 1024 * 1024}
    unit = multipliers.get(recommended[-1:])
    digits = recommended[:-1]
    if unit is None or not digits.isdigit():
        # Unrecognised label: print no note.
        return

    recommended_bytes = int(digits) * unit
    default_bytes = 128 * 1024

    if recommended_bytes == default_bytes:
        print("Note: This is the default ZFS recordsize which works well for most mixed workloads.")
    elif recommended_bytes < default_bytes:
        print("Note: Small recordsize is good for many small files but may reduce throughput for large files.")
    else:
        print("Note: Large recordsize improves throughput for large files but may waste space for small files.")
147
# Run the advisor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
150
151