KarelWintersky revised this Gist.
1 file changed, 150 insertions
zfs_recordsize_advisor6.py (file created)
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
#
# Recursively scans all files under the given path
# Groups files by the nearest power of two of their size
# Analyzes the file size distribution
# Recommends an optimal recordsize based on the most common file size
# The recommendation assumes that future files will be of a similar size
# HOWEVER:
# For mixed workloads (many small files plus a few large ones) it is usually better
# to keep the default value of 128K
#
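# Usage (the example path is illustrative; the positional argument is defined via argparse below):
#   python3 zfs_recordsize_advisor6.py /path/to/scan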
import argparse
import math
import os
import sys
import time
from collections import defaultdict

def format_size(size_bytes):
    """Formats a size in human-readable form (without a fractional part)"""
    if size_bytes == 0:
        return "0B"

    units = ["B", "KB", "MB", "GB", "TB"]
    unit_index = 0

    while size_bytes >= 1024 and unit_index < len(units) - 1:
        size_bytes /= 1024
        unit_index += 1

    return f"{int(size_bytes)}{units[unit_index]}"

def get_upper_power_of_two(size):
    """Returns the nearest power of two greater than or equal to the given size"""
    if size == 0:
        return 0
    return 2 ** math.ceil(math.log2(size))

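# Illustrative examples for get_upper_power_of_two (assumed file sizes, not from the gist):
#   a 5,000-byte file falls into the 8192 (8K) bin,
#   a 100 KiB (102,400-byte) file falls into the 131072 (128K) bin.
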
def scan_directory(path):
    """Walks `path` recursively and bins regular files by the next power of two of their size"""
    size_bins = defaultdict(int)
    total_files = 0
    total_size = 0
    last_update_time = 0
    start_time = time.time()

    for root, _, files in os.walk(path):
        for i, file in enumerate(files, 1):
            try:
                filepath = os.path.join(root, file)

                # Print progress every 0.1 seconds
                current_time = time.time()
                if current_time - last_update_time > 0.1 or i == len(files):
                    print(f"\rScanning: {filepath[:80]}... Files: {total_files} ", end="")
                    sys.stdout.flush()
                    last_update_time = current_time

                if not os.path.islink(filepath):
                    size = os.path.getsize(filepath)
                    bin_key = get_upper_power_of_two(size)
                    size_bins[bin_key] += 1
                    total_files += 1
                    total_size += size
            except (PermissionError, OSError):
                continue

    print("\nScan completed!")
    elapsed_time = time.time() - start_time
    print(f"Scanned {total_files} files in {elapsed_time:.1f} seconds")
    return size_bins, total_files, total_size

def recommend_recordsize(size_bins, total_files):
    """Picks the smallest valid ZFS recordsize that covers the most common size bin"""
    if not size_bins:
        return "128K"  # default value

    # Valid power-of-two recordsize values. OpenZFS caps recordsize at 16M,
    # and values above 128K require the large_blocks pool feature.
    recordsize_options = {
        512: "512B",
        1024: "1K",
        2048: "2K",
        4096: "4K",
        8192: "8K",
        16384: "16K",
        32768: "32K",
        65536: "64K",
        131072: "128K",
        262144: "256K",
        524288: "512K",
        1048576: "1M",
        2097152: "2M",
        4194304: "4M",
        8388608: "8M",
        16777216: "16M"
    }

    most_common_size = max(size_bins.items(), key=lambda x: x[1])[0]

    # Pick the nearest recordsize that is not smaller than most_common_size
    recommended = None
    for size in sorted(recordsize_options.keys()):
        if size >= most_common_size:
            recommended = size
            break

    # If no suitable size was found (very large files), use the maximum
    if recommended is None:
        recommended = max(recordsize_options.keys())

    return recordsize_options.get(recommended, "128K")

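# Illustrative resolution example for recommend_recordsize (hypothetical counts, not measured data):
#   size_bins = {4096: 10, 16384: 500, 1048576: 3}  ->  most common bin is 16384  ->  returns "16K"
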
def print_distribution(size_bins, total_files):
    """Prints the per-bin file counts and percentages"""
    print("\nFile size distribution (next power-of-two bins):")
    print("{:<20} {:<15} {}".format("Filesize up to", "Files count", "Percentage"))
    print("-" * 45)

    for size in sorted(size_bins.keys()):
        count = size_bins[size]
        percent = (count / total_files) * 100
        size_str = format_size(size)
        print("{:<20} {:<15} {:.1f}%".format(size_str, count, percent))

def main():
    parser = argparse.ArgumentParser(description="ZFS recordsize recommendation tool")
    parser.add_argument("path", help="Path to the directory to scan")
    args = parser.parse_args()

    print(f"Starting scan of {args.path}...\n")
    size_bins, total_files, total_size = scan_directory(args.path)

    if total_files == 0:
        print("No files found or permission denied.")
        return

    print_distribution(size_bins, total_files)

    recommended = recommend_recordsize(size_bins, total_files)
    print(f"\nTotal files scanned: {total_files}")
    print(f"Total data size: {format_size(total_size)}")
    print(f"\nRecommended ZFS recordsize: {recommended}")

    if recommended == "128K":
        print("Note: This is the default ZFS recordsize which works well for most mixed workloads.")
    elif recommended.endswith(("B", "K")):
        print("Note: Small recordsize is good for many small files but may reduce throughput for large files.")
    elif recommended.endswith("M"):
        print("Note: Large recordsize improves throughput for large files but may waste space for small files.")

if __name__ == "__main__":
    main()

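The script only prints a recommendation; applying it is left to the operator. Below is a minimal sketch of wiring the result into `zfs set`, assuming the gist file is importable as a module, the `zfs` CLI is on PATH, and a hypothetical dataset named tank/data; note that recordsize only affects blocks written after the property is changed.

#!/usr/bin/env python3
# Minimal sketch, not part of the gist: applies the recommendation via the zfs CLI.
# Assumptions: the gist file is importable as zfs_recordsize_advisor6, the zfs
# command is on PATH, and "tank/data" is a hypothetical dataset name.
import subprocess

from zfs_recordsize_advisor6 import recommend_recordsize, scan_directory


def apply_recommendation(path, dataset):
    """Scan `path` and set the recommended recordsize on `dataset`."""
    size_bins, total_files, _ = scan_directory(path)
    recommended = recommend_recordsize(size_bins, total_files)
    # recordsize only affects blocks written after the property is changed
    subprocess.run(["zfs", "set", f"recordsize={recommended}", dataset], check=True)


if __name__ == "__main__":
    apply_recommendation("/tank/data", "tank/data")  # hypothetical mountpoint and dataset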