The following is an idea for looking at the problem.
Currently, every sync does:
1) scan a full checksum of the source
2) scan a full checksum of the target
3) transfer the diff
So, depending on the IO discussion (whether the IO of 1) is faster than the IO of 2)), it may be worth trying to avoid the scanning of 2) by keeping the checksum metadata for the next run.
What you need in this case is a checksummer (maybe self-written) which, in one IO stream, does both chunked and full checksumming, in pieces like:
- 3TB hash
- a. 1.5TB hash
- b. 1.5TB hash
- a. 750GB hash
- b. 750GB hash
- c. 750GB hash
- d. 750GB hash
...
down to the acceptable change window of e.g. 8k:
- a. 8k hash
- b. 8k hash
- ...
- aaaaa. 8k hash
These you compare, as a tree structure, against the last run (the target), so you know which branches of the tree you need to follow down to the smallest change-window leaves.
That list of leaves you would copy, and you keep the hash metadata as the baseline for the next sync.
Even better would be if you knew the writes to the source, so re-scanning could be limited to those branch elements, if needed at all.
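As a rough illustration of that tree idea, here is a sketch in Python (just a sketch, untested against the script further below; the names chunk_hashes, build_tree and changed_leaves are invented here). It hashes every smallest chunk, stacks pairwise hashes on top up to a single root, and walks the old and new trees from the root down, descending only into branches whose hashes differ:

import hashlib

def chunk_hashes(path, chunk_size=8192):
    # leaf level: one sha256 per chunk_size window of the file
    hashes = []
    with open(path, "rb") as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                break
            hashes.append(hashlib.sha256(data).hexdigest())
    return hashes

def build_tree(leaves):
    # each upper level hashes pairs of the level below, up to a single root
    levels = [leaves]
    while len(levels[-1]) > 1:
        below = levels[-1]
        above = []
        for i in range(0, len(below), 2):
            pair = below[i] + (below[i + 1] if i + 1 < len(below) else "")
            above.append(hashlib.sha256(pair.encode()).hexdigest())
        levels.append(above)
    return levels  # levels[0] = leaves, levels[-1] = [root]

def changed_leaves(old, new):
    # walk both trees top-down, following only differing branches
    suspects = {0}
    for level in range(len(new) - 1, 0, -1):
        children = set()
        for i in suspects:
            if level >= len(old) or i >= len(old[level]) or old[level][i] != new[level][i]:
                children.update({2 * i, 2 * i + 1})
        suspects = {i for i in children if i < len(new[level - 1])}
    return sorted(i for i in suspects
                  if i >= len(old[0]) or old[0][i] != new[0][i])

# usage: compare the tree saved from the previous run with a fresh one
# old_tree = ...loaded from the last run...
# new_tree = build_tree(chunk_hashes("/export/disk-1/zfs.cache.raw"))
# print(changed_leaves(old_tree, new_tree))

The point is that for unchanged regions you only ever compare a handful of upper-level hashes per run instead of every leaf.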
Alternative:
You would get this for free if you changed the storage system to something like ZFS: take snapshots and then do block-based incremental sends and receives of the snapshots, which represent exactly the changed ranges in the source.
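For completeness, a minimal sketch of that snapshot workflow driven from Python (the pool/dataset/snapshot names tank/data, backup/data, @sync-prev and @sync-new are made up; it assumes ZFS is installed and a common previous snapshot already exists on both datasets):

import subprocess

src = "tank/data"       # hypothetical source dataset
dst = "backup/data"     # hypothetical target dataset on the same host

# take a new snapshot of the source dataset
subprocess.run(["zfs", "snapshot", f"{src}@sync-new"], check=True)

# send only the blocks changed since the previous snapshot and
# receive them into the target dataset
send = subprocess.Popen(
    ["zfs", "send", "-i", f"{src}@sync-prev", f"{src}@sync-new"],
    stdout=subprocess.PIPE)
subprocess.run(["zfs", "receive", dst], stdin=send.stdout, check=True)
send.stdout.close()
send.wait()

For a remote target you would pipe the send stream through ssh instead of receiving locally.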
Sorry, that's the best that comes to my mind.
Update:
As I'm just learning Python, I gave it a try and learned that if you merge the scanner and the hash comparison, there is no need for the tree structure.
time python3 ./filehasher.py /export/disk-1/zfs.cache.raw
Namespace(inputfile='/export/disk-1/zfs.cache.raw', min_chunk_size=8192)
os.stat_result(st_mode=33188, st_ino=12, st_dev=2065, st_nlink=1, st_uid=0, st_gid=0, st_size=2147479552, st_atime=1728674135, st_mtime=1728674159, st_ctime=1728674159)
File Size in Bytes is 2147479552
File Size in Bytes is 31 exp 2147483648 min_chunk_exp 13
- load old hashes...
We need 1048574 hashes for this file.
chunk 512 of 0.20%
diff 13:512 o 124a33e4488e3b10a5434f24789173fad96014a4cc46856289b2b7f04aa41538 != n ea42ae1df2ce719035f0c51e3369ebe4e6b8b9dd9c44e2b60c3084a0efeedbcc
chunk 145991 of 55.69%
diff 13:145991 o a9198b4af91db0d5eeb4ad53d1c41302952071c3926713f74558c83d0e901f1e != n 0abfad615307b69f406d089104c0c12c05d197c3e3d9b072894780b9b1c93f88
chunk 145992 of 55.69%
diff 13:145992 o c89ae8a8c82edc9d2045eaad344887ae00878f6066fa3893cd42e7f9fa02c5b4 != n 4a9586bca78224d2c7343abcccd0e2138e2a416d806be72788cef3cdd84e1a2f
chunk 145993 of 55.69%
diff 13:145993 o f882619b014bc284101769ab9d2dc3b017f3fa9873a352a566f03fcb805aee1b != n 97f1aeb56c68be8b524aa480636d6987a75c88cc93f941b8cbe76043d108eabe
chunk 145994 of 55.69%
...cut...
diff 13:146274 o 10b911c4ccc35dfc91f5025eee0b17fd96e9ea7a8c38896e2bda824ec3a6d618 != n 1946f829b181b95cd95c9071f70a300700f8d3e25ac6e00758f7779e99e1bdde
chunk 146275 of 55.80%
diff 13:146275 o c201a9773aba93789a7c0191901b5a0cb04a4eafb90ef6bd225bb89ebc0d0294 != n f7bc8527c8585f680a32940c72ff0d300a95e2d6ddf0341d5892133c225592b8
chunk 146276 of 55.80%
diff 13:146276 o c311d18dfe474cb60aaaf63ecb20b70a7a8a4959a30e986661b79b2ceab8d50c != n d823513af052881feae29e5706c687051b3f54fa894a3b97b94e67eac2594adc
chunk 262143 of 100.00%
write hashes...
real 0m25.236s
user 0m14.316s
sys 0m4.296s
The code looks like this (please mind, no support :) ):
import argparse
parser = argparse.ArgumentParser("filehasher")
parser.add_argument("inputfile", help="file which should be hashed.", type=str)
parser.add_argument("--min-chunk-size", help="smallest chunk for hashing.", type=int, default=8192)
args = parser.parse_args()
print(args)

import os
file_stats = os.stat(args.inputfile)
print(file_stats)
print(f'File Size in Bytes is {file_stats.st_size}')

import math
two_exp = math.log(file_stats.st_size, 2)
if two_exp - int(two_exp) > 0:
    two_exp = int(two_exp) + 1
else:
    two_exp = int(two_exp)
two_exp_mchung = int(math.log(args.min_chunk_size, 2))
print(f'File Size in Bytes is {two_exp} exp {2**two_exp} min_chunk_exp {two_exp_mchung}')

sum_hashes = 0
multi = 2
hash_obj = {}
hash_max_size = {}
hash_max_current = {}
hash_obj_idx = {}

import pickle
#args.inputfile+".pickle"
pickle_filename = "./hash.pickle." + str(args.min_chunk_size)
old_hash = False
if os.path.isfile(pickle_filename):
    print("- load old hashes...")
    old_hash = True
    with open(pickle_filename, 'rb') as handle:
        old_hash_obj = pickle.load(handle)

import hashlib
# one hash level per power of two, from the full file size down to the minimum chunk size
for idx in range(two_exp, two_exp_mchung - 1, -1):
    # print(f"{idx} needs {multi} hashes")
    hash_obj_idx[idx] = 0
    hash_obj[idx] = {}
    hash_obj[idx][hash_obj_idx[idx]] = hashlib.sha256(b"")
    hash_max_size[idx] = 2**idx
    hash_max_current[idx] = 2**idx
    sum_hashes = sum_hashes + multi
    multi = multi * 2
print(f"We need {sum_hashes} hashes for this file.")

def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        #print(f"reading {len(data)}")
        yield bytes(data)

def hashing_tree(data):
    # tree variant: feed each chunk into every level of the hash tree
    data_len = len(data)
    #print(f"hashing of {data_len}")
    for idx in range(two_exp, two_exp_mchung - 1, -1):
        hash_obj[idx][hash_obj_idx[idx]].update(data)
        hash_max_current[idx] = hash_max_current[idx] - data_len
        #print(f"update hash exp = {idx}...{hash_max_size[idx]} - remaining in idx {hash_max_current[idx]}")
        if hash_max_current[idx] == 0:
            #print(f"idx cycle {idx} from {hash_obj_idx[idx]} to {(hash_obj_idx[idx]+1)}")
            # cycle of idx
            # 1. reset to max
            hash_max_current[idx] = hash_max_size[idx]
            # 2. increase index
            hash_obj_idx[idx] = hash_obj_idx[idx] + 1
            hash_obj[idx][hash_obj_idx[idx]] = hashlib.sha256(b"")

def hashing_flat(data):
    # flat variant: only the smallest chunk level, compared against the old run on the fly
    data_len = len(data)
    #print(f"hashing of {data_len}")
    idx = two_exp_mchung
    hash_obj[idx][hash_obj_idx[idx]].update(data)
    hash_max_current[idx] = hash_max_current[idx] - data_len
    #print(f"update hash exp = {idx}...{hash_max_size[idx]} - remaining in idx {hash_max_current[idx]}")
    if hash_max_current[idx] == 0:
        #print(f"idx cycle {idx} from {hash_obj_idx[idx]} to {(hash_obj_idx[idx]+1)}")
        # cycle of idx
        # 1. reset to max
        hash_max_current[idx] = hash_max_size[idx]
        # 2. increase index
        # 2.1 convert obj to str for pickling
        hash_obj[idx][hash_obj_idx[idx]] = hash_obj[idx][hash_obj_idx[idx]].hexdigest()
        if old_hash:
            if old_hash_obj[idx][hash_obj_idx[idx]] == hash_obj[idx][hash_obj_idx[idx]]:
                pass
                #print("")
                #print(f"ok {idx}:{hash_obj_idx[idx]}")
            else:
                print("")
                print(f"diff {idx}:{hash_obj_idx[idx]} o {old_hash_obj[idx][hash_obj_idx[idx]]} != n {hash_obj[idx][hash_obj_idx[idx]]}")
        # 2.2 move on...
        hash_obj_idx[idx] = hash_obj_idx[idx] + 1
        hash_obj[idx][hash_obj_idx[idx]] = hashlib.sha256(b"")

chk = int(0)
max_chk = int(file_stats.st_size / args.min_chunk_size)
with open(args.inputfile, "rb") as f:
    for piece in read_in_chunks(f, args.min_chunk_size):
        print(f"chunk {chk} of {(chk*100/max_chk):3.2f}% ", end="\r")
        hashing_flat(piece)
        chk = chk + 1
        #break
        # process_data(piece)

# convert last item
for idx in range(two_exp, two_exp_mchung - 1, -1):
    hash_obj[idx][hash_obj_idx[idx]] = hash_obj[idx][hash_obj_idx[idx]].hexdigest()

#import json
#print(json.dumps(hash_obj))
#exit(0)
print("")
print("write hashes...")
with open(pickle_filename, 'wb') as handle:
    pickle.dump(hash_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
#print(a == b)
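To close the loop on "that list of leaves you would copy": once the diff lines give the changed chunk numbers, only those 8k windows need to be transferred. A minimal sketch (not part of filehasher.py; the backup path and the changed_chunks list are placeholders for illustration, and the target file is assumed to already hold last run's copy):

CHUNK = 8192
changed_chunks = [512, 145991, 145992]   # chunk numbers taken from the "diff 13:<n>" lines

with open("/export/disk-1/zfs.cache.raw", "rb") as src, \
     open("/backup/zfs.cache.raw", "r+b") as dst:
    for n in changed_chunks:
        src.seek(n * CHUNK)
        dst.seek(n * CHUNK)
        dst.write(src.read(CHUNK))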
--partial helps with interrupted transfers, and --inplace helps with not first copying the unmodified parts plus the changes into a temporary file (locally on the destination) and then doing a replacement, but that has implications on robustness and bears no benefit on file systems that can do no-duplication copies through a reflink mechanism (XFS, btrfs, ZFS). So --partial doesn't help here, and --inplace might save a bit of IO on the destination side, but neither accelerates re-syncing, I'm afraid!