Check synchronization of local and cloud files and directories
import os
# Name of the instance this notebook creates, uses and finally deletes.
instance_name = "test-sqlite-sync"
# IPython shell escapes: connect to an instance of that name and delete it,
# with `yes` answering the confirmation prompt. Presumably this clears
# leftovers of an earlier run before `init` is called — TODO confirm.
!lamin connect {instance_name}
!yes | lamin delete {instance_name}
from lamindb_setup import init, settings
from lamindb_setup.core.upath import UPath, LocalPathClasses
import shutil
import time
import pytest
# Initialize the test instance with its storage root in the `lamindb-ci` S3 bucket.
init(name=instance_name, storage=f"s3://lamindb-ci/{instance_name}")
Set everything up before starting the tests
# Cloud directory, located under the instance's storage root, that serves as
# the sync source for the directory tests.
dir_sync = settings.storage.root / "dir_sync"
# Invalidate the filesystem cache before probing for an existing directory.
dir_sync.fs.invalidate_cache()
# Start from a clean slate: remove the directory if it is already present.
if dir_sync.is_dir():
    dir_sync.rmdir()
assert not dir_sync.exists()
# Create the two files that the sync checks operate on.
for name in ("file1", "file2"):
    (dir_sync / name).touch()
assert dir_sync.is_dir()
# Resolve the local cache location of the cloud directory without updating it.
dir_sync_local = settings.paths.cloud_to_local_no_update(dir_sync)
# Remove any stale local copy. `shutil.rmtree` is used instead of unlinking
# the direct children: an aborted earlier run can leave the nested
# `test/file3` that is created further down in this notebook, and calling
# `unlink()` on that `test` directory would raise.
if dir_sync_local.is_dir():
    shutil.rmtree(dir_sync_local)
assert not dir_sync_local.exists()
def num_files(directory) -> int:
    """Return the number of files found recursively under *directory*.

    Args:
        directory: A path object providing ``rglob``; each yielded entry
            must provide ``is_file``. Used in this notebook with both
            local paths and ``UPath`` objects.

    Returns:
        The number of entries yielded by ``directory.rglob("*")`` for which
        ``is_file()`` is true. Directories are not counted.
    """
    # Count lazily instead of building a list only to take its length.
    return sum(1 for entry in directory.rglob("*") if entry.is_file())
# Both files touched above must be visible in the cloud directory.
assert num_files(dir_sync) == 2
Test `cloud_to_local_no_update` paths
# Shorthand for the resolver under test.
resolve_no_update = settings.paths.cloud_to_local_no_update

# A local path comes back unchanged, whether passed as a path object or as a
# string, and regardless of `cache_key`.
test_local_path = UPath("./some/local/path")
assert resolve_no_update(test_local_path) == test_local_path
assert resolve_no_update(test_local_path.as_posix()) == test_local_path
assert resolve_no_update(test_local_path, cache_key="some/cache/key") == test_local_path

# Without `cache_key`, the cloud directory maps to "<bucket>/<key>" below the
# cache directory, for path objects and strings alike.
default_target = settings.cache_dir / f"lamindb-ci/{instance_name}/dir_sync"
assert resolve_no_update(dir_sync) == default_target
assert resolve_no_update(dir_sync.as_posix()) == default_target

# With `cache_key`, the key itself is the location below the cache directory.
keyed_target = settings.cache_dir / "dir_cache/key"
assert resolve_no_update(dir_sync, cache_key="dir_cache/key") == keyed_target
assert resolve_no_update(dir_sync.as_posix(), cache_key="dir_cache/key") == keyed_target
# for http urls
http_path = UPath(
    "https://raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md"
)
assert http_path.protocol == "https"
# The stat result of the URL must report a non-zero size, an mtime of 0 and
# the type "file".
http_stat = http_path.stat()
assert http_stat.st_size != 0
assert http_stat.st_mtime == 0
assert http_stat.as_info()["type"] == "file"
# Without `cache_key`, the expected cache location is the URL minus its
# scheme, below the cache directory; path objects and strings give the same result.
http_key = "raw.githubusercontent.com/laminlabs/lamindb-setup/refs/heads/main/README.md"
http_default_target = settings.cache_dir / http_key
assert settings.paths.cloud_to_local_no_update(http_path) == http_default_target
assert settings.paths.cloud_to_local_no_update(str(http_path)) == http_default_target
# With `cache_key`, the key determines the location below the cache directory.
http_keyed_target = settings.cache_dir / "check/README.md"
assert (
    settings.paths.cloud_to_local_no_update(http_path, cache_key="check/README.md")
    == http_keyed_target
)
Test `cloud_to_local` with `cache_key`
# Sync the cloud directory, passed as a string, into the location selected
# by `cache_key`.
dir_sync_local = settings.paths.cloud_to_local(
    dir_sync.as_posix(), cache_key="dir_cache/key"
)
assert dir_sync_local == settings.cache_dir / "dir_cache/key"
assert dir_sync_local.is_dir()
assert num_files(dir_sync_local) == 2
# Remove the synced copy again so later checks start without it. The whole
# tree is removed in one call, the same way the Hugging Face download is
# cleaned up later in this notebook.
shutil.rmtree(dir_sync_local)
Test `cloud_to_local` for HTTP URLs
# Fetch the URL; the result must be a local path whose size equals the size
# reported for the remote file.
http_local = settings.paths.cloud_to_local(http_path)
assert isinstance(http_local, LocalPathClasses)
local_size = http_local.stat().st_size
assert local_size == http_path.stat().st_size
http_local_mtime = http_local.stat().st_mtime
# no changes here because the file exists already
http_local_again = settings.paths.cloud_to_local(http_path)
assert http_local_again.stat().st_mtime == http_local_mtime
# Remove the downloaded file from the cache.
http_local.unlink()
Test sync of general files and directories
# First sync: both cloud files must arrive in the local cache directory.
dir_sync_local = settings.paths.cloud_to_local(dir_sync)
assert dir_sync_local.is_dir()
assert num_files(dir_sync_local) == 2
# Each local copy must carry the modification time of its cloud original.
for name in ("file1", "file2"):
    synced_mtime = (dir_sync_local / name).stat().st_mtime
    assert synced_mtime == (dir_sync / name).modified.timestamp()
# Delete one of the two local copies and confirm only one file is left.
local_file = dir_sync_local / "file1"
local_file.unlink()
assert not local_file.exists()
assert num_files(dir_sync_local) == 1
# Syncing again must bring the deleted file back.
dir_sync_local = settings.paths.cloud_to_local(dir_sync)
assert local_file.exists()
assert num_files(dir_sync_local) == 2
# Backdate both local copies so they are one second older than their cloud
# originals.
for name in ("file1", "file2"):
    cloud_file = dir_sync / name
    local_file = dir_sync_local / name
    cloud_mtime = cloud_file.modified.timestamp()
    backdated = cloud_mtime - 1
    os.utime(local_file, times=(backdated, backdated))
    assert local_file.stat().st_mtime < cloud_mtime
# After syncing, the local modification times must match the cloud ones again.
dir_sync_local = settings.paths.cloud_to_local(dir_sync)
for name in ("file1", "file2"):
    refreshed_mtime = (dir_sync_local / name).stat().st_mtime
    assert refreshed_mtime == (dir_sync / name).modified.timestamp()
# Replace one synced file with a new local-only file in a subdirectory, so
# the local directory still holds two files (file2 and test/file3).
(dir_sync_local / "file1").unlink()
local_file_new = dir_sync_local / "test/file3"
local_file_new_parent = local_file_new.parent
local_file_new_parent.mkdir()
local_file_new.touch()
assert num_files(dir_sync_local) == 2
# Syncing now must leave the local directory untouched: still two files,
# and the local-only file is still there.
# NOTE(review): presumably because the local content is newer than the cloud
# content at this point — confirm against the sync implementation.
dir_sync_local = settings.paths.cloud_to_local(dir_sync)
assert num_files(dir_sync_local) == 2
assert local_file_new.exists()
# Wait so that the cloud timestamp set below is later than the local one.
time.sleep(1)
cloud_file = dir_sync / "file1"
# update cloud timestamp
cloud_file.fs.touch(cloud_file.as_posix(), truncate=True)
assert cloud_file.modified.timestamp() > local_file_new.stat().st_mtime
# With the cloud side now newer, syncing must drop the local-only file and
# its parent directory, leaving exactly the two cloud files.
dir_sync_local = settings.paths.cloud_to_local(dir_sync)
assert num_files(dir_sync_local) == 2
assert not local_file_new.exists()
assert not local_file_new_parent.exists()
# Local modification times must again equal the cloud ones.
for name in ("file1", "file2"):
    local_mtime_now = (dir_sync_local / name).stat().st_mtime
    assert local_mtime_now == (dir_sync / name).modified.timestamp()
# Clean up both sides of the directory test: the cloud directory first,
# then its local cache copy. The local tree is removed in one call, the
# same way the Hugging Face download is cleaned up later in this notebook.
dir_sync.rmdir()
shutil.rmtree(dir_sync_local)
Get the paths to the cloud and local sqlite databases.
# Path of the instance's SQLite database file, taken from the instance settings.
sqlite_file = settings.instance._sqlite_file
# Bare expression: displayed as output when it is the last statement of a notebook cell.
sqlite_file
The remote SQLite file exists upon instance init:
# The database file must already exist at the instance's SQLite path.
assert settings.instance._sqlite_file.exists()
Now mimic a new user who loads the instance (this takes about 4 s):
# Bring the local copy of the SQLite file up to date with the remote one.
settings.instance._update_local_sqlite_file()
Get just the path of the local file, without triggering an update:
# Local cache path that corresponds to the SQLite file; resolved only, not updated.
cache_file = settings.paths.cloud_to_local_no_update(sqlite_file)
# Bare expression: displayed as output when it is the last statement of a notebook cell.
cache_file
Delete the local sqlite file:
# Remove the local copy so that the next update has to recreate it.
cache_file.unlink()
assert not cache_file.exists()
Update the local version of the sqlite file:
# The update must recreate the local file that was deleted above.
settings.instance._update_local_sqlite_file()
assert cache_file.exists()
If the local sqlite database is older than the cloud one, the cloud database replaces the local sqlite database file.
cloud_mtime = sqlite_file.modified.timestamp()
cloud_mtime
# Backdate the local copy by one second so that it is older than the cloud file.
backdated = cloud_mtime - 1
os.utime(cache_file, times=(backdated, backdated))
assert cache_file.stat().st_mtime < sqlite_file.modified.timestamp()
# The update must replace the outdated local file, after which the local and
# cloud modification times are equal again.
settings.instance._update_local_sqlite_file()
assert cache_file.stat().st_mtime == sqlite_file.modified.timestamp()
Check sync of a Hugging Face dataset
# Sync a whole Hugging Face dataset repository into the local cache.
hf_path = UPath("hf://datasets/Koncopd/lamindb-test")
hf_path_local = settings.paths.cloud_to_local(hf_path)
# The result must be a local directory holding as many files as the remote one.
assert isinstance(hf_path_local, LocalPathClasses)
assert hf_path_local.is_dir()
assert num_files(hf_path) == num_files(hf_path_local)
# Remove the downloaded tree again.
shutil.rmtree(hf_path_local)
# Sync a single file addressed with the `@main` revision in the path.
hf_path = UPath("hf://datasets/Koncopd/lamindb-test@main/anndata/pbmc68k_test.h5ad")
hf_path_local = settings.paths.cloud_to_local(hf_path)
# The result must be a local regular file; remove it afterwards.
assert isinstance(hf_path_local, LocalPathClasses)
assert hf_path_local.is_file()
hf_path_local.unlink()
# Synchronizing a remote file that does not exist must raise
# FileNotFoundError when `error_no_origin=True` is passed.
hf_path = UPath("hf://datasets/Koncopd/lamindb-test@main/does_not_exist.file")
missing_target = UPath("./does_not_exist.file")
with pytest.raises(FileNotFoundError):
    hf_path.synchronize(missing_target, error_no_origin=True)
Clean up: delete the test instance.
# IPython shell escape: delete the test instance, with `yes` answering the
# confirmation prompt.
!yes | lamin delete {instance_name}