Compare commits: a0d9dbffd8...master (18 commits)
| Author | SHA1 | Date |
|---|---|---|
| | cd6e11fe48 | |
| | 1de35afed2 | |
| | d23d6e0b42 | |
| | 3389c0a35a | |
| | 4b4624afc8 | |
| | 19628bc774 | |
| | f02154bfd3 | |
| | baa8c6b9a4 | |
| | 615e3989fb | |
| | 10295e2f21 | |
| | 349e4d43be | |
| | 22b1f374af | |
| | cd553879a2 | |
| | c22f84b7a4 | |
| | 894d0b24c5 | |
| | 07c8bea489 | |
| | 82e5b05c23 | |
| | 82a9e90edf | |
**.gitea/workflows/local-docker.yaml** (new file, 34 lines)

```yaml
name: Push to local registry
run-name: ${{ gitea.actor }} is pushing -> local Docker
on:
  - push

jobs:
  build:
    name: Build and push image
    runs-on: ubuntu-latest
    container: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Login to Docker Registry
        uses: docker/login-action@v3
        with:
          registry: gitea.likemath.ru
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_TOKEN }}

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: gitea.likemath.ru/alex/s3-mirror:latest
          platforms: linux/amd64,linux/arm64
```
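After the workflow pushes the multi-arch image, the tag can be checked against the registry. A minimal sketch, assuming gitea.likemath.ru exposes the standard Docker Registry v2 API (Gitea's container registry does) and that the same username/token pair as the workflow's `REGISTRY_USERNAME`/`REGISTRY_TOKEN` secrets is available locally:

```python
# Sketch: list tags for the pushed image via the Docker Registry v2 API.
# The credentials below are placeholders, not values from this repo.
import requests

REGISTRY = "https://gitea.likemath.ru"
IMAGE = "alex/s3-mirror"

resp = requests.get(
    f"{REGISTRY}/v2/{IMAGE}/tags/list",
    auth=("<REGISTRY_USERNAME>", "<REGISTRY_TOKEN>"),
    timeout=10,
)
resp.raise_for_status()
print(resp.json())  # e.g. {"name": "alex/s3-mirror", "tags": ["latest"]}
```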
**.gitignore** (vendored)

```diff
@@ -159,4 +159,3 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
```
**.pre-commit-config.yaml** (new file, 26 lines)

```yaml
repos:
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        args: [--line-length=88, --target-version=py38]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
        args: # arguments to configure flake8
          # making isort line length compatible with black
          - "--max-line-length=88"
          - "--max-complexity=18"
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-json
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        args: ["--filter-files"]
```
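These hooks run on every `git commit` once installed. A minimal sketch of the usual setup, assuming `pre-commit` itself is already installed in the environment (`pip install pre-commit`):

```python
# Sketch: install the git hooks, then run every hook against the whole tree
# once, which is how a freshly added config is usually brought into effect.
import subprocess

subprocess.run(["pre-commit", "install"], check=True)
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```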
**Dockerfile** (new file, 14 lines)

```dockerfile
FROM docker.io/python:3.12-slim as builder

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

COPY requirements.txt .
RUN pip install -r requirements.txt


COPY *.py /app

ENTRYPOINT ["python3", "main.py"]
```
**README.md**

````diff
@@ -1,3 +1,9 @@
 # s3-mirror
 
 Full mirroring between two s3 targets, with removal of redundant files.
+
+## Example with Docker
+```
+docker build . -t s3-mirror
+podman run -v ~/.mcli/config.json:/root/.mc/config.json:ro --rm s3-mirror
+```
````
**main.py** (new file, 204 lines)

```python
import argparse
import io
import json
import logging
import os.path
import re
import sys
from os.path import expanduser
from typing import Mapping, Set, Tuple

import minio
import tqdm
from tenacity import RetryError, retry, stop_after_delay, wait_exponential

default_retry_args = dict(
    wait=wait_exponential(multiplier=1, min=0.5, max=10), stop=stop_after_delay(20)
)

s3_config_data = json.loads(open(expanduser("~/.mc/config.json")).read())["aliases"]


def get_files(s3, bucket, prefix) -> Mapping[str, Tuple[int, str]]:
    res = {}
    prefix_len = len(prefix)
    for obj in tqdm.tqdm(
        s3.list_objects(bucket, prefix=prefix, recursive=True), desc="S3 list objects"
    ):
        if obj.is_dir:
            continue
        res[obj.object_name[prefix_len:].lstrip("/")] = (obj.size, obj.etag)
    return res
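
# Example shape of the returned mapping (hypothetical object and etag):
#   {"photos/2024/img.jpg": (348211, "9b2cf535f27731c974343645a3985328")}
# Keys are object names relative to `prefix`; values are (size, etag) pairs.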


def save_files(files: Mapping[str, Tuple[int, str]], path):
    processed_files = {key: list(val) for key, val in files.items()}
    with open(path, "w") as f:
        f.write(
            json.dumps(processed_files, indent=4, ensure_ascii=False, sort_keys=True)
        )


def get_file_metrics(files: Mapping[str, Tuple[int, str]]):
    total_size = sum([x[0] for x in files.values()])
    return f"total {len(files)} files with size: {total_size}"
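
# For example, get_file_metrics({"a.txt": (10, "etag-a")}) returns
# "total 1 files with size: 10"; size is the byte total across all objects.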


def build_s3(config_data, name):
    try:
        s3_credentials = config_data[name]
    except KeyError:
        print(f"Unknown s3: {name}")
        sys.exit(1)

    return minio.Minio(
        endpoint=re.sub(r"^https?://", "", s3_credentials["url"]),
        access_key=s3_credentials["accessKey"],
        secret_key=s3_credentials["secretKey"],
        secure=s3_credentials["url"].startswith("https"),
    )
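
# Aliases come from ~/.mc/config.json (the MinIO `mc` client config); each
# entry is expected to look roughly like this (hypothetical alias and URL):
#   "aliases": {"myminio": {"url": "https://s3.example.com",
#                           "accessKey": "...", "secretKey": "..."}}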


def get_redundant_files(
    source: Mapping[str, Tuple[int, str]], target: Mapping[str, Tuple[int, str]]
) -> Set[str]:
    """
    :return: files that exist in target but not in source. Equal to `target - source`
    """
    res = set()
    for key in target:
        if key in source:
            continue
        res.add(key)
    return res


def get_different_files(
    source: Mapping[str, Tuple[int, str]], target: Mapping[str, Tuple[int, str]]
) -> Set[str]:
    """
    :return: all keys present in both source and target whose values differ
    """
    res = set()
    for key in source:
        if key in target and source[key] != target[key]:
            res.add(key)
    return res
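
# Worked example (hypothetical mappings):
#   source = {"a": (1, "x"), "b": (2, "y")}
#   target = {"b": (2, "z"), "c": (3, "w")}
#   get_redundant_files(source, target) -> {"c"}   (present in target only)
#   get_different_files(source, target) -> {"b"}   (same key, changed size/etag)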


@retry(**default_retry_args)
def get_object_data(s3: minio.Minio, bucket: str, object_name: str) -> bytes:
    return s3.get_object(bucket_name=bucket, object_name=object_name).read()


@retry(**default_retry_args)
def put_object_data(s3: minio.Minio, bucket: str, object_name: str, data: bytes):
    s3.put_object(
        bucket,
        object_name,
        data=io.BytesIO(data),
        length=len(data),
        num_parallel_uploads=1,
        part_size=150 * 1024 * 1024,
    )
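
# With default_retry_args, tenacity retries each call with exponential backoff
# (0.5s up to 10s between attempts) and gives up roughly 20s after the first
# attempt, raising RetryError; main() catches that and logs a warning instead
# of aborting the whole mirror run.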


def parse_args():
    parser = argparse.ArgumentParser(description="Reliable s3-mirror with logs")
    parser.add_argument(
        "--source", type=str, required=True, help="source path (with s3)"
    )
    parser.add_argument(
        "--target", type=str, required=True, help="target path (with s3)"
    )
    parser.add_argument(
        "-r",
        "--remove",
        default=False,
        action="store_true",
        help="remove redundant files",
    )
    return parser.parse_args()
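
# Typical invocation (alias names are hypothetical; they must exist as
# aliases in ~/.mc/config.json):
#   python3 main.py --source myminio/photos/2024 --target backup/photos/2024 -r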


def extract_s3_parts(val) -> Tuple[str, str, str]:
    s3_alias, bucket, path = re.findall(r"([^/]+)/([^/]+)/(.*)", val)[0]
    return s3_alias, bucket, path
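
# e.g. extract_s3_parts("myminio/photos/2024/q1") -> ("myminio", "photos", "2024/q1")
# i.e. alias/bucket/prefix (the alias here is hypothetical).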


def main():
    logging.info(f"Known s3: {list(s3_config_data.keys())}")

    args = parse_args()

    source_s3_alias, source_bucket, source_prefix = extract_s3_parts(args.source)
    logging.info(f"{source_s3_alias=}, {source_bucket=}, {source_prefix=}")
    source_s3 = build_s3(s3_config_data, source_s3_alias)

    target_s3_alias, target_bucket, target_prefix = extract_s3_parts(args.target)
    logging.info(f"{target_s3_alias=}, {target_bucket=}, {target_prefix=}")
    target_s3 = build_s3(s3_config_data, target_s3_alias)

    source_files = get_files(source_s3, bucket=source_bucket, prefix=source_prefix)
    save_files(source_files, "source.json")
    print(f"Source {get_file_metrics(source_files)}")

    target_files = get_files(target_s3, bucket=target_bucket, prefix=target_prefix)
    save_files(target_files, "target.json")
    print(f"Target initial {get_file_metrics(target_files)}")

    if args.remove:
        redundant = get_redundant_files(source_files, target_files)
        if redundant:
            for file_to_remove in redundant:
                object_name = os.path.join(target_prefix, file_to_remove)
                logging.info(f"Removing redundant {target_bucket}:{object_name}")
                try:
                    target_s3.remove_object(
                        bucket_name=target_bucket, object_name=object_name
                    )
                except Exception as err:
                    print(
                        f"Unable to remove {target_bucket}/{object_name}: error {err}"
                    )
                del target_files[file_to_remove]
            print(f"Removed {len(redundant)} files")
            print(f"Target after removing redundant {get_file_metrics(target_files)}")

    new_files = get_redundant_files(target_files, source_files)
    print(f"New {len(new_files)} files")

    different_files = get_different_files(source_files, target_files)
    print(f"Different {len(different_files)} files")

    for key in tqdm.tqdm(new_files.union(different_files)):
        source_object = os.path.join(source_prefix, key)
        target_object = os.path.join(target_prefix, key)
        try:
            logging.info(
                f"Moving {source_bucket}:{source_object} to "
                f"{target_bucket}:{target_object}"
            )
            source_data = get_object_data(source_s3, source_bucket, source_object)
            put_object_data(
                target_s3,
                target_bucket,
                target_object,
                data=source_data,
            )
        except RetryError:
            logging.warning(
                "Retry on moving "
                f"{source_bucket}:{source_object} to "
                f"{target_bucket}:{target_object}"
            )


if __name__ == "__main__":
    logging.basicConfig(
        filename="s3-mirror.log",
        level=logging.INFO,
        format="[%(asctime)s] %(levelname)s - %(message)s",
    )

    main()
```
**pyproject.toml** (new file, 10 lines)

```toml
[tool.black]
line-length = 80
target-version = ['py38']
include = '\.pyi?$'

[tool.isort]
profile = "black"
py_version = "auto"
sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
known_local_folder = "src"
```
**requirements.txt** (new file, 3 lines)

```
minio==7.1.13
tenacity==8.2.1
tqdm==4.64.1
```