Compare commits

...

18 Commits

SHA1        Date                        Message  [CI: Push to local registry / Build and push image (push)]
cd6e11fe48  2024-10-13 01:06:41 +03:00  refactor[auto]: pre-commit  [CI successful in 2m8s]
1de35afed2  2024-10-13 01:06:10 +03:00  ci: Update pre-commit
d23d6e0b42  2024-07-21 22:31:54 +03:00  ci: QEMU build  [CI successful in 1m58s]
3389c0a35a  2024-07-21 22:28:48 +03:00  ci: Multiple docker platforms  [CI failing after 13s]
4b4624afc8  2024-07-21 22:21:17 +03:00  ci: Local tag  [CI successful in 30s]
19628bc774  2024-07-21 22:19:07 +03:00  ci: Single tag  [CI failing after 21s]
f02154bfd3  2024-07-21 22:18:31 +03:00  ci: New local docker
baa8c6b9a4  2024-07-21 22:16:10 +03:00  ci: Add push tag
615e3989fb  2024-07-21 22:09:08 +03:00  ci: gitea fix  [CI failing after 15s]
10295e2f21  2024-07-21 22:07:01 +03:00  ci: Add docker and registry integration
349e4d43be  2023-06-04 23:16:48 +03:00  fix: fixes source bucket instead of target bucket for removing
22b1f374af  2023-06-04 23:15:30 +03:00  fix: Correct bucket for removing
cd553879a2  2023-02-18 23:29:43 +03:00  feat: Use part_size with fixed big size for better ETag
c22f84b7a4  2023-02-18 20:54:53 +03:00  feat: Improve error logging
894d0b24c5  2023-02-18 20:54:35 +03:00  feat: tqdm for S3-list
07c8bea489  2023-02-11 23:11:57 +03:00  fix: correct calculation of different_files
82e5b05c23  2023-02-11 19:31:58 +03:00  Initial commit
82a9e90edf  2023-02-11 19:30:51 +03:00  pre-commit
8 changed files with 298 additions and 2 deletions

Gitea Actions workflow (new file, filename not shown): 34 lines added

@@ -0,0 +1,34 @@
---
name: Push to local registry
run-name: ${{ gitea.actor }} is pushing -> local Docker
on:
  - push

jobs:
  build:
    name: Build and push image
    runs-on: ubuntu-latest
    container: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Login to Docker Registry
        uses: docker/login-action@v3
        with:
          registry: gitea.likemath.ru
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_TOKEN }}
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: gitea.likemath.ru/alex/s3-mirror:latest
          platforms: linux/amd64,linux/arm64

.gitignore (vendored): 1 line changed

@@ -159,4 +159,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.pre-commit-config.yaml (new file): 26 lines added

@@ -0,0 +1,26 @@
repos:
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        args: [--line-length=88, --target-version=py38]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
        args:  # arguments to configure flake8
          # making the flake8 line length compatible with black
          - "--max-line-length=88"
          - "--max-complexity=18"
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-json
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        args: ["--filter-files"]

Dockerfile (new file): 14 lines added

@@ -0,0 +1,14 @@
FROM docker.io/python:3.12-slim as builder
WORKDIR /app
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY *.py /app
ENTRYPOINT ["python3", "main.py"]

README.md: 8 lines changed

@@ -1,3 +1,9 @@
# s3-mirror
Full mirroring between two S3 targets, with removal of redundant files.
## Example with Docker
```
docker build . -t s3-mirror
podman run -v ~/.mcli/config.json:/root/.mc/config.json:ro --rm s3-mirror
```
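
The container entrypoint is `main.py`, so the source and target locations are passed as run arguments in the form `alias/bucket/prefix`, where the alias must exist in the mounted `~/.mc/config.json`. A minimal run sketch (the `src` and `dst` aliases and the bucket paths below are placeholders, not part of the repository):
```
podman run -v ~/.mcli/config.json:/root/.mc/config.json:ro --rm \
    s3-mirror --source src/backup-bucket/photos --target dst/mirror-bucket/photos --remove
```
Object listings are dumped to `source.json` and `target.json`, and a run log is written to `s3-mirror.log` in the container's working directory; pass `-r`/`--remove` only when redundant objects on the target should be deleted.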

main.py (new file): 204 lines added

@@ -0,0 +1,204 @@
import argparse
import io
import json
import logging
import os.path
import re
import sys
from os.path import expanduser
from typing import Mapping, Set, Tuple

import minio
import tqdm
from tenacity import RetryError, retry, stop_after_delay, wait_exponential

default_retry_args = dict(
    wait=wait_exponential(multiplier=1, min=0.5, max=10), stop=stop_after_delay(20)
)

# Credentials are taken from the aliases of the MinIO client (mc) config.
s3_config_data = json.loads(open(expanduser("~/.mc/config.json")).read())["aliases"]


def get_files(s3, bucket, prefix) -> Mapping[str, Tuple[int, str]]:
    res = {}
    prefix_len = len(prefix)
    for obj in tqdm.tqdm(
        s3.list_objects(bucket, prefix=prefix, recursive=True), desc="S3 list objects"
    ):
        if obj.is_dir:
            continue
        res[obj.object_name[prefix_len:].lstrip("/")] = (obj.size, obj.etag)
    return res


def save_files(files: Mapping[str, Tuple[int, str]], path):
    processed_files = {key: list(val) for key, val in files.items()}
    with open(path, "w") as f:
        f.write(
            json.dumps(processed_files, indent=4, ensure_ascii=False, sort_keys=True)
        )


def get_file_metrics(files: Mapping[str, Tuple[int, str]]):
    total_size = sum([x[0] for x in files.values()])
    return f"total {len(files)} files with size: {total_size}"


def build_s3(config_data, name):
    try:
        s3_credentials = config_data[name]
    except KeyError:
        print(f"Unknown s3: {name}")
        sys.exit(1)
    return minio.Minio(
        endpoint=re.sub(r"^https?://", "", s3_credentials["url"]),
        access_key=s3_credentials["accessKey"],
        secret_key=s3_credentials["secretKey"],
        secure=s3_credentials["url"].startswith("https"),
    )


def get_redundant_files(
    source: Mapping[str, Tuple[int, str]], target: Mapping[str, Tuple[int, str]]
) -> Set[str]:
    """
    :return: files that exist in target but not in source. Equal to `target - source`
    """
    res = set()
    for key in target:
        if key in source:
            continue
        res.add(key)
    return res


def get_different_files(
    source: Mapping[str, Tuple[int, str]], target: Mapping[str, Tuple[int, str]]
) -> Set[str]:
    """
    :return: all keys present in both source and target whose (size, etag) values differ
    """
    res = set()
    for key in source:
        if key in target and source[key] != target[key]:
            res.add(key)
    return res


@retry(**default_retry_args)
def get_object_data(s3: minio.Minio, bucket: str, object_name: str) -> bytes:
    return s3.get_object(bucket_name=bucket, object_name=object_name).read()


@retry(**default_retry_args)
def put_object_data(s3: minio.Minio, bucket: str, object_name: str, data: bytes):
    s3.put_object(
        bucket,
        object_name,
        data=io.BytesIO(data),
        length=len(data),
        num_parallel_uploads=1,
        part_size=150 * 1024 * 1024,
    )


def parse_args():
    parser = argparse.ArgumentParser(description="Reliable s3-mirror with logs")
    parser.add_argument(
        "--source", type=str, required=True, help="source path (with s3)"
    )
    parser.add_argument(
        "--target", type=str, required=True, help="target path (with s3)"
    )
    parser.add_argument(
        "-r",
        "--remove",
        default=False,
        action="store_true",
        help="remove redundant files",
    )
    return parser.parse_args()


def extract_s3_parts(val) -> Tuple[str, str, str]:
    s3_alias, bucket, path = re.findall(r"([^/]+)/([^/]+)/(.*)", val)[0]
    return s3_alias, bucket, path


def main():
    logging.info(f"Known s3: {list(s3_config_data.keys())}")
    args = parse_args()

    source_s3_alias, source_bucket, source_prefix = extract_s3_parts(args.source)
    logging.info(f"{source_s3_alias=}, {source_bucket=}, {source_prefix=}")
    source_s3 = build_s3(s3_config_data, source_s3_alias)

    target_s3_alias, target_bucket, target_prefix = extract_s3_parts(args.target)
    logging.info(f"{target_s3_alias=}, {target_bucket=}, {target_prefix=}")
    target_s3 = build_s3(s3_config_data, target_s3_alias)

    source_files = get_files(source_s3, bucket=source_bucket, prefix=source_prefix)
    save_files(source_files, "source.json")
    print(f"Source {get_file_metrics(source_files)}")

    target_files = get_files(target_s3, bucket=target_bucket, prefix=target_prefix)
    save_files(target_files, "target.json")
    print(f"Target initial {get_file_metrics(target_files)}")

    # Optionally delete objects that exist on the target but not on the source.
    if args.remove:
        redundant = get_redundant_files(source_files, target_files)
        if redundant:
            for file_to_remove in redundant:
                object_name = os.path.join(target_prefix, file_to_remove)
                logging.info(f"Removing redundant {target_bucket}:{object_name}")
                try:
                    target_s3.remove_object(
                        bucket_name=target_bucket, object_name=object_name
                    )
                except Exception as err:
                    print(
                        f"Unable to remove {target_bucket}/{object_name}: error {err}"
                    )
                del target_files[file_to_remove]
            print(f"Removed {len(redundant)} files")
            print(f"Target after removing redundant {get_file_metrics(target_files)}")

    new_files = get_redundant_files(target_files, source_files)
    print(f"New {len(new_files)} files")
    different_files = get_different_files(source_files, target_files)
    print(f"Different {len(different_files)} files")

    # Copy objects that are new or differ by (size, etag) from source to target.
    for key in tqdm.tqdm(new_files.union(different_files)):
        source_object = os.path.join(source_prefix, key)
        target_object = os.path.join(target_prefix, key)
        try:
            logging.info(
                f"Moving {source_bucket}:{source_object} to "
                f"{target_bucket}:{target_object}"
            )
            source_data = get_object_data(source_s3, source_bucket, source_object)
            put_object_data(
                target_s3,
                target_bucket,
                target_object,
                data=source_data,
            )
        except RetryError:
            logging.warning(
                "Retry on moving "
                f"{source_bucket}:{source_object} to "
                f"{target_bucket}:{target_object}"
            )


if __name__ == "__main__":
    logging.basicConfig(
        filename="s3-mirror.log",
        level=logging.INFO,
        format="[%(asctime)s] %(levelname)s - %(message)s",
    )
    main()

pyproject.toml (new file): 10 lines added

@@ -0,0 +1,10 @@
[tool.black]
line-length = 80
target-version = ['py38']
include = '\.pyi?$'

[tool.isort]
profile = "black"
py_version = "auto"
sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
known_local_folder = "src"

requirements.txt (new file): 3 lines added

@@ -0,0 +1,3 @@
minio==7.1.13
tenacity==8.2.1
tqdm==4.64.1