Skip to content

Commit

Permalink
pure-python implementation
Browse files (browse the repository at this point in the history)
  • Loading branch information
blublinsky committed May 2, 2024
1 parent fa91404 commit 00d5314
Show file tree
Hide file tree
Showing 64 changed files with 1,117 additions and 560 deletions.
65 changes: 32 additions & 33 deletions .github/workflows/build-images.yml
Original file line number Diff line number Diff line change
@@ -1,36 +1,35 @@

name: Build Transform Images

on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
jobs:
build-code:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Code Transforms
run: |
make -C transforms/code DOCKER=docker image test-image
build-universal:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker image test-image
build-tools:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Universal Transforms
run: |
make -C tools/ingest2parquet DOCKER=docker image test-image
build-code:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Code Transforms
run: |
make -C transforms/code DOCKER=docker image test-image
build-universal:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker image test-image
build-tools:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Universal Transforms
run: |
make -C tools/ingest2parquet DOCKER=docker image test-image
41 changes: 20 additions & 21 deletions .github/workflows/build-library.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@

name: Build Library

on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
jobs:
build-lib:
runs-on: ubuntu-latest
strategy:
matrix:
python:
- "3.11"
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build data-processing-lib
run: |
make -C data-processing-lib DOCKER=docker venv build
build-lib:
runs-on: ubuntu-latest
strategy:
matrix:
python:
- "3.11"
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build data-processing-lib
run: |
make -C data-processing-lib DOCKER=docker venv build
42 changes: 21 additions & 21 deletions .github/workflows/deploy-docs.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
name: deploy docs
name: deploy docs

on:
workflow_dispatch:
push:
branches:
- dev
workflow_dispatch:
push:
branches:
- dev
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.x
- run: pip install mkdocs-material mkdocstrings[python] mkdocs-badges
- run: |
# remove badges
cat README.md |sed '/img\.shields\.io/d' > ./data-processing-lib/doc/index.md
# copy repo docs to mkdocs `docs_dir`
cp doc/* ./data-processing-lib/doc/
# copy kfp tutorials to mkdocs `docs_dir`
cp kfp/doc/* ./data-processing-lib/doc/
cd data-processing-lib && mkdocs gh-deploy --force
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.x
- run: pip install mkdocs-material mkdocstrings[python] mkdocs-badges
- run: |
# remove badges
cat README.md |sed '/img\.shields\.io/d' > ./data-processing-lib/doc/index.md
# copy repo docs to mkdocs `docs_dir`
cp doc/* ./data-processing-lib/doc/
# copy kfp tutorials to mkdocs `docs_dir`
cp kfp/doc/* ./data-processing-lib/doc/
cd data-processing-lib && mkdocs gh-deploy --force
10 changes: 5 additions & 5 deletions .github/workflows/deploy-library.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ jobs:
name: Build and check packages
runs-on: ubuntu-latest
steps:
- name: Checkout
- name: Checkout
uses: actions/checkout@v4
with:
# for setuptools-scm
fetch-depth: 0
- name: Build Package for pypi
- name: Build Package for pypi
run: |
make -C data-processing-lib build
make -C data-processing-lib build
publish-test-pypi:
name: Publish packages to test.pypi.org
# disabled
# disabled
if: false
runs-on: ubuntu-latest
needs: build-package
Expand Down Expand Up @@ -56,4 +56,4 @@ jobs:
name: Packages
path: dist
- name: Upload to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
uses: pypa/gh-action-pypi-publish@release/v1
10 changes: 5 additions & 5 deletions .github/workflows/deploy-transforms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,22 @@ jobs:
name: Build and check images
runs-on: ubuntu-latest
steps:
- name: Checkout
- name: Checkout
uses: actions/checkout@v4
with:
# for setuptools-scm
fetch-depth: 0
- name: Build Package for pypi
- name: Build Package for pypi
run: |
make -C transforms image test-image
make -C transforms image test-image
publish-images:
name: Publish packages to quay.io
# disabled
# disabled
if: false
runs-on: ubuntu-latest
needs: build-images

steps:
- name: Push images to quay.io registry
run: |
make -C transforms publish
make -C transforms publish
141 changes: 70 additions & 71 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,74 +1,73 @@

name: Test CI

on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
on:
workflow_dispatch:
push:
branches:
- "dev"
pull_request:
branches:
- "dev"
jobs:
test-lib:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib
run: |
make -C data-processing-lib DOCKER=docker venv test
test-code:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test Code Transforms
run: |
make -C transforms/code DOCKER=docker venv test-src
test-universal:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker venv test-src
test-tools:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test tools
run: |
make -C tools DOCKER=docker venv test
test-kfp-lib:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test KFP lib
run: |
source kind/requirements.env
export PATH=$PATH:/tmp/
curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64
curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 /tmp/get_helm.sh
HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo
chmod 777 /tmp/helm
chmod 777 /tmp/kind
curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl
chmod 777 /tmp/kubectl
curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc
chmod +x /tmp/mc
export DEPLOY_KUBEFLOW=0
make -C kind setup
make -C kfp/kfp_support_lib build test
test-kfp-compile:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test KFP compile
run: |
make -C kfp/transform_workflows venv build
test-lib:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib
run: |
make -C data-processing-lib DOCKER=docker venv test
test-code:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test Code Transforms
run: |
make -C transforms/code DOCKER=docker venv test-src
test-universal:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker venv test-src
test-tools:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test tools
run: |
make -C tools DOCKER=docker venv test
test-kfp-lib:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test KFP lib
run: |
source kind/requirements.env
export PATH=$PATH:/tmp/
curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64
curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 /tmp/get_helm.sh
HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo
chmod 777 /tmp/helm
chmod 777 /tmp/kind
curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl
chmod 777 /tmp/kubectl
curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc
chmod +x /tmp/mc
export DEPLOY_KUBEFLOW=0
make -C kind setup
make -C kfp/kfp_support_lib build test
test-kfp-compile:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test KFP compile
run: |
make -C kfp/transform_workflows venv build
16 changes: 8 additions & 8 deletions data-processing-lib/doc/advanced-transform-tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,10 @@ import pyarrow as pa
import ray
from data_processing.data_access import DataAccessFactory
from data_processing.ray import (
DefaultTableTransformConfiguration,
DefaultTableTransformRuntime,
RayUtils,
TransformLauncher,
TableTransformConfigurationRay,
DefaultTableTransformRuntimeRay,
RayUtils,
TransformLauncherRay,
)
from data_processing.transform import AbstractTableTransform
from data_processing.utils import GB, TransformUtils
Expand All @@ -72,10 +72,10 @@ from ray.actor import ActorHandle

class EdedupTransform(AbstractTableTransform):

def __init__(self, config: dict):
super().__init__(config)
self.doc_column = config.get("doc_column", "")
self.hashes = config.get("hashes", [])
def __init__(self, config: dict):
super().__init__(config)
self.doc_column = config.get("doc_column", "")
self.hashes = config.get("hashes", [])
```
The `EdedupTransform` class extends `AbstractTableTransform`, which defines the required methods.

Expand Down
Loading

0 comments on commit 00d5314

Please sign in to comment.