epoch8 · seriozh1 · Feb 6, 2023 · Feb 6, 2023 · Feb 6, 2023 · Feb 6, 2023
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -25,7 +25,7 @@
     "[python]": {
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.formatOnSave": true
-      },
+    },
     "githubIssues.queries": [
         {
             "label": "My Issues",

diff --git a/datapipe/compute.py b/datapipe/compute.py
@@ -4,6 +4,8 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Protocol
 
+import ray
+
 from opentelemetry import trace
 
 from datapipe.datatable import DataStore, DataTable
@@ -293,6 +295,34 @@ def run_pipeline(
     run_steps(ds, steps, run_config)
 
 
+@ray.remote
+def step_worker(
+    step: ComputeStep,
+    ds: DataStore,
+    steps: List[ComputeStep],
+    changelist: ChangeList,
+    run_config: Optional[RunConfig] = None,
+):
+    """Ray task: run `step` on `changelist`, then schedule steps fed by its output."""
+    print(step.get_name())  # public accessor, as used by the sequential runner
+    result_changelist = step.run_changelist(ds, changelist, run_config)
+    schedule_runtime(ds, steps, result_changelist, run_config)
+
+
+def schedule_runtime(
+    ds: DataStore,
+    steps: List[ComputeStep],
+    changelist: ChangeList,
+    run_config: Optional[RunConfig] = None,
+):
+    """Submit each step that reads a changed table to Ray exactly once."""
+    if changelist:
+        changed_tables = set(changelist.changes.keys())
+        for step in steps:  # NOTE(review): task refs are dropped, nothing awaits them
+            if any(dt.name in changed_tables for dt in step.get_input_dts()):
+                step_worker.remote(step, ds, steps, changelist, run_config)
+
+
 def run_changelist(
     ds: DataStore,
     catalog: Catalog,
@@ -310,29 +340,34 @@ def run_steps_changelist(
     steps: List[ComputeStep],
     changelist: ChangeList,
     run_config: Optional[RunConfig] = None,
+    parallel_runtime: Optional[bool] = True,
 ) -> None:
-    current_changes = changelist
-    next_changes = ChangeList()
-    iteration = 0
-
-    with tracer.start_as_current_span("Start pipeline for changelist"):
-        while not current_changes.empty() and iteration < 100:
-            with tracer.start_as_current_span("run_steps"):
-                for step in steps:
-                    with tracer.start_as_current_span(
-                        f"{step.get_name()} "
-                        f"{[i.name for i in step.get_input_dts()]} -> {[i.name for i in step.get_output_dts()]}"
-                    ):
-                        logger.info(
-                            f"Running {step.get_name()} "
+    if parallel_runtime:
+        schedule_runtime(ds, steps, changelist, run_config)
+
+    else:
+        current_changes = changelist
+        next_changes = ChangeList()
+        iteration = 0
+
+        with tracer.start_as_current_span("Start pipeline for changelist"):
+            while not current_changes.empty() and iteration < 100:
+                with tracer.start_as_current_span("run_steps"):
+                    for step in steps:
+                        with tracer.start_as_current_span(
+                            f"{step.get_name()} "
                             f"{[i.name for i in step.get_input_dts()]} -> {[i.name for i in step.get_output_dts()]}"
-                        )
-
-                        step_changes = step.run_changelist(
-                            ds, current_changes, run_config
-                        )
-                        next_changes.extend(step_changes)
-
-            current_changes = next_changes
-            next_changes = ChangeList()
-            iteration += 1
+                        ):
+                            logger.info(
+                                f"Running {step.get_name()} "
+                                f"{[i.name for i in step.get_input_dts()]} -> {[i.name for i in step.get_output_dts()]}"
+                            )
+
+                            step_changes = step.run_changelist(
+                                ds, current_changes, run_config
+                            )
+                            next_changes.extend(step_changes)
+
+                current_changes = next_changes
+                next_changes = ChangeList()
+                iteration += 1
diff --git a/datapipe/core_steps.py b/datapipe/core_steps.py
@@ -15,6 +15,7 @@
 
 import tqdm
 from opentelemetry import trace
+import ray
 
 from datapipe.compute import (
     Catalog,
@@ -43,7 +44,7 @@ def do_batch_transform(
     idx_count: Optional[int] = None,
     kwargs: Optional[Dict[str, Any]] = None,
     run_config: Optional[RunConfig] = None,
-) -> Iterator[ChangeList]:
+) -> Iterable[ChangeList]:
     """
     Множественная инкрементальная обработка `input_dts' на основе изменяющихся индексов
     """
@@ -54,80 +55,108 @@ def do_batch_transform(
         # Nothing to process
         return [ChangeList()]
 
-    for idx in tqdm.tqdm(idx_gen, total=idx_count):
-        with tracer.start_as_current_span("process batch"):
-            logger.debug(f"Idx to process: {idx.to_records()}")
+    batch_results = [
+        do_one_batch_transform.remote(
+            func=func,
+            ds=ds,
+            input_dts=input_dts,
+            output_dts=output_dts,
+            idx=idx,
+            kwargs=kwargs,
+            run_config=run_config,
+        )
+        for idx in idx_gen
+    ]
+
+    return ray.get(batch_results)
+
+
+@ray.remote
+def do_one_batch_transform(
+    func: BatchTransformFunc,
+    ds: DataStore,
+    input_dts: List[DataTable],
+    output_dts: List[DataTable],
+    idx: IndexDF,
+    kwargs: Optional[Dict[str, Any]] = None,
+    run_config: Optional[RunConfig] = None,
+) -> ChangeList:
 
-            with tracer.start_as_current_span("get input data"):
+    logger = logging.getLogger("datapipe.core_steps")
+    tracer = trace.get_tracer("datapipe.core_steps")
+
+    with tracer.start_as_current_span("process batch"):
+        logger.debug(f"Idx to process: {idx.to_records()}")
+        changes = ChangeList()
+
+        with tracer.start_as_current_span("get input data"):
+            try:
+                input_dfs = [inp.get_data(idx) for inp in input_dts]
+            except Exception as e:
+                logger.error(f"Get input data failed: {str(e)}")
+                ds.event_logger.log_exception(
+                    e,
+                    run_config=RunConfig.add_labels(
+                        run_config, {"idx": idx.to_dict(orient="records")}
+                    ),
+                )
+
+                return changes
+
+        if sum(len(j) for j in input_dfs) > 0:
+            with tracer.start_as_current_span("run transform"):
                 try:
-                    input_dfs = [inp.get_data(idx) for inp in input_dts]
+                    chunks_df = func(*input_dfs, **kwargs or {})
                 except Exception as e:
-                    logger.error(f"Get input data failed: {str(e)}")
+                    logger.error(f"Transform failed ({func.__name__}): {str(e)}")
                     ds.event_logger.log_exception(
                         e,
                         run_config=RunConfig.add_labels(
                             run_config, {"idx": idx.to_dict(orient="records")}
                         ),
                     )
 
-                    continue
-
-            changes = ChangeList()
-
-            if sum(len(j) for j in input_dfs) > 0:
-                with tracer.start_as_current_span("run transform"):
-                    try:
-                        chunks_df = func(*input_dfs, **kwargs or {})
-                    except Exception as e:
-                        logger.error(f"Transform failed ({func.__name__}): {str(e)}")
-                        ds.event_logger.log_exception(
-                            e,
-                            run_config=RunConfig.add_labels(
-                                run_config, {"idx": idx.to_dict(orient="records")}
-                            ),
-                        )
+                    return changes
+
+            if isinstance(chunks_df, (list, tuple)):
+                assert len(chunks_df) == len(output_dts)
+            else:
+                assert len(output_dts) == 1
+                chunks_df = [chunks_df]
 
-                        continue
-
-                if isinstance(chunks_df, (list, tuple)):
-                    assert len(chunks_df) == len(output_dts)
-                else:
-                    assert len(output_dts) == 1
-                    chunks_df = [chunks_df]
-
-                with tracer.start_as_current_span("store output batch"):
-                    try:
-                        for k, res_dt in enumerate(output_dts):
-                            # Берем k-ое значение функции для k-ой таблички
-                            # Добавляем результат в результирующие чанки
-                            change_idx = res_dt.store_chunk(
-                                data_df=chunks_df[k],
-                                processed_idx=idx,
-                                run_config=run_config,
-                            )
-
-                            changes.append(res_dt.name, change_idx)
-                    except Exception as e:
-                        logger.error(f"Store output batch failed: {str(e)}")
-                        ds.event_logger.log_exception(
-                            e,
-                            run_config=RunConfig.add_labels(
-                                run_config, {"idx": idx.to_dict(orient="records")}
-                            ),
+            with tracer.start_as_current_span("store output batch"):
+                try:
+                    for k, res_dt in enumerate(output_dts):
+                        # Use the k-th function result for the k-th table
+                        # Add the result to the output chunks
+                        change_idx = res_dt.store_chunk(
+                            data_df=chunks_df[k],
+                            processed_idx=idx,
+                            run_config=run_config,
                         )
 
-                        continue
+                        changes.append(res_dt.name, change_idx)
+                except Exception as e:
+                    logger.error(f"Store output batch failed: {str(e)}")
+                    ds.event_logger.log_exception(
+                        e,
+                        run_config=RunConfig.add_labels(
+                            run_config, {"idx": idx.to_dict(orient="records")}
+                        ),
+                    )
+
+                    return changes
 
-            else:
-                with tracer.start_as_current_span("delete missing data from output"):
-                    for k, res_dt in enumerate(output_dts):
-                        del_idx = res_dt.meta_table.get_existing_idx(idx)
+        else:
+            with tracer.start_as_current_span("delete missing data from output"):
+                for k, res_dt in enumerate(output_dts):
+                    del_idx = res_dt.meta_table.get_existing_idx(idx)
 
-                        res_dt.delete_by_idx(del_idx, run_config=run_config)
+                    res_dt.delete_by_idx(del_idx, run_config=run_config)
 
-                        changes.append(res_dt.name, del_idx)
+                    changes.append(res_dt.name, del_idx)
 
-            yield changes
+        return changes
 
 
 def do_full_batch_transform(

diff --git a/datapipe/store/database.py b/datapipe/store/database.py
@@ -27,6 +27,8 @@
 tracer = trace.get_tracer("datapipe.store.database")
 
 
+SQLAlchemyInstrumentor().instrument()
+
 SCHEMA_TO_DTYPE_LOOKUP = {
     String: str,
     Integer: int,
@@ -60,8 +62,6 @@ def _init(self, connstr: str, schema: Optional[str]) -> None:
             poolclass=SingletonThreadPool,
         )
 
-        SQLAlchemyInstrumentor().instrument(engine=self.con)
-
         self.sqla_metadata = MetaData(schema=schema)
 
     def __getstate__(self):

diff --git a/examples/rq_simple/app.py b/examples/rq_simple/app.py
@@ -0,0 +1,32 @@
+import time
+from rq import Queue
+from redis import Redis
+from simple_functions import stepAB, stepBD, stepBE, stepCE
+
+
+class Step:
+    """One pipeline edge: `func` reads table `input_dt` and writes `output_dt`."""
+
+    def __init__(self, func, input_dt, output_dt):
+        self.func, self.input_dt, self.output_dt = func, input_dt, output_dt
+
+
+g = {  # changed table name -> steps that take that table as input
+    "dtA": [Step(stepAB, "dtA", "dtB")],
+    "dtB": [Step(stepBD, "dtB", "dtD"), Step(stepBE, "dtB", "dtE")],
+    "dtC": [Step(stepCE, "dtC", "dtE")],
+}
+
+
+redis_conn = Redis()  # no arguments: uses the redis-py client defaults
+queue = Queue(connection=redis_conn)  # no queue name given, so rq's default is used
+
+
+def run_changelist_realtime(dt_changed, after=None):
+    """Enqueue all steps downstream of `dt_changed`; each job waits for `after`."""
+    for step in g.get(dt_changed, ()):
+        job = queue.enqueue(step.func, step.input_dt, step.output_dt, depends_on=after)
+        run_changelist_realtime(step.output_dt, after=job)
+
+
+run_changelist_realtime("dtA")  # runs at import time; starts the cascade at dtA
diff --git a/examples/rq_simple/simple_functions.py b/examples/rq_simple/simple_functions.py
@@ -0,0 +1,35 @@
+import time
+import pandas as pd
+
+n = 10  # number of rows in every demo table
+dts_store = {  # table name -> DataFrame with one string column "f"
+    "dtA": pd.DataFrame({"f": [str(i) for i in range(n)]}),
+    "dtB": pd.DataFrame({"f": [str(i) for i in range(n)]}),
+    "dtC": pd.DataFrame({"f": [str(i) for i in range(n)]}),
+    "dtD": pd.DataFrame({"f": [str(i) for i in range(n)]}),
+    "dtE": pd.DataFrame({"f": [str(i) for i in range(n)]}),
+}  # NOTE(review): per-process state; confirm this is intended when run by rq workers
+
+
+def stepAB(input_dt, output_dt):  # output "f" = input "f" + "AB", element-wise
+    print("stepAB")
+    dts_store[output_dt]["f"] = dts_store[input_dt]["f"] + "AB"
+    time.sleep(2)  # fixed 2 s pause, presumably simulating work
+
+
+def stepBD(input_dt, output_dt):  # output "f" = input "f" + "BD", element-wise
+    print("stepBD")
+    dts_store[output_dt]["f"] = dts_store[input_dt]["f"] + "BD"
+    time.sleep(2)  # fixed 2 s pause, presumably simulating work
+
+
+def stepBE(input_dt, output_dt):  # output "f" = input "f" + "BE", element-wise
+    print("stepBE")
+    dts_store[output_dt]["f"] = dts_store[input_dt]["f"] + "BE"
+    time.sleep(2)  # fixed 2 s pause, presumably simulating work
+
+
+def stepCE(input_dt, output_dt):  # output "f" = input "f" + "CE", element-wise
+    print("stepCE")
+    dts_store[output_dt]["f"] = dts_store[input_dt]["f"] + "CE"
+    time.sleep(2)  # fixed 2 s pause, presumably simulating work