Skip to content

Commit

Permalink
Merge pull request #2 from DS4SD/DaemonMode
Browse files Browse the repository at this point in the history
feat: Start Command
  • Loading branch information
gabe-l-hart authored Jan 30, 2025
2 parents 8d0a7a8 + 7780cc3 commit 2f53857
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 4 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ ragnardoc init
ragnardoc add ~/Documents
# Run an ingestion
ragnardoc run
# Start as a background service
ragnardoc start & disown
```

## Configuration
Expand Down Expand Up @@ -91,4 +93,3 @@ ingestion:
- Per-ingestor inclusion / exclusion
- Abstract scrapers to allow non-local scraping
- Service mode!
2 changes: 2 additions & 0 deletions ragnardoc/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
from .common import add_common, use_common
from .init import InitCommand
from .run import RunCommand
from .start import StartCommand

# Registry of the CLI sub-commands, keyed by each command class's `name`
all_commands = {
    command_cls.name: command_cls
    for command_cls in (AddCommand, InitCommand, RunCommand, StartCommand)
}
82 changes: 82 additions & 0 deletions ragnardoc/cli/start.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
The start command initializes ragnardoc to run as a service that continuously
maintains the state of your documents in all of your connected RAG apps.
"""
# Standard
from datetime import timedelta
import argparse
import re
import shlex
import subprocess
import sys
import threading
import time

# First Party
import alog

# Local
from .. import config
from .base import CommandBase

log = alog.use_channel("START")


class StartCommand(CommandBase):
    __doc__ = __doc__
    name = "start"

    def __init__(self):
        # Default period comes from the `service.period` config entry
        self._period = self._parse_time(config.service.period)
        # Quote each argument so that the shlex.split in _ingest reproduces
        # the exact argument list even when the interpreter path contains
        # spaces or backslashes (e.g. on Windows)
        self._cmd = " ".join(
            shlex.quote(part)
            for part in (sys.executable, "-m", "ragnardoc", "run")
        )
        self._running = False
        # Set by stop() so that the sleep between runs can be interrupted
        self._stop_event = threading.Event()

    def add_args(self, parser: argparse.ArgumentParser):
        """Add the args to configure the periodic scraping

        Args:
            parser: The argument parser for this command
        """
        parser.add_argument(
            "--period",
            "-p",
            default=None,
            help="The period to run the ingestion service",
        )

    def stop(self):
        """Request that the service loop in run() exit. Any in-progress sleep
        between ingestion runs is woken up immediately.
        """
        self._running = False
        self._stop_event.set()

    def run(self, args: argparse.Namespace):
        """Start the infinite loop to run periodically

        Args:
            args: Parsed command line args. If `args.period` is set, it
                overrides the period taken from the config.

        Raises:
            ValueError: If `args.period` is not a valid time string
        """
        period = self._period
        if args.period:
            period = self._parse_time(args.period)
        self._stop_event.clear()
        self._running = True
        while self._running:
            log.info("Running ingestion service")
            self._ingest()
            log.info("Sleeping for %s", period)
            # Wait on the event rather than time.sleep so that stop() takes
            # effect right away instead of after a full period
            self._stop_event.wait(period.total_seconds())

    def _ingest(self):
        """Run the ingestion as a subprocess. This is done so that config
        changes are re-parsed on every run.
        """
        with alog.ContextTimer(log.debug, "Ingestion done in: %s"):
            result = subprocess.run(shlex.split(self._cmd), check=False)
        # A failed ingestion should not take down the service, but it should
        # not pass silently either
        if result.returncode != 0:
            log.warning(
                "Ingestion exited with non-zero return code: %s",
                result.returncode,
            )

    @staticmethod
    def _parse_time(time_str: str) -> timedelta:
        """Parse a time string into a timedelta object

        The string is scanned for <number><unit> tokens where unit is one of
        d, h, m, or s (e.g. "1d 2h 35s"). All tokens found are summed; text
        that does not match a token is ignored.

        Args:
            time_str: The string to parse

        Returns:
            The total duration of all tokens found

        Raises:
            ValueError: If no tokens are found or the total is zero
        """
        pattern = r"(\d+\.?\d*)([dhms])\s*"
        seconds = 0
        for match in re.finditer(pattern, time_str):
            value = float(match.group(1))
            unit = match.group(2)
            if unit == "s":
                seconds += value
            elif unit == "m":
                seconds += value * 60
            elif unit == "h":
                seconds += value * 60 * 60
            elif unit == "d":
                seconds += value * 60 * 60 * 24
        if not seconds:
            raise ValueError(f"Invalid time string: {time_str}")
        return timedelta(seconds=seconds)
4 changes: 4 additions & 0 deletions ragnardoc/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ scraping:
paths: []
regexprs: []

# Scraping service config
service:
period: 5m

# Document ingestion config
ingestion:
# Factory list of ingestion plugins to ingest to
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [[ "${pytest_opts[*]}" != *"tests/"* ]]; then
--cov=ragnardoc
--cov-report=term
--cov-report=html
--cov-fail-under=40
--cov-fail-under=56
)
fi

Expand Down
61 changes: 61 additions & 0 deletions tests/cli/test_start.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Unit tests for the start command
"""
# Standard
from datetime import timedelta
from unittest import mock
import argparse
import threading
import time

# Third Party
import pytest

# First Party
import aconfig

# Local
from ragnardoc.cli.start import StartCommand


@pytest.mark.parametrize(
    ["time_str", "expected_delta"],
    [
        ("35s", timedelta(seconds=35)),
        ("1d 2h 35s", timedelta(days=1, hours=2, seconds=35)),
        ("16s 6h", timedelta(hours=6, seconds=16)),
        ("2hours 1minute", timedelta(hours=2, minutes=1)),
        ("0.5s", timedelta(milliseconds=500)),
    ],
)
def test_parse_time(time_str, expected_delta):
    """Valid time strings (single unit, multiple units in any order, long
    unit names, and fractional values) parse to the expected duration
    """
    parsed = StartCommand._parse_time(time_str)
    assert parsed == expected_delta


@pytest.mark.parametrize("time_str", ["", " ", "1 d", "1w"])
def test_parse_time_invalid(time_str):
    """Strings with no recognizable <number><unit> token are rejected with a
    ValueError
    """
    parse = StartCommand._parse_time
    with pytest.raises(ValueError):
        parse(time_str)


@mock.patch("subprocess.run")
def test_run(run_mock):
    """Test that running the command launches the infinite loop correctly and
    that stop() terminates it
    """
    # Set from the worker thread once the first ingestion has been launched.
    # Waiting on this (rather than sleeping for a fixed time) guarantees that
    # stop() is not called before run() has marked itself as running, which
    # would leave the loop spinning forever.
    first_ingest = threading.Event()

    def _mark_ingest(*_, **__):
        first_ingest.set()
        # Returning DEFAULT keeps the mock's normal return value
        return mock.DEFAULT

    run_mock.side_effect = _mark_ingest

    cmd = StartCommand()
    args = aconfig.Config({"period": "0.1s"}, override_env_vars=False)
    # Daemon thread so that a failure here can never hang the test process
    run_thread = threading.Thread(target=cmd.run, args=(args,), daemon=True)
    run_thread.start()
    assert first_ingest.wait(timeout=5), "Ingestion was never launched"
    cmd.stop()
    run_thread.join(timeout=5)
    assert not run_thread.is_alive(), "Service loop did not stop"
    run_mock.assert_called_once()


def test_add_args():
    """The start command registers a `period` argument on the parser"""
    arg_parser = argparse.ArgumentParser()
    command = StartCommand()
    command.add_args(arg_parser)
    parsed = arg_parser.parse_args([])
    assert hasattr(parsed, "period")
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,19 @@ def txt_data_file(data_dir):
def scratch_dir():
with tempfile.TemporaryDirectory() as dirname:
yield Path(dirname)


# Force RAGNARDOC_HOME to be a temporary directory that will be auto-cleaned up.
# This is done while importing conftest.py to avoid the import-time config
# parsing where user config is merged.
_tempdir = tempfile.TemporaryDirectory(suffix="ragnardoc")
os.environ["RAGNARDOC_HOME"] = _tempdir.name


@pytest.fixture(autouse=True)
def ignore_user_config():
    """Autouse fixture that points RAGNARDOC_HOME at a fresh temporary
    directory for the duration of each test.

    NOTE(review): RAGNARDOC_HOME is not restored after the test, so between
    tests it still names the per-test directory that has already been
    removed -- confirm this is intended.
    """
    with tempfile.TemporaryDirectory() as temp_home:
        # Remove the import-time placeholder directory before the test runs
        _tempdir.cleanup()
        os.environ["RAGNARDOC_HOME"] = temp_home
        yield
    # Clean up the import-time placeholder directory again after the test
    _tempdir.cleanup()
17 changes: 15 additions & 2 deletions tests/test_types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
Unit tests for core types
"""
# Standard
import time

# Local
from ragnardoc import types
Expand Down Expand Up @@ -60,10 +62,20 @@ def test_set_content():
def test_fingerprint_content_change(scratch_dir, data_dir):
"""Test that the doc's fingerprint is computed correctly and mirrors changes
to the document itself
NOTE: This test in a previous version did expose a weakness in the current
fingerprint implementation that uses os.stat: If the content changes to
something of the exact same length AND the change happens so quickly
after the initial write that the timestamp precision results in an exact
equivalent, there's no way to tell the difference with os.stat! This is
an acceptable limitation given that in ragnardoc, changes would be made
by users generally who are not operating at this speed.
"""
doc_path = scratch_dir / "doc.txt"
content1 = "Hello World"
content2 = "Hiya world!"
# NOTE: This was previously "Hiya world!" which exposed the race condition
# with os.stat being exactly equivalent.
content2 = "Hiya world! How's life these days?"
with open(doc_path, "w") as handle:
handle.write(content1)

Expand All @@ -77,8 +89,9 @@ def test_fingerprint_content_change(scratch_dir, data_dir):
assert doc.content == read_content1 == content1

# Update the doc content
with open(doc_path, "w") as handle:
with open(doc.path, "w") as handle:
handle.write(content2)
handle.flush()

# Make sure the fingerprint changes and the content is invalidated and
# re-loaded
Expand Down

0 comments on commit 2f53857

Please sign in to comment.