Skip to content

Commit 7975a70

Browse files
authored
Persistent logs, with uploading to object storage (#28, PR #31)
1 parent 01e473d commit 7975a70

12 files changed

+590
-8
lines changed

kubernetes.yaml

+33-2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ spec:
5151
mountPath: /opt/app/scrapyd_k8s.conf
5252
readOnly: true
5353
subPath: scrapyd_k8s.conf
54+
#- name: joblogs
55+
# mountPath: /data
5456
# Enable if your spider repository needs a pull secret
5557
# - name: scrapyd-k8s-pull-secret
5658
# mountPath: /opt/app/.docker
@@ -59,6 +61,9 @@ spec:
5961
- configMap:
6062
name: scrapyd-k8s-config
6163
name: scrapyd-k8s-config
64+
#- name: joblogs
65+
# persistentVolumeClaim:
66+
# claimName: pv-claim
6267
# Enable if your spider repository needs a pull secret
6368
# - secret:
6469
# secretName: pull-secret
@@ -80,7 +85,7 @@ data:
8085
8186
repository = scrapyd_k8s.repository.Remote
8287
launcher = scrapyd_k8s.launcher.K8s
83-
88+
8489
namespace = default
8590
8691
# This is an example spider that should work out of the box.
@@ -106,6 +111,29 @@ metadata:
106111
app.kubernetes.io/name: spider-example
107112
stringData:
108113
FOO_API_KEY: "1234567890abcdef"
114+
#---
115+
#apiVersion: v1
116+
#kind: PersistentVolume
117+
#metadata:
118+
# name: pv-volume
119+
#spec:
120+
# capacity:
121+
# storage: 5Gi
122+
# accessModes:
123+
# - ReadWriteOnce
124+
# hostPath:
125+
# path: "/mnt/data"
126+
#---
127+
#apiVersion: v1
128+
#kind: PersistentVolumeClaim
129+
#metadata:
130+
# name: pv-claim
131+
#spec:
132+
# accessModes:
133+
# - ReadWriteOnce
134+
# resources:
135+
# requests:
136+
# storage: 5Gi
109137
---
110138
apiVersion: v1
111139
kind: ConfigMap
@@ -144,10 +172,13 @@ metadata:
144172
rules:
145173
- apiGroups: [""]
146174
resources: ["pods"]
147-
verbs: ["get", "list"]
175+
verbs: ["get", "list", "watch"]
148176
- apiGroups: [""]
149177
resources: ["pods/exec"]
150178
verbs: ["get"]
179+
- apiGroups: [""]
180+
resources: ["pods/log"]
181+
verbs: ["get"]
151182
- apiGroups: ["batch"]
152183
resources: ["jobs"]
153184
verbs: ["get", "list", "create", "delete"]

requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ kubernetes>=27.2.0 # introduction of suspend in jobspec
33
flask>=2.0.0
44
natsort>=8.0.0
55
Flask-BasicAuth>=0.2.0
6+
MarkupSafe>=2.1.5
7+
apache-libcloud>=3.8.0

scrapyd_k8s.sample-k8s.conf

+15
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,18 @@ requests_cpu = 80m
5757
requests_memory = 0.12G
5858
limits_cpu = 0.5
5959
limits_memory = 0.2G
60+
61+
#[joblogs]
62+
# Choose storage provider
63+
#storage_provider = s3
64+
#container_name = scrapyd-k8s-example-bucket
65+
66+
# Choose number of unique logs, but at least 2
67+
#num_lines_to_check = 2
68+
69+
#[joblogs.storage.s3]
70+
# Set your S3 key via the S3_KEY environment variable, or enter it directly below
71+
#key = ${S3_KEY}
72+
# Set your S3 secret key via the S3_SECRET environment variable, or enter it directly below
73+
#secret = ${S3_SECRET}
74+
#region = eu-north-1

scrapyd_k8s/__main__.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1-
from .api import run
1+
import logging
2+
import sys
3+
from .api import run, config
4+
from .joblogs import joblogs_init
25

3-
run()
6+
def setup_logging():
    """
    Configure logging so that INFO-and-above records are written to stdout.

    Records are formatted as ``<time> <logger name> [<level>]: <message>``.

    Returns
    -------
    None
    """
    # A single stream handler bound to stdout is handed to basicConfig.
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)s [%(levelname)s]: %(message)s',
        handlers=[stdout_handler],
    )
14+
15+
if __name__ == "__main__":
    # Order matters: configure logging first, then initialise job-log
    # handling from the shared config, and only then start the API.
    setup_logging()
    joblogs_init(config)
    run()

scrapyd_k8s/api.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/usr/bin/env python3
22
import uuid
3+
import logging
34

45
from flask import Flask, request, Response, jsonify
56
from flask_basicauth import BasicAuth
6-
from markupsafe import escape
77
from natsort import natsort_keygen, ns
88

99
from .config import Config
@@ -13,7 +13,7 @@
1313
repository = (config.repository_cls())(config)
1414
launcher = (config.launcher_cls())(config)
1515
scrapyd_config = config.scrapyd()
16-
16+
logger = logging.getLogger(__name__)
1717

1818
@app.get("/")
1919
def home():
@@ -155,5 +155,11 @@ def run():
155155
if config_username is not None and config_password is not None:
156156
enable_authentication(app, config_username, config_password)
157157

158+
if config.joblogs() is not None:
159+
launcher.enable_joblogs(config)
160+
logger.info("Job logs handling enabled.")
161+
else:
162+
logger.debug("Job logs handling not enabled; 'joblogs' configuration section is missing.")
163+
158164
# run server
159165
app.run(host=host, port=port)

scrapyd_k8s/config.py

+14
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,27 @@ def launcher_cls(self):
2121
pkg, cls = repo.rsplit('.', 1)
2222
return getattr(import_module(pkg), cls)
2323

24+
def joblogs(self):
    """Return the ``joblogs`` configuration section, or None when it is absent."""
    if not self._config.has_section('joblogs'):
        return None
    return self._config['joblogs']
29+
30+
def joblogs_storage(self, provider):
    """
    Return the ``joblogs.storage.<provider>`` configuration section.

    Returns None when no section exists for the given provider.
    """
    section_name = 'joblogs.storage.%s' % provider
    if self._config.has_section(section_name):
        return self._config[section_name]
    return None
34+
2435
def listprojects(self):
2536
return self._projects
2637

2738
def project(self, project):
2839
if project in self._projects:
2940
return ProjectConfig(self._config, project, self._config['project.' + project])
3041

42+
def namespace(self):
    """Return the ``namespace`` value from the scrapyd settings, defaulting to 'default'."""
    scrapyd_settings = self.scrapyd()
    return scrapyd_settings.get('namespace', 'default')
44+
3145
class ProjectConfig:
3246
def __init__(self, config, projectid, projectconfig):
3347
self._id = projectid

scrapyd_k8s/joblogs/__init__.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import logging
2+
from scrapyd_k8s.joblogs.log_handler_k8s import KubernetesJobLogHandler
3+
4+
logger = logging.getLogger(__name__)
5+
6+
def joblogs_init(config):
    """
    Start the Kubernetes job log handler if a storage provider is configured.

    Parameters
    ----------
    config : Config
        Configuration object. Its ``joblogs()`` section is checked for a
        ``storage_provider`` entry, and the object itself is passed on to
        the log handler.

    Returns
    -------
    None
    """
    section = config.joblogs()

    # Without a usable section or a storage provider there is nothing to start.
    if not section or section.get('storage_provider') is None:
        logger.warning("No storage provider configured; job logs will not be uploaded.")
        return

    handler = KubernetesJobLogHandler(config)
    handler.start()
    logger.info("Job logs handler started.")

0 commit comments

Comments
 (0)