Commit 249910d

Improve job logs (#668, PR #42)
* keep retrying reconnection (see the backoff sketch below)
* cap backoff time at 15 min
* configure logging in a separate file, so the logging configuration is set up before other modules inherit it
* add debug logs to the watcher to better monitor each step
* make the logging level configurable in the config file
* change the timeout_seconds parameter for the k8s API connection from 0 to 300 to reset the connection periodically
* add locking around events such as subscribe, unsubscribe and event dispatching
* change the timeout_seconds watcher parameter back to 0 to try and hold the connection forever
* make the directory for logs configurable and required for joblogs
* remove the random component from log file names, keeping the job_id
1 parent a857ae5 commit 249910d
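
Several of these changes describe a reconnection strategy for the pod watcher: keep retrying forever, with the wait between attempts growing but capped at 15 minutes. A minimal sketch of such a capped exponential backoff loop (the function and constant names here are illustrative, not the identifiers used in the commit):

    import logging
    import time

    logger = logging.getLogger(__name__)

    BACKOFF_INITIAL_SECONDS = 1    # assumed starting delay
    BACKOFF_CAP_SECONDS = 15 * 60  # the 15-minute cap from the commit message

    def watch_with_retries(watch_once):
        # Run one pass of the watcher; on failure wait, double the delay,
        # and never sleep longer than the cap. Retry indefinitely.
        backoff = BACKOFF_INITIAL_SECONDS
        while True:
            try:
                watch_once()
                backoff = BACKOFF_INITIAL_SECONDS  # reset after a clean pass
            except Exception as e:
                logger.debug("Watcher disconnected: %s; retrying in %d s", e, backoff)
                time.sleep(backoff)
                backoff = min(backoff * 2, BACKOFF_CAP_SECONDS)

The commit message also touches the watch API's `timeout_seconds` parameter: a positive value makes the server close the connection periodically, forcing a clean reconnect, while 0 asks the server to hold the connection open indefinitely.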

File tree

3 files changed: +86 −43

* CONFIG.md
* kubernetes.yaml
* scrapyd_k8s/joblogs/log_handler_k8s.py

CONFIG.md (+4)

@@ -45,6 +45,10 @@ For Kubernetes, it is important to set resource limits.
 
 TODO: explain how to set limits, with default, project and spider specificity.
 
+### [joblogs] section
+* `logs_dir` - a directory to store log files collected on the k8s cluster (implemented only for Kubernetes). When configuring this, keep in mind that the Dockerfile sets `USER` to `nobody`, so not all directories are writable; creating a child directory under `/tmp` avoids permission problems.
+
+
 
 ### Kubernetes API interaction
 
kubernetes.yaml (+3)

@@ -104,6 +104,9 @@ data:
     requests_memory = 0.2G
     limits_cpu = 0.8
     limits_memory = 0.5G
+
+    [joblogs]
+    logs_dir = /tmp/scrapyd_k8s_logs
 ---
 apiVersion: v1
 kind: Secret

scrapyd_k8s/joblogs/log_handler_k8s.py (+79 −43)
@@ -12,39 +12,46 @@ class KubernetesJobLogHandler:
     """
     A class to handle Kubernetes job logs by watching pods, streaming logs, and uploading them to object storage.
 
-    ...
+    This class:
+    - Observes Kubernetes pods for job-related events.
+    - Streams logs from running pods, storing them locally.
+    - Uploads completed job logs to object storage.
+    - Retrieves and concatenates log files as needed.
 
     Attributes
     ----------
     DEFAULT_BLOCK_SIZE : int
-        Default size (in bytes) of blocks to read when retrieving the last N lines from a file.
+        Default size (in bytes) of blocks to read when retrieving lines from a file.
     config : object
         Configuration object containing settings for job logs and storage.
     watcher_threads : dict
         Dictionary to keep track of watcher threads for each pod.
-    pod_tmp_mapping : dict
-        Mapping of pod names to their temporary log file paths.
     namespace : str
         Kubernetes namespace to watch pods in.
     num_lines_to_check : int
-        Number of lines to check for matching logs when resuming log streaming.
+        Number of lines to check from the end of the existing log file to avoid duplicates.
     object_storage_provider : LibcloudObjectStorage
         Instance of the object storage provider for uploading logs.
 
     Methods
     -------
-    start():
-        Starts the pod watcher thread for job logs.
+    get_existing_log_filename(job_name):
+        Retrieves an existing temporary log file path for a given job name.
+
     get_last_n_lines(file_path, num_lines):
         Efficiently retrieves the last `num_lines` lines from a file.
+
     concatenate_and_delete_files(main_file_path, temp_file_path, block_size=6144):
         Concatenates a temporary file to the main log file and deletes the temporary file.
+
     make_log_filename_for_job(job_name):
-        Generates a unique temporary file path for storing logs of a job.
+        Ensures a log file exists for a given job and returns its path.
+
     stream_logs(job_name):
-        Streams logs from a Kubernetes pod and writes them to a file.
-    watch_pods():
-        Watches Kubernetes pods and handles events such as starting log streaming or uploading logs.
+        Streams logs from a Kubernetes pod corresponding to the given job name and writes them to a file.
+
+    handle_events(event):
+        Processes Kubernetes pod events to start log streaming or upload logs when pods complete.
     """
     # The value was chosen to provide a balance between memory usage and the number of I/O operations
     DEFAULT_BLOCK_SIZE = 6144
@@ -60,11 +67,32 @@ def __init__(self, config):
         """
         self.config = config
         self.watcher_threads = {}
-        self.pod_tmp_mapping = {}
         self.namespace = config.namespace()
         self.num_lines_to_check = int(config.joblogs().get('num_lines_to_check', 0))
+        self.logs_dir = self.config.joblogs().get('logs_dir', '').strip()
+        if not self.logs_dir:
+            raise ValueError("Configuration error: 'logs_dir' is missing in joblogs configuration section.")
         self.object_storage_provider = LibcloudObjectStorage(self.config)
 
+    def get_existing_log_filename(self, job_id):
+        """
+        Retrieves the existing temporary log file path for a job without creating a new one.
+
+        Parameters
+        ----------
+        job_id : str
+            ID of the Kubernetes job or pod, which is also the name of the log file.
+
+        Returns
+        -------
+        str or None
+            Path to the existing temporary log file for the given job, or None if no such file exists.
+        """
+        log_file_path = os.path.join(self.logs_dir, f"{job_id}.txt")
+        if os.path.isfile(log_file_path):
+            return log_file_path
+        return None
+
     def get_last_n_lines(self, file_path, num_lines):
         """
         Efficiently retrieves the last `num_lines` lines from a file.
@@ -141,38 +169,46 @@ def concatenate_and_delete_files(self, main_file_path, temp_file_path, block_siz
         except (IOError, OSError) as e:
             logger.error(f"Failed to concatenate and delete files for job: {e}")
 
-    def make_log_filename_for_job(self, job_name):
+    def make_log_filename_for_job(self, job_id):
         """
-        Generates a unique temporary file path for storing logs of a job.
+        Creates a log file path for a job, using the job ID as the file name, or returns the path to an existing file.
 
-        Parameters
-        ----------
-        job_name : str
-            Name of the Kubernetes job or pod.
+        Parameters
+        ----------
+        job_id : str
+            ID of the Kubernetes job.
 
-        Returns
-        -------
-        str
-            Path to the temporary log file for the given job.
+        Returns
+        -------
+        str
+            Path to the temporary log file for the given job.
         """
-        if self.pod_tmp_mapping.get(job_name) is not None:
-            return self.pod_tmp_mapping[job_name]
-        temp_dir = tempfile.gettempdir()
-        app_temp_dir = os.path.join(temp_dir, 'job_logs')
-        os.makedirs(app_temp_dir, exist_ok=True)
-        fd, path = tempfile.mkstemp(prefix=f"{job_name}_logs_", suffix=".txt", dir=app_temp_dir)
-        os.close(fd)
-        self.pod_tmp_mapping[job_name] = path
-        return path
-
-    def stream_logs(self, job_name):
+
+        if not os.path.isdir(self.logs_dir):
+            os.makedirs(self.logs_dir)
+
+        log_file_path = os.path.join(self.logs_dir, f"{job_id}.txt")
+        if os.path.exists(log_file_path):
+            return log_file_path
+
+        with open(log_file_path, 'w') as file:
+            pass
+
+        return log_file_path
+
+
+
+    def stream_logs(self, job_id, pod_name):
         """
         Streams logs from a Kubernetes pod and writes them to a file.
 
         Parameters
         ----------
-        job_name : str
-            Name of the Kubernetes pod to stream logs from.
+        job_id : str
+            ID of the Kubernetes job, used as the log file name.
+
+        pod_name : str
+            Name of the Kubernetes pod to read logs from.
 
         Returns
         -------
@@ -181,20 +217,20 @@ def stream_logs(self, job_name):
         log_lines_counter = 0
         v1 = client.CoreV1Api()
         w = watch.Watch()
-        log_file_path = self.make_log_filename_for_job(job_name)
+        log_file_path = self.make_log_filename_for_job(job_id)
         last_n_lines = self.get_last_n_lines(log_file_path, self.num_lines_to_check)
         if len(last_n_lines) == 0:
-            logger.info(f"Log file '{log_file_path}' is empty or not found. Starting fresh logs for job '{job_name}'.")
+            logger.info(f"Log file '{log_file_path}' is empty or not found. Starting fresh logs for job '{job_id}'.")
 
         try:
             with open(log_file_path, 'a') as log_file:
                 temp_dir = os.path.dirname(log_file_path)
                 with tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=temp_dir,
-                                                 prefix=f"{job_name}_logs_tmp_", suffix=".txt") as temp_logs:
+                                                 prefix=f"{job_id}_logs_tmp_", suffix=".txt") as temp_logs:
                     temp_file_path = temp_logs.name
                     for line in w.stream(
                         v1.read_namespaced_pod_log,
-                        name=job_name,
+                        name=pod_name,
                         namespace=self.namespace,
                         follow=True,
                         _preload_content=False
@@ -214,9 +250,9 @@ def stream_logs(self, job_name):
                     self.concatenate_and_delete_files(log_file_path, temp_file_path)
                 else:
                     os.remove(temp_file_path)
-                    logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_name}'.")
+                    logger.info(f"Removed temporary file '{temp_file_path}' after streaming logs for job '{job_id}'.")
         except Exception as e:
-            logger.exception(f"Error streaming logs for job '{job_name}': {e}")
+            logger.exception(f"Error streaming logs for job '{job_id}': {e}")
 
     def handle_events(self, event):
         """
@@ -241,11 +277,11 @@ def handle_events(self, event):
                 else:
                     self.watcher_threads[thread_name] = threading.Thread(
                         target=self.stream_logs,
-                        args=(pod_name,)
+                        args=(job_id, pod_name,)
                     )
                     self.watcher_threads[thread_name].start()
             elif pod.status.phase in ['Succeeded', 'Failed']:
-                log_filename = self.pod_tmp_mapping.get(pod_name)
+                log_filename = self.get_existing_log_filename(job_id)
                 if log_filename is not None and os.path.isfile(log_filename) and os.path.getsize(log_filename) > 0:
                     if self.object_storage_provider.object_exists(job_id):
                         logger.info(f"Log file for job '{job_id}' already exists in storage.")
