187 lines
4.9 KiB
Go
187 lines
4.9 KiB
Go
|
|
// Package task provides retry queue management for failed asynchronous tasks
|
||
|
|
package task
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"modelRT/database"
|
||
|
|
"modelRT/logger"
|
||
|
|
|
||
|
|
"github.com/gofrs/uuid"
|
||
|
|
"gorm.io/gorm"
|
||
|
|
)
|
||
|
|
|
||
|
|
// RetryQueue manages scheduling and execution of task retries
type RetryQueue struct {
	db       *gorm.DB       // persistence store for async task rows
	producer *QueueProducer // publishes due tasks back onto the work queue
	strategy RetryStrategy  // decides whether/when a failed task is retried
}
|
||
|
|
|
||
|
|
// NewRetryQueue creates a new RetryQueue instance
|
||
|
|
func NewRetryQueue(db *gorm.DB, producer *QueueProducer, strategy RetryStrategy) *RetryQueue {
|
||
|
|
if strategy == nil {
|
||
|
|
strategy = DefaultRetryStrategy()
|
||
|
|
}
|
||
|
|
|
||
|
|
return &RetryQueue{
|
||
|
|
db: db,
|
||
|
|
producer: producer,
|
||
|
|
strategy: strategy,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// ScheduleRetry schedules a failed task for retry based on retry strategy
|
||
|
|
func (q *RetryQueue) ScheduleRetry(ctx context.Context, taskID uuid.UUID, taskType TaskType, retryCount int, lastError error) error {
|
||
|
|
// Check if task should be retried
|
||
|
|
shouldRetry, delay := q.strategy.ShouldRetry(ctx, taskID.String(), retryCount, lastError)
|
||
|
|
if !shouldRetry {
|
||
|
|
// Mark task as permanently failed
|
||
|
|
logger.Info(ctx, "Task will not be retried, marking as failed",
|
||
|
|
"task_id", taskID,
|
||
|
|
"retry_count", retryCount,
|
||
|
|
"max_retries", q.strategy.GetMaxRetries(),
|
||
|
|
"last_error", lastError,
|
||
|
|
)
|
||
|
|
return database.FailAsyncTask(ctx, q.db, taskID, time.Now().Unix())
|
||
|
|
}
|
||
|
|
|
||
|
|
// Calculate next retry time
|
||
|
|
nextRetryTime := time.Now().Add(delay).Unix()
|
||
|
|
|
||
|
|
// Update task retry information in database
|
||
|
|
err := q.db.Transaction(func(tx *gorm.DB) error {
|
||
|
|
if err := database.UpdateTaskRetryInfo(ctx, tx, taskID, retryCount+1, nextRetryTime); err != nil {
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
// Update error information
|
||
|
|
errorMsg := ""
|
||
|
|
if lastError != nil {
|
||
|
|
errorMsg = lastError.Error()
|
||
|
|
}
|
||
|
|
if err := database.UpdateTaskErrorInfo(ctx, tx, taskID, errorMsg, ""); err != nil {
|
||
|
|
// Log but don't fail the whole retry scheduling
|
||
|
|
logger.Warn(ctx, "Failed to update task error info",
|
||
|
|
"task_id", taskID,
|
||
|
|
"error", err,
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Task will be picked up by ProcessRetryQueue when next_retry_time is reached
|
||
|
|
return nil
|
||
|
|
})
|
||
|
|
|
||
|
|
if err != nil {
|
||
|
|
logger.Error(ctx, "Failed to schedule task retry",
|
||
|
|
"task_id", taskID,
|
||
|
|
"task_type", taskType,
|
||
|
|
"retry_count", retryCount,
|
||
|
|
"delay", delay,
|
||
|
|
"error", err,
|
||
|
|
)
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
logger.Info(ctx, "Task scheduled for retry",
|
||
|
|
"task_id", taskID,
|
||
|
|
"task_type", taskType,
|
||
|
|
"retry_count", retryCount+1,
|
||
|
|
"next_retry_in", delay,
|
||
|
|
"next_retry_time", time.Unix(nextRetryTime, 0).Format(time.RFC3339),
|
||
|
|
)
|
||
|
|
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// ProcessRetryQueue processes tasks that are due for retry
|
||
|
|
func (q *RetryQueue) ProcessRetryQueue(ctx context.Context, batchSize int) error {
|
||
|
|
// Get tasks due for retry
|
||
|
|
tasks, err := database.GetTasksForRetry(ctx, q.db, batchSize)
|
||
|
|
if err != nil {
|
||
|
|
logger.Error(ctx, "Failed to get tasks for retry", "error", err)
|
||
|
|
return err
|
||
|
|
}
|
||
|
|
|
||
|
|
if len(tasks) == 0 {
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
logger.Info(ctx, "Processing retry queue",
|
||
|
|
"task_count", len(tasks),
|
||
|
|
"batch_size", batchSize,
|
||
|
|
)
|
||
|
|
|
||
|
|
for _, task := range tasks {
|
||
|
|
select {
|
||
|
|
case <-ctx.Done():
|
||
|
|
return ctx.Err()
|
||
|
|
default:
|
||
|
|
// Publish task to queue for immediate processing
|
||
|
|
taskType := TaskType(task.TaskType)
|
||
|
|
if err := q.producer.PublishTask(ctx, task.TaskID, taskType, task.Priority); err != nil {
|
||
|
|
logger.Error(ctx, "Failed to publish retry task to queue",
|
||
|
|
"task_id", task.TaskID,
|
||
|
|
"task_type", taskType,
|
||
|
|
"error", err,
|
||
|
|
)
|
||
|
|
// Continue with other tasks
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// Update task status back to submitted
|
||
|
|
if err := database.UpdateAsyncTaskStatus(ctx, q.db, task.TaskID, "SUBMITTED"); err != nil {
|
||
|
|
logger.Warn(ctx, "Failed to update retry task status",
|
||
|
|
"task_id", task.TaskID,
|
||
|
|
"error", err,
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Clear next retry time since task is being retried now
|
||
|
|
if err := database.UpdateTaskRetryInfo(ctx, q.db, task.TaskID, task.RetryCount, 0); err != nil {
|
||
|
|
logger.Warn(ctx, "Failed to clear next retry time",
|
||
|
|
"task_id", task.TaskID,
|
||
|
|
"error", err,
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
logger.Info(ctx, "Retry task resubmitted",
|
||
|
|
"task_id", task.TaskID,
|
||
|
|
"task_type", taskType,
|
||
|
|
"retry_count", task.RetryCount,
|
||
|
|
)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// StartRetryScheduler starts a background goroutine to periodically process retry queue
|
||
|
|
func (q *RetryQueue) StartRetryScheduler(ctx context.Context, interval time.Duration, batchSize int) {
|
||
|
|
go func() {
|
||
|
|
ticker := time.NewTicker(interval)
|
||
|
|
defer ticker.Stop()
|
||
|
|
|
||
|
|
for {
|
||
|
|
select {
|
||
|
|
case <-ctx.Done():
|
||
|
|
logger.Info(ctx, "Retry scheduler stopping")
|
||
|
|
return
|
||
|
|
case <-ticker.C:
|
||
|
|
if err := q.ProcessRetryQueue(ctx, batchSize); err != nil {
|
||
|
|
logger.Error(ctx, "Error processing retry queue", "error", err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}()
|
||
|
|
}
|
||
|
|
|
||
|
|
// GetRetryStats returns statistics about retry queue
|
||
|
|
func (q *RetryQueue) GetRetryStats(ctx context.Context) (int, error) {
|
||
|
|
tasks, err := database.GetTasksForRetry(ctx, q.db, 1000) // Large limit to count
|
||
|
|
if err != nil {
|
||
|
|
return 0, err
|
||
|
|
}
|
||
|
|
return len(tasks), nil
|
||
|
|
}
|