modelRT/task/retry_queue.go

// Package task provides retry queue management for failed asynchronous tasks
package task

import (
	"context"
	"time"

	"github.com/gofrs/uuid"
	"gorm.io/gorm"

	"modelRT/database"
	"modelRT/logger"
)

// RetryQueue manages scheduling and execution of task retries.
type RetryQueue struct {
	db       *gorm.DB
	producer *QueueProducer
	strategy RetryStrategy
}

// NewRetryQueue creates a new RetryQueue instance. A nil strategy falls back
// to the package's DefaultRetryStrategy.
func NewRetryQueue(db *gorm.DB, producer *QueueProducer, strategy RetryStrategy) *RetryQueue {
	if strategy == nil {
		strategy = DefaultRetryStrategy()
	}
	return &RetryQueue{
		db:       db,
		producer: producer,
		strategy: strategy,
	}
}
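
// A hypothetical wiring sketch (db and producer are assumed to come from the
// caller's setup code):
//
//	queue := NewRetryQueue(db, producer, nil) // nil selects DefaultRetryStrategy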

// ScheduleRetry schedules a failed task for retry based on the retry strategy.
func (q *RetryQueue) ScheduleRetry(ctx context.Context, taskID uuid.UUID, taskType TaskType, retryCount int, lastError error) error {
	// Ask the strategy whether the task should be retried and after what delay
	shouldRetry, delay := q.strategy.ShouldRetry(ctx, taskID.String(), retryCount, lastError)
	if !shouldRetry {
		// Mark the task as permanently failed
		logger.Info(ctx, "Task will not be retried, marking as failed",
			"task_id", taskID,
			"retry_count", retryCount,
			"max_retries", q.strategy.GetMaxRetries(),
			"last_error", lastError,
		)
		return database.FailAsyncTask(ctx, q.db, taskID, time.Now().Unix())
	}

	// Calculate the next retry time
	nextRetryTime := time.Now().Add(delay).Unix()

	// Update task retry information in the database
	err := q.db.Transaction(func(tx *gorm.DB) error {
		if err := database.UpdateTaskRetryInfo(ctx, tx, taskID, retryCount+1, nextRetryTime); err != nil {
			return err
		}
		// Update error information
		errorMsg := ""
		if lastError != nil {
			errorMsg = lastError.Error()
		}
		if err := database.UpdateTaskErrorInfo(ctx, tx, taskID, errorMsg, ""); err != nil {
			// Log but don't fail the whole retry scheduling
			logger.Warn(ctx, "Failed to update task error info",
				"task_id", taskID,
				"error", err,
			)
		}
		// The task will be picked up by ProcessRetryQueue once next_retry_time is reached
		return nil
	})
	if err != nil {
		logger.Error(ctx, "Failed to schedule task retry",
			"task_id", taskID,
			"task_type", taskType,
			"retry_count", retryCount,
			"delay", delay,
			"error", err,
		)
		return err
	}

	logger.Info(ctx, "Task scheduled for retry",
		"task_id", taskID,
		"task_type", taskType,
		"retry_count", retryCount+1,
		"next_retry_in", delay,
		"next_retry_time", time.Unix(nextRetryTime, 0).Format(time.RFC3339),
	)
	return nil
}
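
// exponentialBackoffStrategy is a minimal illustrative sketch of a custom
// RetryStrategy. Its method set is inferred from the calls in ScheduleRetry
// above (ShouldRetry and GetMaxRetries); it is an assumption for illustration,
// not the package's actual DefaultRetryStrategy.
type exponentialBackoffStrategy struct {
	maxRetries int
	baseDelay  time.Duration
}

// GetMaxRetries reports the configured retry ceiling.
func (s *exponentialBackoffStrategy) GetMaxRetries() int { return s.maxRetries }

// ShouldRetry approves a retry until maxRetries is exhausted, doubling the
// delay on each attempt: baseDelay, 2*baseDelay, 4*baseDelay, and so on.
func (s *exponentialBackoffStrategy) ShouldRetry(ctx context.Context, taskID string, retryCount int, lastError error) (bool, time.Duration) {
	if retryCount >= s.maxRetries {
		return false, 0
	}
	return true, s.baseDelay * time.Duration(1<<retryCount)
}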

// ProcessRetryQueue processes tasks that are due for retry.
func (q *RetryQueue) ProcessRetryQueue(ctx context.Context, batchSize int) error {
	// Fetch tasks whose next retry time has been reached
	tasks, err := database.GetTasksForRetry(ctx, q.db, batchSize)
	if err != nil {
		logger.Error(ctx, "Failed to get tasks for retry", "error", err)
		return err
	}
	if len(tasks) == 0 {
		return nil
	}

	logger.Info(ctx, "Processing retry queue",
		"task_count", len(tasks),
		"batch_size", batchSize,
	)

	for _, task := range tasks {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			// Publish the task to the queue for immediate processing
			taskType := TaskType(task.TaskType)
			if err := q.producer.PublishTask(ctx, task.TaskID, taskType, task.Priority); err != nil {
				logger.Error(ctx, "Failed to publish retry task to queue",
					"task_id", task.TaskID,
					"task_type", taskType,
					"error", err,
				)
				// Continue with the remaining tasks
				continue
			}
			// Move the task status back to submitted
			if err := database.UpdateAsyncTaskStatus(ctx, q.db, task.TaskID, "SUBMITTED"); err != nil {
				logger.Warn(ctx, "Failed to update retry task status",
					"task_id", task.TaskID,
					"error", err,
				)
			}
			// Clear the next retry time since the task is being retried now
			if err := database.UpdateTaskRetryInfo(ctx, q.db, task.TaskID, task.RetryCount, 0); err != nil {
				logger.Warn(ctx, "Failed to clear next retry time",
					"task_id", task.TaskID,
					"error", err,
				)
			}
			logger.Info(ctx, "Retry task resubmitted",
				"task_id", task.TaskID,
				"task_type", taskType,
				"retry_count", task.RetryCount,
			)
		}
	}
	return nil
}
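
// ProcessRetryQueue can also be driven on demand, outside the scheduler, for
// example from an admin hook (a hypothetical sketch; the batch size is
// illustrative):
//
//	if err := queue.ProcessRetryQueue(ctx, 50); err != nil {
//		logger.Error(ctx, "manual retry sweep failed", "error", err)
//	}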

// StartRetryScheduler starts a background goroutine that periodically
// processes the retry queue until the context is cancelled.
func (q *RetryQueue) StartRetryScheduler(ctx context.Context, interval time.Duration, batchSize int) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				logger.Info(ctx, "Retry scheduler stopping")
				return
			case <-ticker.C:
				if err := q.ProcessRetryQueue(ctx, batchSize); err != nil {
					logger.Error(ctx, "Error processing retry queue", "error", err)
				}
			}
		}
	}()
}
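
// Cancelling the supplied context is how callers stop the scheduler
// (a hypothetical shutdown sketch; the interval and batch size are
// illustrative):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	queue.StartRetryScheduler(ctx, 30*time.Second, 100)
//	cancel() // the goroutine logs "Retry scheduler stopping" and exits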

// GetRetryStats returns the number of tasks currently due for retry. The
// count saturates at the 1000-task fetch limit, so it is a lower bound
// rather than an exact total.
func (q *RetryQueue) GetRetryStats(ctx context.Context) (int, error) {
	// Reuses the retry fetch with a large limit; a dedicated COUNT query
	// would avoid loading the rows just to count them.
	tasks, err := database.GetTasksForRetry(ctx, q.db, 1000)
	if err != nil {
		return 0, err
	}
	return len(tasks), nil
}
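
// A hypothetical exact count using GORM directly, assuming an async_tasks
// table with a next_retry_time column (both schema names are assumptions):
//
//	var n int64
//	err := q.db.WithContext(ctx).Table("async_tasks").
//		Where("next_retry_time > 0 AND next_retry_time <= ?", time.Now().Unix()).
//		Count(&n).Error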