Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions jobs/director/spec
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,12 @@ properties:
description: Client certificate for mutual TLS connections to an external metrics server
director.metrics_server.tls.private_key:
description: Client private key for mutual TLS connections to an external metrics server
director.metrics_server.file_retention_days:
description: 'Number of days to retain metric binary files in /var/vcap/store/director/metrics. Older files are automatically cleaned up by a scheduled job. Set to 0 to disable cleanup.'
default: 7
director.metrics_server.cleanup_schedule:
description: 'RufusScheduler cron formatted schedule for cleanup of stale metrics files'
default: '0 0 0 * * * UTC' # once every day at midnight UTC

# NATs
nats.address:
Expand Down
10 changes: 10 additions & 0 deletions jobs/director/templates/director.yml.erb
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,16 @@ params['scheduled_jobs'] << {
'schedule' => p('director.tasks_cleanup_schedule')
}

if p('director.metrics_server.enabled')
  # Register the periodic cleanup of stale metrics files; it is only scheduled
  # while the metrics server is enabled.
  metrics_cleanup_job = {
    'command' => 'ScheduledMetricsCleanup',
    'schedule' => p('director.metrics_server.cleanup_schedule'),
    'params' => [
      { 'retention_days' => p('director.metrics_server.file_retention_days') }
    ]
  }
  params['scheduled_jobs'] << metrics_cleanup_job
end

params['record_events'] = p('director.events.record_events')
if params['record_events']
params['scheduled_jobs'] << {
Expand Down
2 changes: 1 addition & 1 deletion src/bosh-director/bin/bosh-director
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ route_configuration = Bosh::Director::Api::RouteConfiguration.new(config)
rack_app = Puma::Rack::Builder.app do
use Rack::CommonLogger
if config.metrics_server_enabled
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: '/var/vcap/store/director/metrics')
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: config.metrics_dir)
use Bosh::Director::StripDeploymentsMiddlewareCollector
use Prometheus::Middleware::Exporter
end
Expand Down
1 change: 1 addition & 0 deletions src/bosh-director/lib/bosh/director.rb
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ module Director
require 'bosh/director/jobs/scheduled_dns_blobs_cleanup'
require 'bosh/director/jobs/scheduled_dns_tombstone_cleanup'
require 'bosh/director/jobs/scheduled_tasks_cleanup'
require 'bosh/director/jobs/scheduled_metrics_cleanup'
require 'bosh/director/jobs/create_snapshot'
require 'bosh/director/jobs/snapshot_deployment'
require 'bosh/director/jobs/snapshot_deployments'
Expand Down
4 changes: 4 additions & 0 deletions src/bosh-director/lib/bosh/director/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def log_dir
File.dirname(@log_file_path) if @log_file_path
end

# Path of the `metrics` subdirectory located under @base_dir.
def metrics_dir
  metrics_subdir = 'metrics'
  File.join(@base_dir, metrics_subdir)
end

# Reports whether @local_dns_enabled is truthy, always as a strict
# true/false (never nil or the raw stored value).
def local_dns_enabled?
  @local_dns_enabled ? true : false
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
module Bosh::Director
  module Jobs
    # Scheduled job that deletes stale metric files (`metric_*.bin`) from
    # Config.metrics_dir.
    #
    # A file is stale when its mtime lies more than `retention_days` days in
    # the past. A missing, zero or negative `retention_days` disables cleanup.
    class ScheduledMetricsCleanup < BaseJob
      @queue = :normal

      # Glob, relative to the metrics directory, of the only files this job
      # will ever consider for deletion.
      METRICS_FILE_GLOB = 'metric_*.bin'.freeze
      SECONDS_PER_DAY = 24 * 60 * 60

      def self.job_type
        :scheduled_metrics_cleanup
      end

      # Reports whether running the job right now would delete anything.
      #
      # @param params [Array<Hash>, nil] job params; the first element carries
      #   the 'retention_days' key
      # @return [Boolean] true when cleanup is enabled, the metrics directory
      #   exists and it contains at least one stale metric file
      def self.has_work(params)
        retention_days = retention_days_from(params&.first)
        return false if retention_days <= 0

        metrics_dir = Config.metrics_dir
        return false unless File.directory?(metrics_dir)

        cutoff_time = time_days_ago(retention_days)

        # any? stops at the first stale file, so not every file is stat'ed
        Dir.glob(File.join(metrics_dir, METRICS_FILE_GLOB)).any? do |file|
          stale_file?(file, cutoff_time)
        end
      end

      # @param days [Numeric] number of days to go back
      # @return [Time] the current time minus `days` days
      def self.time_days_ago(days)
        Time.now - (days * SECONDS_PER_DAY)
      end

      def self.schedule_message
        'clean up stale metrics files'
      end

      # Extracts the retention period from a job params hash.
      #
      # @param job_params [Hash, nil] hash that may contain 'retention_days'
      # @return [Object] the configured value, or 0 (cleanup disabled) when the
      #   hash or the key is missing, so callers can safely compare it with 0
      def self.retention_days_from(job_params)
        (job_params || {})['retention_days'] || 0
      end

      # @param file [String] path of a metric file
      # @param cutoff_time [Time] files modified before this time are stale
      # @return [Boolean] true when the file's mtime is older than cutoff_time
      def self.stale_file?(file, cutoff_time)
        File.mtime(file) < cutoff_time
      rescue Errno::ENOENT
        # The file vanished between the directory listing and the stat;
        # there is nothing left to clean up for it.
        false
      end

      # @param params [Hash] job params; 'retention_days' is the number of days
      #   metric files are kept (missing or <= 0 disables the cleanup)
      def initialize(params = {}) # rubocop:disable Lint/MissingSuper
        @retention_days = self.class.retention_days_from(params)
        @metrics_dir = Config.metrics_dir
      end

      # Deletes every stale metric file and logs the outcome.
      #
      # @return [String] human-readable summary: a "disabled" or "directory
      #   does not exist" notice, or the deleted (and failed) file counts
      def perform
        return 'Metrics cleanup disabled (retention_days is 0)' if @retention_days <= 0
        return "Metrics directory does not exist: #{@metrics_dir}" unless File.directory?(@metrics_dir)

        cutoff_time = self.class.time_days_ago(@retention_days)
        logger.info("Started cleanup of metrics files older than #{cutoff_time} from #{@metrics_dir}")

        files_to_delete = stale_files(cutoff_time)
        deleted_count, failed_count = delete_files(files_to_delete)

        output = "Deleted #{deleted_count} metrics file(s) older than #{cutoff_time}."
        output << " Failed to delete #{failed_count} file(s)." if failed_count.positive?
        logger.info(output)
        output
      end

      private

      # @param cutoff_time [Time]
      # @return [Array<String>] paths of metric files last modified before cutoff_time
      def stale_files(cutoff_time)
        Dir.glob(File.join(@metrics_dir, METRICS_FILE_GLOB)).select do |file|
          self.class.stale_file?(file, cutoff_time)
        end
      end

      # Deletes the given files one by one. A failure on one file is logged and
      # counted but never aborts the remaining deletions.
      #
      # @param files [Array<String>] paths to delete
      # @return [Array(Integer, Integer)] [deleted_count, failed_count]
      def delete_files(files)
        deleted_count = 0
        failed_count = 0

        files.each do |file|
          File.delete(file)
          deleted_count += 1
          logger.debug("Deleted metrics file: #{file}")
        rescue StandardError => e
          failed_count += 1
          logger.warn("Failed to delete metrics file #{file}: #{e.message}")
        end

        [deleted_count, failed_count]
      end
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
require 'spec_helper'

module Bosh::Director
  # Specs for the scheduled job that removes stale `metric_*.bin` files from
  # the metrics directory. Config.metrics_dir is redirected to a temp directory
  # and Time.now is frozen so that file ages are deterministic.
  describe Jobs::ScheduledMetricsCleanup do
    subject { described_class.new(*params) }
    let(:params) do
      [{
        'retention_days' => retention_days,
      }]
    end
    let(:retention_days) { 7 }
    let(:metrics_dir) { Dir.mktmpdir }
    let(:time) { Time.now }
    let(:seven_days_seconds) { 7 * 24 * 60 * 60 }
    let(:eight_days_ago) { time - seven_days_seconds - 86400 }
    let(:six_days_ago) { time - seven_days_seconds + 86400 }

    before do
      allow(Config).to receive(:metrics_dir).and_return(metrics_dir)
      allow(Time).to receive(:now).and_return(time)
    end

    after do
      FileUtils.rm_rf(metrics_dir) if File.directory?(metrics_dir)
    end

    describe '.job_type' do
      it 'returns the job type' do
        expect(described_class.job_type).to eq(:scheduled_metrics_cleanup)
      end
    end

    describe '.schedule_message' do
      it 'outputs a message' do
        expect(described_class.schedule_message).to eq('clean up stale metrics files')
      end
    end

    describe '.time_days_ago' do
      it 'calculates time correctly' do
        expect(described_class.time_days_ago(7)).to eq(time - seven_days_seconds)
      end
    end

    describe '.has_work' do
      context 'when retention_days is 0' do
        let(:retention_days) { 0 }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when retention_days is negative' do
        let(:retention_days) { -1 }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when metrics directory does not exist' do
        before do
          FileUtils.rm_rf(metrics_dir)
        end

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when there are stale files' do
        before do
          old_file = File.join(metrics_dir, 'metric_old.bin')
          File.write(old_file, 'data')
          File.utime(eight_days_ago, eight_days_ago, old_file)
        end

        it 'returns true' do
          expect(described_class.has_work(params)).to eq(true)
        end
      end

      context 'when there are no stale files' do
        before do
          recent_file = File.join(metrics_dir, 'metric_recent.bin')
          File.write(recent_file, 'data')
          File.utime(six_days_ago, six_days_ago, recent_file)
        end

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end
    end

    describe '#perform' do
      context 'when retention_days is 0' do
        let(:retention_days) { 0 }
        let(:old_file) { File.join(metrics_dir, 'metric_old.bin') }

        before do
          # An old file must survive when cleanup is disabled
          File.write(old_file, 'data')
          File.utime(eight_days_ago, eight_days_ago, old_file)
        end

        it 'returns disabled message' do
          expect(subject.perform).to eq('Metrics cleanup disabled (retention_days is 0)')
        end

        it 'does not delete any files' do
          subject.perform
          expect(File.exist?(old_file)).to eq(true)
        end
      end

      context 'when retention_days is negative' do
        let(:retention_days) { -1 }

        it 'returns disabled message' do
          expect(subject.perform).to eq('Metrics cleanup disabled (retention_days is 0)')
        end
      end

      context 'when metrics directory does not exist' do
        before do
          FileUtils.rm_rf(metrics_dir)
        end

        it 'returns directory not exist message' do
          expect(subject.perform).to eq("Metrics directory does not exist: #{metrics_dir}")
        end
      end

      context 'when there are files to clean up' do
        let!(:old_file_1) { File.join(metrics_dir, 'metric_old_1.bin') }
        let!(:old_file_2) { File.join(metrics_dir, 'metric_old_2.bin') }
        let!(:recent_file) { File.join(metrics_dir, 'metric_recent.bin') }
        let!(:other_file) { File.join(metrics_dir, 'other_file.txt') }

        before do
          # Create old files (older than retention period)
          File.write(old_file_1, 'data1')
          File.utime(eight_days_ago, eight_days_ago, old_file_1)

          File.write(old_file_2, 'data2')
          File.utime(eight_days_ago, eight_days_ago, old_file_2)

          # Create recent file (within retention period)
          File.write(recent_file, 'data3')
          File.utime(six_days_ago, six_days_ago, recent_file)

          # Create non-metric file (should not be deleted)
          File.write(other_file, 'other')
          File.utime(eight_days_ago, eight_days_ago, other_file)
        end

        it 'deletes only old metric files' do
          subject.perform

          expect(File.exist?(old_file_1)).to eq(false)
          expect(File.exist?(old_file_2)).to eq(false)
          expect(File.exist?(recent_file)).to eq(true)
          expect(File.exist?(other_file)).to eq(true)
        end

        it 'returns success message with count' do
          cutoff_time = time - seven_days_seconds
          expect(subject.perform).to eq("Deleted 2 metrics file(s) older than #{cutoff_time}.")
        end

        it 'logs the cleanup operation' do
          logger = double('logger', info: nil, debug: nil, warn: nil)
          allow(subject).to receive(:logger).and_return(logger)

          subject.perform

          expect(logger).to have_received(:info).at_least(:once)
        end
      end

      context 'when file deletion fails' do
        let!(:protected_file) { File.join(metrics_dir, 'metric_protected.bin') }

        before do
          File.write(protected_file, 'data')
          File.utime(eight_days_ago, eight_days_ago, protected_file)
          allow(File).to receive(:delete).with(protected_file).and_raise(Errno::EACCES, 'Permission denied')
        end

        it 'logs warning and continues' do
          logger = double('logger', info: nil, debug: nil, warn: nil)
          allow(subject).to receive(:logger).and_return(logger)

          result = subject.perform

          expect(logger).to have_received(:warn).with(/Failed to delete metrics file/)
          expect(result).to match(/Failed to delete 1 file\(s\)/)
        end

        it 'includes failure count in result message' do
          cutoff_time = time - seven_days_seconds
          result = subject.perform
          expect(result).to eq("Deleted 0 metrics file(s) older than #{cutoff_time}. Failed to delete 1 file(s).")
        end
      end

      context 'when there are no files to clean up' do
        it 'returns message with zero count' do
          cutoff_time = time - seven_days_seconds
          expect(subject.perform).to eq("Deleted 0 metrics file(s) older than #{cutoff_time}.")
        end
      end

      context 'with different retention periods' do
        let(:retention_days) { 30 }
        let(:thirty_one_days_ago) { time - (31 * 24 * 60 * 60) }
        let!(:very_old_file) { File.join(metrics_dir, 'metric_very_old.bin') }
        let!(:within_retention_file) { File.join(metrics_dir, 'metric_within_retention.bin') }

        before do
          File.write(very_old_file, 'data')
          File.utime(thirty_one_days_ago, thirty_one_days_ago, very_old_file)

          # Would be stale with a 7-day retention but is inside the 30-day
          # window, so it only survives if the configured period is honoured.
          File.write(within_retention_file, 'data')
          File.utime(eight_days_ago, eight_days_ago, within_retention_file)
        end

        it 'respects the configured retention period' do
          subject.perform
          expect(File.exist?(very_old_file)).to eq(false)
        end

        it 'keeps files newer than the configured retention period' do
          subject.perform
          expect(File.exist?(within_retention_file)).to eq(true)
        end
      end
    end
  end
end
Loading