# Patch summary: scheduled cleanup of stale director metrics files.
#
# What the hunks below do:
#   * jobs/director/spec: adds `director.metrics_server.file_retention_days` (default 7) and
#     `director.metrics_server.cleanup_schedule` (default '0 0 0 * * * UTC').
#   * jobs/director/templates/director.yml.erb: when `director.metrics_server.enabled` is true,
#     appends a `ScheduledMetricsCleanup` entry to `scheduled_jobs`, passing `retention_days`.
#   * src/bosh-director/bin/bosh-director: the DirectFileStore directory now comes from
#     `config.metrics_dir` instead of the literal '/var/vcap/store/director/metrics'.
#   * src/bosh-director/lib/bosh/director/config.rb: adds `metrics_dir`, which is
#     `File.join(@base_dir, 'metrics')`.
#   * src/bosh-director/lib/bosh/director/jobs/scheduled_metrics_cleanup.rb (new): a BaseJob subclass
#     that deletes `metric_*.bin` files in `Config.metrics_dir` whose mtime is older than
#     `retention_days`, logs each failure as a warning, and returns a summary string.
#     `has_work` and `perform` both short-circuit when `retention_days <= 0` or the directory is missing.
#   * spec/unit/.../scheduled_metrics_cleanup_spec.rb (new): unit coverage for the job.
#
# NOTE(review): the line breaks inside the hunks appear to have been collapsed in this copy;
#   re-export the patch before trying to apply it.
# NOTE(review): "Set to 0 to disable cleanup" - the template still registers the job whenever the
#   metrics server is enabled; only `has_work`/`perform` skip the work. Negative values are treated
#   the same way although the returned message says "retention_days is 0". Confirm this is intended.
# NOTE(review): `File.mtime` in `has_work` and `stale_files` is not inside the `rescue` used by
#   `delete_files`; presumably a file removed between `Dir.glob` and `File.mtime` would raise
#   Errno::ENOENT and fail the run - confirm nothing else deletes from this directory.
# NOTE(review): `retention_days` is compared with `<=` without a nil/type check; this relies on the
#   job spec default always being supplied - confirm.
# NOTE(review): `metrics_dir` depends on `@base_dir` being set and resolving to the previously
#   hard-coded /var/vcap/store/director - confirm against the deployed director config.
# NOTE(review): files are selected by mtime only; whether a live process can still hold an old
#   `metric_*.bin` open cannot be determined from this patch - confirm against the data store.
diff --git a/jobs/director/spec b/jobs/director/spec index a7c88883521..038cb9d1fcc 100644 --- a/jobs/director/spec +++ b/jobs/director/spec @@ -282,6 +282,12 @@ properties: description: Client certificate for mutual TLS connections to an external metrics server director.metrics_server.tls.private_key: description: Client private key for mutual TLS connections to an external metrics server + director.metrics_server.file_retention_days: + description: 'Number of days to retain metric binary files in /var/vcap/store/director/metrics. Older files are automatically cleaned up by a scheduled job. Set to 0 to disable cleanup.' + default: 7 + director.metrics_server.cleanup_schedule: + description: 'RufusScheduler cron formatted schedule for cleanup of stale metrics files' + default: '0 0 0 * * * UTC' # once every day at midnight UTC # NATs nats.address: diff --git a/jobs/director/templates/director.yml.erb b/jobs/director/templates/director.yml.erb index 42aa781702a..9a8aa51f74f 100644 --- a/jobs/director/templates/director.yml.erb +++ b/jobs/director/templates/director.yml.erb @@ -162,6 +162,16 @@ params['scheduled_jobs'] << { 'schedule' => p('director.tasks_cleanup_schedule') } +if p('director.metrics_server.enabled') + params['scheduled_jobs'] << { + 'command' => 'ScheduledMetricsCleanup', + 'schedule' => p('director.metrics_server.cleanup_schedule'), + 'params' => [{ + 'retention_days' => p('director.metrics_server.file_retention_days') + }] + } +end + params['record_events'] = p('director.events.record_events') if params['record_events'] params['scheduled_jobs'] << { diff --git a/src/bosh-director/bin/bosh-director b/src/bosh-director/bin/bosh-director index 536a1bc16e4..c5e06970894 100755 --- a/src/bosh-director/bin/bosh-director +++ b/src/bosh-director/bin/bosh-director @@ -29,7 +29,7 @@ route_configuration = Bosh::Director::Api::RouteConfiguration.new(config) rack_app = Puma::Rack::Builder.app do use Rack::CommonLogger if config.metrics_server_enabled - 
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: '/var/vcap/store/director/metrics') + Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: config.metrics_dir) use Bosh::Director::StripDeploymentsMiddlewareCollector use Prometheus::Middleware::Exporter end diff --git a/src/bosh-director/lib/bosh/director.rb b/src/bosh-director/lib/bosh/director.rb index 8696bb2d36a..7b6f150fefd 100644 --- a/src/bosh-director/lib/bosh/director.rb +++ b/src/bosh-director/lib/bosh/director.rb @@ -197,6 +197,7 @@ module Director require 'bosh/director/jobs/scheduled_dns_blobs_cleanup' require 'bosh/director/jobs/scheduled_dns_tombstone_cleanup' require 'bosh/director/jobs/scheduled_tasks_cleanup' +require 'bosh/director/jobs/scheduled_metrics_cleanup' require 'bosh/director/jobs/create_snapshot' require 'bosh/director/jobs/snapshot_deployment' require 'bosh/director/jobs/snapshot_deployments' diff --git a/src/bosh-director/lib/bosh/director/config.rb b/src/bosh-director/lib/bosh/director/config.rb index 25a99f281b3..6fd9f9e0b0e 100644 --- a/src/bosh-director/lib/bosh/director/config.rb +++ b/src/bosh-director/lib/bosh/director/config.rb @@ -265,6 +265,10 @@ def log_dir File.dirname(@log_file_path) if @log_file_path end + def metrics_dir + File.join(@base_dir, 'metrics') + end + def local_dns_enabled? 
!!@local_dns_enabled end diff --git a/src/bosh-director/lib/bosh/director/jobs/scheduled_metrics_cleanup.rb b/src/bosh-director/lib/bosh/director/jobs/scheduled_metrics_cleanup.rb new file mode 100644 index 00000000000..447bfdc4859 --- /dev/null +++ b/src/bosh-director/lib/bosh/director/jobs/scheduled_metrics_cleanup.rb @@ -0,0 +1,78 @@ +module Bosh::Director + module Jobs + class ScheduledMetricsCleanup < BaseJob + @queue = :normal + + def self.job_type + :scheduled_metrics_cleanup + end + + def self.has_work(params) + return false if params.first['retention_days'] <= 0 + + metrics_dir = Config.metrics_dir + return false unless File.directory?(metrics_dir) + + cutoff_time = time_days_ago(params.first['retention_days']) + + # Check if there are any files older than retention period + Dir.glob(File.join(metrics_dir, 'metric_*.bin')).any? do |file| + File.mtime(file) < cutoff_time + end + end + + def self.time_days_ago(days) + Time.now - (days * 24 * 60 * 60) + end + + def self.schedule_message + 'clean up stale metrics files' + end + + def initialize(params = {}) # rubocop:disable Lint/MissingSuper + @retention_days = params['retention_days'] + @metrics_dir = Config.metrics_dir + end + + def perform + return 'Metrics cleanup disabled (retention_days is 0)' if @retention_days <= 0 + return "Metrics directory does not exist: #{@metrics_dir}" unless File.directory?(@metrics_dir) + + cutoff_time = self.class.time_days_ago(@retention_days) + logger.info("Started cleanup of metrics files older than #{cutoff_time} from #{@metrics_dir}") + + files_to_delete = stale_files(cutoff_time) + deleted_count, failed_count = delete_files(files_to_delete) + + output = "Deleted #{deleted_count} metrics file(s) older than #{cutoff_time}." + output << " Failed to delete #{failed_count} file(s)." if failed_count.positive? 
+ logger.info(output) + output + end + + private + + def stale_files(cutoff_time) + Dir.glob(File.join(@metrics_dir, 'metric_*.bin')).select do |file| + File.mtime(file) < cutoff_time + end + end + + def delete_files(files) + deleted_count = 0 + failed_count = 0 + + files.each do |file| + File.delete(file) + deleted_count += 1 + logger.debug("Deleted metrics file: #{file}") + rescue StandardError => e + failed_count += 1 + logger.warn("Failed to delete metrics file #{file}: #{e.message}") + end + + [deleted_count, failed_count] + end + end + end +end diff --git a/src/bosh-director/spec/unit/bosh/director/jobs/scheduled_metrics_cleanup_spec.rb b/src/bosh-director/spec/unit/bosh/director/jobs/scheduled_metrics_cleanup_spec.rb new file mode 100644 index 00000000000..6117f7b29b2 --- /dev/null +++ b/src/bosh-director/spec/unit/bosh/director/jobs/scheduled_metrics_cleanup_spec.rb @@ -0,0 +1,205 @@ +require 'spec_helper' + +module Bosh::Director + describe Jobs::ScheduledMetricsCleanup do + subject { described_class.new(*params) } + let(:params) do + [{ + 'retention_days' => retention_days, + }] + end + let(:retention_days) { 7 } + let(:metrics_dir) { Dir.mktmpdir } + let(:time) { Time.now } + let(:seven_days_seconds) { 7 * 24 * 60 * 60 } + let(:eight_days_ago) { time - seven_days_seconds - 86400 } + let(:six_days_ago) { time - seven_days_seconds + 86400 } + + before do + allow(Config).to receive(:metrics_dir).and_return(metrics_dir) + allow(Time).to receive(:now).and_return(time) + end + + after do + FileUtils.rm_rf(metrics_dir) if File.directory?(metrics_dir) + end + + describe '.job_type' do + it 'returns the job type' do + expect(described_class.job_type).to eq(:scheduled_metrics_cleanup) + end + end + + describe '.schedule_message' do + it 'outputs a message' do + expect(described_class.schedule_message).to eq('clean up stale metrics files') + end + end + + describe '.time_days_ago' do + it 'calculates time correctly' do + expect(described_class.time_days_ago(7)).to 
eq(time - seven_days_seconds) + end + end + + describe '.has_work' do + context 'when retention_days is 0' do + let(:retention_days) { 0 } + + it 'returns false' do + expect(described_class.has_work(params)).to eq(false) + end + end + + context 'when metrics directory does not exist' do + before do + FileUtils.rm_rf(metrics_dir) + end + + it 'returns false' do + expect(described_class.has_work(params)).to eq(false) + end + end + + context 'when there are stale files' do + before do + old_file = File.join(metrics_dir, 'metric_old.bin') + File.write(old_file, 'data') + File.utime(eight_days_ago, eight_days_ago, old_file) + end + + it 'returns true' do + expect(described_class.has_work(params)).to eq(true) + end + end + + context 'when there are no stale files' do + before do + recent_file = File.join(metrics_dir, 'metric_recent.bin') + File.write(recent_file, 'data') + File.utime(six_days_ago, six_days_ago, recent_file) + end + + it 'returns false' do + expect(described_class.has_work(params)).to eq(false) + end + end + end + + describe '#perform' do + context 'when retention_days is 0' do + let(:retention_days) { 0 } + + it 'returns disabled message' do + expect(subject.perform).to eq('Metrics cleanup disabled (retention_days is 0)') + end + end + + context 'when metrics directory does not exist' do + before do + FileUtils.rm_rf(metrics_dir) + end + + it 'returns directory not exist message' do + expect(subject.perform).to eq("Metrics directory does not exist: #{metrics_dir}") + end + end + + context 'when there are files to clean up' do + let!(:old_file_1) { File.join(metrics_dir, 'metric_old_1.bin') } + let!(:old_file_2) { File.join(metrics_dir, 'metric_old_2.bin') } + let!(:recent_file) { File.join(metrics_dir, 'metric_recent.bin') } + let!(:other_file) { File.join(metrics_dir, 'other_file.txt') } + + before do + # Create old files (older than retention period) + File.write(old_file_1, 'data1') + File.utime(eight_days_ago, eight_days_ago, old_file_1) + + 
File.write(old_file_2, 'data2') + File.utime(eight_days_ago, eight_days_ago, old_file_2) + + # Create recent file (within retention period) + File.write(recent_file, 'data3') + File.utime(six_days_ago, six_days_ago, recent_file) + + # Create non-metric file (should not be deleted) + File.write(other_file, 'other') + File.utime(eight_days_ago, eight_days_ago, other_file) + end + + it 'deletes only old metric files' do + subject.perform + + expect(File.exist?(old_file_1)).to eq(false) + expect(File.exist?(old_file_2)).to eq(false) + expect(File.exist?(recent_file)).to eq(true) + expect(File.exist?(other_file)).to eq(true) + end + + it 'returns success message with count' do + cutoff_time = time - seven_days_seconds + expect(subject.perform).to eq("Deleted 2 metrics file(s) older than #{cutoff_time}.") + end + + it 'logs the cleanup operation' do + logger = double('logger', info: nil, debug: nil, warn: nil) + allow(subject).to receive(:logger).and_return(logger) + + subject.perform + + expect(logger).to have_received(:info).at_least(:once) + end + end + + context 'when file deletion fails' do + let!(:protected_file) { File.join(metrics_dir, 'metric_protected.bin') } + + before do + File.write(protected_file, 'data') + File.utime(eight_days_ago, eight_days_ago, protected_file) + allow(File).to receive(:delete).with(protected_file).and_raise(Errno::EACCES, 'Permission denied') + end + + it 'logs warning and continues' do + logger = double('logger', info: nil, debug: nil, warn: nil) + allow(subject).to receive(:logger).and_return(logger) + + result = subject.perform + + expect(logger).to have_received(:warn).with(/Failed to delete metrics file/) + expect(result).to match(/Failed to delete 1 file\(s\)/) + end + + it 'includes failure count in result message' do + cutoff_time = time - seven_days_seconds + result = subject.perform + expect(result).to eq("Deleted 0 metrics file(s) older than #{cutoff_time}. 
Failed to delete 1 file(s).") + end + end + + context 'when there are no files to clean up' do + it 'returns message with zero count' do + cutoff_time = time - seven_days_seconds + expect(subject.perform).to eq("Deleted 0 metrics file(s) older than #{cutoff_time}.") + end + end + + context 'with different retention periods' do + let(:retention_days) { 30 } + let(:thirty_one_days_ago) { time - (31 * 24 * 60 * 60) } + let!(:very_old_file) { File.join(metrics_dir, 'metric_very_old.bin') } + + before do + File.write(very_old_file, 'data') + File.utime(thirty_one_days_ago, thirty_one_days_ago, very_old_file) + end + + it 'respects the configured retention period' do + subject.perform + expect(File.exist?(very_old_file)).to eq(false) + end + end + end + end +end