diff --git a/.alltests b/.alltests new file mode 100755 index 0000000000..75ccf9a527 --- /dev/null +++ b/.alltests @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e + +TOP_DIR=$(dirname $(realpath "$0")) + +echo "==== Unit tests ====" +resetswift +$TOP_DIR/.unittests $@ + +echo "==== Func tests ====" +resetswift +startmain +$TOP_DIR/.functests $@ + +echo "==== Probe tests ====" +resetswift +$TOP_DIR/.probetests $@ + +echo "All tests runs fine" + +exit 0 diff --git a/.coveragerc b/.coveragerc index 5893dbe75f..6f8f341733 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,4 +3,5 @@ branch = True omit = /usr*,setup.py,*egg*,.venv/*,.tox/*,test/* [report] -ignore-errors = True +show_missing = True +ignore_errors = True diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..e8bdd7d5cc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +.tox +api-ref +cover +doc/manpages +doc/s3api +doc/source +examples +releasenotes +.stestr +test +tools diff --git a/.functests b/.functests index 9797f58a1e..3d1af25b73 100755 --- a/.functests +++ b/.functests @@ -1,15 +1,15 @@ #!/bin/bash -SRC_DIR=$(dirname $0) +# How-To debug functional tests: +# SWIFT_TEST_IN_PROCESS=1 tox -e func -- --pdb test.functional.tests.TestFile.testCopy -cd ${SRC_DIR}/test/functional -nosetests --exe $@ -func1=$? -cd - +SRC_DIR=$(dirname $(realpath "$0")) -cd ${SRC_DIR}/test/functionalnosetests -nosetests --exe $@ -func2=$? -cd - +cd ${SRC_DIR} > /dev/null +export TESTS_DIR=${SRC_DIR}/test/functional +ARGS="run --concurrency 1 $@" +stestr $ARGS || stestr run --concurrency 1 --failing +rvalue=$? +cd - > /dev/null -exit $((func1 + func2)) +exit $rvalue diff --git a/.gitignore b/.gitignore index ae7e6fcc59..46ae0a33b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,27 @@ *.py[co] *.sw? +*~ doc/build/* dist +build +cover ChangeLog .coverage -swift.egg-info +*.egg +*.egg-info +.eggs/* .DS_Store .tox -pycscope.* \ No newline at end of file +pycscope.* +.idea +MANIFEST + +.testrepository/* +.stestr/* +.noseids +subunit.log +test/probe/.noseids +RELEASENOTES.rst +releasenotes/notes/reno.cache +/tools/playbooks/**/*.retry +.vscode/* diff --git a/.gitreview b/.gitreview index d7c52c0593..529e7ec93d 100644 --- a/.gitreview +++ b/.gitreview @@ -1,4 +1,4 @@ [gerrit] -host=review.openstack.org +host=review.opendev.org port=29418 project=openstack/swift.git diff --git a/.mailmap b/.mailmap index c0d6e31d9a..adcdf007d9 100644 --- a/.mailmap +++ b/.mailmap @@ -2,6 +2,7 @@ Greg Holt gholt Greg Holt gholt Greg Holt gholt Greg Holt gholt +Greg Holt Greg Holt John Dickinson Michael Barton @@ -9,6 +10,7 @@ Michael Barton Michael Barton Mike Barton Clay Gerrard Clay Gerrard +Clay Gerrard Clay Gerrard clayg David Goetz David Goetz @@ -17,19 +19,133 @@ Anne Gentle annegentle Fujita Tomonori Greg Lange Greg Lange +Chmouel Boudjnah Gaurav B. Gangalwar gaurav@gluster.com <> Joe Arnold Kapil Thangavelu kapil.foss@gmail.com <> Samuel Merritt +Samuel Merritt Morita Kazutaka -Zhong Yue Luo lzyeval +Zhongyue Luo Russ Nelson Marcelo Martins Andrew Clay Shafer Soren Hansen +Soren Hansen Ye Jia Xu monsterxx03 Victor Rodionov Florian Hines Jay Payne Doug Weimer Li Riqiang lrqrun +Cory Wright +Julien Danjou +David Hadas +Yaguang Wang ywang19 +Liu Siqi dk647 +James E. Blair +James E. 
Blair +Kun Huang +Michael Shuler +Ilya Kharin +Dmitry Ukov Ukov Dmitry +Tom Fifield Tom Fifield +Sascha Peilicke Sascha Peilicke +Zhenguo Niu +Peter Portante +Christian Schwede +Christian Schwede +Constantine Peresypkin +Madhuri Kumari madhuri +Morgan Fainberg +Hua Zhang +Yummy Bian +Alistair Coles +Alistair Coles +Tong Li +Paul Luse +Yuan Zhou +Jola Mirecka +Ning Zhang +Mauro Stettler +Pawel Palucki +Guang Yee +Jing Liuqing +Lorcan Browne +Eohyung Lee +Harshit Chitalia +Richard Hawkins +Sarvesh Ranjan +Minwoo Bae Minwoo B +Jaivish Kothari +Michael Matur +Kazuhiro Miyahara +Alexandra Settle +Kenichiro Matsuda +Atsushi Sakai +Takashi Natsume +Nakagawa Masaaki nakagawamsa +Romain Le Disez Romain LE DISEZ +Romain Le Disez +Donagh McCabe +Donagh McCabe +Eamonn O'Toole +Gerry Drudy +Mark Seger +Timur Alperovich +Mehdi Abaakouk +Richard Hawkins +Ondrej Novy +Ondrej Novy +Peter Lisák +Peter Lisák Peter Lisák +Ke Liang +Daisuke Morita +Andreas Jaeger +Hugo Kuo +Gage Hugo +Oshrit Feder +Larry Rensing +Ben Keller +Chaozhe Chen +Brian Cline +Brian Cline +Dharmendra Kushwaha +Zhang Guoqing +Kato Tomoyuki +Liang Jingtao +Yu Yafei +Zheng Yao +Paul Dardeau +Cheng Li +Nandini Tata +Flavio Percoco +Tin Lam +Hisashi Osanai +Bryan Keller +Doug Hellmann +zhangdebo1987 zhangdebo +Thomas Goirand +Thiago da Silva +Kota Tsuyuzaki +Kota Tsuyuzaki +Kota Tsuyuzaki +Ehud Kaldor +Takashi Kajinami +Takashi Kajinami +Yuxin Wang Wang Yuxin +Gilles Biannic gillesbiannic +melissaml +Ashwin Nair indianwhocodes +Romain de Joux +Takashi Natsume +Pete Zaitcev +Mandell Degerness +Anish Kachinthaya +Daanish Khan +Chris Smart +Ashwin Nair +Chinemerem Chigbo Chinemerem +Chinemerem Chigbo +Tim Burke +Christian Ohanaja diff --git a/.manpages b/.manpages new file mode 100755 index 0000000000..69fcfc74d5 --- /dev/null +++ b/.manpages @@ -0,0 +1,18 @@ +#!/bin/sh + +RET=0 +for MAN in doc/manpages/* ; do + OUTPUT=$(LC_ALL=en_US.UTF-8 MANROFFSEQ='' MANWIDTH=80 man --warnings -E UTF-8 -l \ + -Tutf8 -Z "$MAN" 2>&1 >/dev/null) + if [ -n "$OUTPUT" ] ; then + RET=1 + echo "$MAN:" + echo "$OUTPUT" + fi +done + +if [ "$RET" -eq "0" ] ; then + echo "All manpages are fine" +fi + +exit "$RET" diff --git a/.probetests b/.probetests index 5d9f69915e..9a7aafda63 100755 --- a/.probetests +++ b/.probetests @@ -1,7 +1,9 @@ #!/bin/bash -cd $(dirname $0)/test/probe -nosetests --exe +SRC_DIR=$(dirname $(realpath "$0")) + +cd ${SRC_DIR}/test/probe +pytest $@ rvalue=$? cd - diff --git a/.stestr.conf b/.stestr.conf new file mode 100644 index 0000000000..3e674fb026 --- /dev/null +++ b/.stestr.conf @@ -0,0 +1,2 @@ +[DEFAULT] +test_path=./test/functional diff --git a/.unittests b/.unittests index 6fc2eb1ed9..9873b17974 100755 --- a/.unittests +++ b/.unittests @@ -1,7 +1,9 @@ #!/bin/bash -cd $(dirname $0)/test/unit -nosetests --exe --with-coverage --cover-package swift --cover-erase $@ +TOP_DIR=$(dirname $(realpath "$0")) + +cd $TOP_DIR/test/unit +pytest --cov-report=html:"$TOP_DIR"/cover $@ rvalue=$? rm -f .coverage cd - diff --git a/.zuul.yaml b/.zuul.yaml new file mode 100644 index 0000000000..c580994d2f --- /dev/null +++ b/.zuul.yaml @@ -0,0 +1,773 @@ +- job: + name: swift-tox-base + parent: openstack-tox-py310 + nodeset: ubuntu-jammy + description: | + Base job for swift-tox jobs. + + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. 
+ timeout: 5400 + vars: + tox_environment: + TMPDIR: '{{ ansible_env.HOME }}/xfstmp' + COLUMNS: 240 + +- job: + name: swift-tox-py37 + parent: swift-tox-base + description: | + Run unit-tests for swift under cPython version 3.7. + + Uses tox with the ``py37`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py37 + python_use_pyenv: True + python_version: 3.7 + post-run: tools/playbooks/common/cover-post.yaml + +- job: + name: swift-tox-py38 + parent: swift-tox-base + nodeset: ubuntu-focal + description: | + Run unit-tests for swift under cPython version 3.8. + + Uses tox with the ``py38`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py38 + python_version: 3.8 + post-run: tools/playbooks/common/cover-post.yaml + +- job: + name: swift-tox-py38-arm64 + parent: swift-tox-py38 + nodeset: ubuntu-focal-arm64 + description: | + Run unit tests for an OpenStack Python project under cPython + version 3.8 on top of arm64 architecture. + timeout: 10800 + +- job: + name: swift-tox-py312-arm64 + parent: swift-tox-py312 + nodeset: debian-bookworm-arm64 + description: | + Run unit tests for an OpenStack Python project under cPython + version 3.12 on top of arm64 architecture. + timeout: 10800 + +- job: + name: swift-tox-py39 + parent: swift-tox-base + nodeset: ubuntu-focal + description: | + Run unit-tests for swift under cPython version 3.9. + + Uses tox with the ``py39`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py39 + python_version: 3.9 + post-run: tools/playbooks/common/cover-post.yaml + +- job: + name: swift-tox-py310 + parent: swift-tox-base + nodeset: ubuntu-jammy + description: | + Run unit-tests for swift under cPython version 3.10. + + Uses tox with the ``py310`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py310 + python_version: '3.10' + post-run: tools/playbooks/common/cover-post.yaml + +- job: + name: swift-tox-py311 + parent: swift-tox-base + nodeset: ubuntu-jammy + description: | + Run unit-tests for swift under cPython version 3.11. + + Uses tox with the ``py311`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py311 + python_version: '3.11' + +- job: + name: swift-tox-py312 + parent: swift-tox-base + nodeset: debian-bookworm + description: | + Run unit-tests for swift under cPython version 3.12. + + Uses tox with the ``py312`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py312 + python_use_pyenv: True + python_version: '3.12' + +- job: + name: swift-tox-py313 + parent: swift-tox-base + nodeset: ubuntu-noble + description: | + Run unit-tests for swift under cPython version 3.13. + + Uses tox with the ``py313`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: py313 + python_use_pyenv: True + python_version: '3.13' + +- job: + name: swift-tox-func-py312 + parent: swift-tox-base + nodeset: ubuntu-noble + description: | + Run functional tests for swift under cPython version 3.12. + + Uses tox with the ``func-py3`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. 
+ vars: + tox_envlist: func + python_version: 3.12 + +- job: + name: swift-tox-func-py39-centos-9-stream + parent: swift-tox-func-py312 + nodeset: centos-9-stream + vars: + python_version: 3.9 + +- job: + name: swift-tox-func-encryption-py39-centos-9-stream + parent: swift-tox-func-py39-centos-9-stream + vars: + tox_envlist: func-encryption + +- job: + name: swift-tox-func-ec-py39-centos-9-stream + parent: swift-tox-func-py39-centos-9-stream + vars: + tox_envlist: func-ec + +- job: + name: swift-tox-func-encryption-py312 + parent: swift-tox-func-py312 + description: | + Run functional tests for swift under cPython version 3.12. + + Uses tox with the ``func-encryption-py3`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: func-encryption + +- job: + name: swift-tox-func-encryption-py312-arm64 + parent: swift-tox-func-encryption-py312 + nodeset: ubuntu-noble-arm64 + description: | + Run functional tests for swift under cPython version 3.12 + on top of arm64 architecture. + + Uses tox with the ``func-encryption-py3`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + timeout: 10800 + +- job: + name: swift-tox-func-py312-arm64 + parent: swift-tox-func-py312 + nodeset: ubuntu-noble-arm64 + description: | + Run functional tests for swift under cPython version 3.12 + on top of arm64 architecture. + + Uses tox with the ``func-py3`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + timeout: 10800 + +- job: + name: swift-tox-func-ec-py312 + parent: swift-tox-func-py312 + description: | + Run functional tests for swift under cPython version 3.12. + + Uses tox with the ``func-ec-py3`` environment. + It sets TMPDIR to an XFS mount point created via + tools/test-setup.sh. + vars: + tox_envlist: func-ec + +- job: + name: swift-dsvm-functional + parent: devstack-minimal + description: | + Setup a Swift/Keystone environment and run Swift's func tests. + required-projects: + - opendev.org/openstack/requirements + - opendev.org/openstack/swift + - opendev.org/openstack/keystone + timeout: 5400 + vars: + tox_environment: + COLUMNS: 240 + tox_constraints_file: '{{ ansible_user_dir }}/src/opendev.org/openstack/requirements/upper-constraints.txt' + # This tox env get run twice; once for Keystone and once for tempauth + tox_envlist: func,s3api + devstack_localrc: + SWIFT_HASH: changeme + # We don't need multiple replicas to run purely functional tests. + # In fact, devstack special cases some things when there's only + # one replica. + SWIFT_REPLICAS: 1 + # One replica => no need for replicators, etc. 
+ SWIFT_START_ALL_SERVICES: False + devstack_services: + keystone: true + swift: true + s3api: true + zuul_work_dir: src/opendev.org/openstack/swift + pre-run: tools/playbooks/dsvm/pre.yaml + run: tools/playbooks/dsvm/run.yaml + post-run: tools/playbooks/dsvm/post.yaml + +- job: + name: swift-dsvm-functional-ipv6 + parent: swift-dsvm-functional + vars: + devstack_localrc: + SERVICE_IP_VERSION: 6 + SERVICE_HOST: "" + +- job: + name: swift-tox-func-ceph-s3tests-tempauth + parent: unittests + voting: false + nodeset: centos-9-stream + description: | + Setup a SAIO dev environment and run ceph-s3tests + timeout: 5400 + vars: + tox_environment: + COLUMNS: 240 + s3_acl: yes + pre-run: + - tools/playbooks/common/install_dependencies.yaml + - tools/playbooks/saio_single_node_setup/setup_saio.yaml + - tools/playbooks/saio_single_node_setup/add_s3api.yaml + - tools/playbooks/saio_single_node_setup/make_rings.yaml + - tools/playbooks/common/restart_swift.yaml + run: tools/playbooks/ceph-s3tests/run.yaml + post-run: + - tools/playbooks/probetests/post.yaml + - tools/playbooks/ceph-s3tests/post.yaml + +- job: + name: swift-tox-func-s3api-compat-tests-tempauth + parent: unittests + nodeset: centos-9-stream + description: | + Setup a SAIO dev environment and run our s3api test suite + timeout: 1800 + vars: + tox_environment: + COLUMNS: 240 + s3_acl: yes + pre-run: + - tools/playbooks/common/install_dependencies.yaml + - tools/playbooks/saio_single_node_setup/setup_saio.yaml + - tools/playbooks/saio_single_node_setup/add_s3api.yaml + - tools/playbooks/saio_single_node_setup/make_rings.yaml + - tools/playbooks/common/restart_swift.yaml + run: tools/playbooks/s3api-tests/run.yaml + post-run: + - tools/playbooks/probetests/post.yaml + +- job: + name: swift-probetests-centos-9-stream + parent: unittests + nodeset: centos-9-stream + description: | + Setup a SAIO dev environment and run Swift's probe tests + under Python 3. + timeout: 7200 + vars: + tox_environment: + COLUMNS: 240 + s3_acl: no + pre-run: + - tools/playbooks/common/install_dependencies.yaml + - tools/playbooks/saio_single_node_setup/setup_saio.yaml + - tools/playbooks/saio_single_node_setup/make_rings.yaml + - tools/playbooks/saio_single_node_setup/add_s3api.yaml + run: tools/playbooks/probetests/run.yaml + post-run: tools/playbooks/probetests/post.yaml + +- job: + name: swift-probetests-centos-9-stream-arm64 + parent: swift-probetests-centos-9-stream + nodeset: + nodes: + - name: swift-centos-9-stream-arm64 + label: centos-9-stream-arm64 + description: | + Setup a SAIO dev environment and run Swift's probe tests + under Python 3 on top of arm64 architecture. 
+ timeout: 10800 + +- job: + name: swift-func-cors + parent: swift-probetests-centos-9-stream + description: | + Setup a SAIO dev environment and run Swift's CORS functional tests + timeout: 1200 + pre-run: + - tools/playbooks/cors/install_selenium.yaml + run: tools/playbooks/cors/run.yaml + post-run: tools/playbooks/cors/post.yaml + +- nodeset: + name: swift-five-nodes-centos-9 + nodes: + - name: test-runner1 + label: centos-9-stream + - name: proxy1 + label: centos-9-stream + - name: account1 + label: centos-9-stream + - name: container1 + label: centos-9-stream + - name: object1 + label: centos-9-stream + groups: + - name: test-runner + nodes: + - test-runner1 + - name: swift-cluster + nodes: + - proxy1 + - account1 + - container1 + - object1 + - name: proxy + nodes: + - proxy1 + - name: account + nodes: + - account1 + - name: container + nodes: + - container1 + - name: object + nodes: + - object1 + - name: storage + nodes: + - account1 + - container1 + - object1 + +- job: + name: swift-multinode-rolling-upgrade + parent: multinode + nodeset: swift-five-nodes-centos-9 + description: | + Build a 4 node swift cluster and run functional tests + timeout: 5400 + vars: + tox_envlist: func + pre-run: + - tools/playbooks/multinode_setup/pre.yaml + - tools/playbooks/common/install_dependencies.yaml + - tools/playbooks/multinode_setup/configure_loopback.yaml + - tools/playbooks/multinode_setup/common_config.yaml + - tools/playbooks/multinode_setup/make_rings.yaml + run: tools/playbooks/multinode_setup/run.yaml + post-run: tools/playbooks/probetests/post.yaml + +- job: + name: swift-multinode-rolling-upgrade-wallaby + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: wallaby-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-xena + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: xena-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-yoga + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: yoga-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-zed + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: zed-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-antelope + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: 2023.1-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-bobcat + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: 2023.2-eol + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-caracal + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: 2024.1-eom + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-dalmatian + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: origin/stable/2024.2 + tox_envlist: func-py3 + +- job: + name: swift-multinode-rolling-upgrade-epoxy + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: origin/stable/2025.1 + +- job: + name: swift-multinode-rolling-upgrade-flamingo + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: origin/stable/2025.2 + +- job: + name: swift-multinode-rolling-upgrade-master + parent: swift-multinode-rolling-upgrade + vars: + previous_swift_version: origin/master + +- job: + name: swift-tox-lower-constraints + parent: openstack-tox-lower-constraints + vars: + python_use_pyenv: True + python_version: 3.7 + tox_environment: + TMPDIR: 
'{{ ansible_env.HOME }}/xfstmp' + +# Image building jobs +- secret: + name: swift-dockerhub-2026 + data: + username: screamingfrenzy + password: !encrypted/pkcs1-oaep + # Updated via `zuul-client --zuul-url https://zuul.opendev.org encrypt --tenant openstack --project openstack/swift --secret-name swift-dockerhub-2026 --infile /tmp/secret --field-name password` + # Be careful not to include any trailing newlines in the secret file! It should contain *exactly* the password: no more, no less. + # Note that we've seen issues with some special characters causing issues in the ansible plumbing; new passwords should probably avoid using double quotes ("), single quotes ('), and dollar signs ($). + - R0AnQRtM0ObjNvqQgc/E9KH1o4CeNwzPB4jUc0ji6BHjHL4zXPi1P29L1BZgc+MlG/Ps/ + SLHTQPyUJpmrlVyFKBhvQcs+0qpeF56TqHYN1sW0j7IHHRLvfRHzKO5oarWeVWJi34PRx + cGdXIT3oYvwWq8D6Lr/XgBOH4eotpfxxR7IYUM7Ti07MHBiVtzZHMvV0/I/YzkaK1oZ2C + n5AYmII/h57YtgTpOOM5dHL3Q6qOaQ4s9nkvyia5R7aRjZW+L1MqP9UjpslGF5AXkAoI5 + xafRwXdCC7L7cwktmvWUgH8mLjoBhVMZR9wn8IqrsrwUI0qs1QvI6N3mC+lIFTdFfDyfx + bJy8umeIkdZ5AmuDEstpb/PORW4q+JfRPFtxtBPxj7E5GrYC5AaBId+8/nHy0vQmcYfON + RT0/lQ+xTlfEkSu2PkIoQsXG1Xxdmeb+j/dG/eTlxQVCyza1DITzOfCLwwrdvHxJvez8M + CvUoS5q0+R+9NRvM1Pc+qTJM7GIe2MNwCn1TcClJd0tspcm2I+uxf/LhX/53qGzs94YrO + +ccqWd4W1ejajbIhCI9VmxhudxJfxxly17g5keJkxDH+akKg4vbg78agiS3NhIkFIiKOn + zQGlA92Ufvd7VnMXJvH+wzK+9RV822i0f/Vfz1aE53Xq1r0bnUPtYFiNlpgrn0= + +- job: + name: swift-build-image + parent: opendev-build-docker-image + voting: false + description: Build SAIO docker images. + vars: &swift_image_vars + docker_images: + - context: . + repository: openstackswift/saio + tags: + - latest + - py3 + +- job: + name: swift-upload-image + parent: opendev-upload-docker-image + voting: false + description: Build SAIO docker images and upload to Docker Hub. + secrets: + name: docker_credentials + secret: swift-dockerhub-2026 + pass-to-parent: true + vars: *swift_image_vars + +- job: + name: swift-promote-image + parent: opendev-promote-docker-image + voting: false + description: Promote previously uploaded Docker images. + secrets: + name: docker_credentials + secret: swift-dockerhub-2026 + pass-to-parent: true + vars: *swift_image_vars + +- job: + name: swift-tox-func-py39-centos-9-stream-fips + parent: swift-tox-func-py39-centos-9-stream + voting: false + description: | + Functional testing on a FIPS enabled Centos 9 system + vars: + nslookup_target: 'opendev.org' + enable_fips: true + +- job: + name: swift-tox-func-encryption-py39-centos-9-stream-fips + parent: swift-tox-func-encryption-py39-centos-9-stream + voting: false + description: | + Functional encryption testing on a FIPS enabled + Centos 9 system + vars: + nslookup_target: 'opendev.org' + enable_fips: true + +- job: + name: swift-tox-func-ec-py39-centos-9-stream-fips + parent: swift-tox-func-ec-py39-centos-9-stream + voting: false + description: | + Functional EC testing on a FIPS enabled Centos 9 system + vars: + nslookup_target: 'opendev.org' + enable_fips: true + +# TODO(gmann): As per the 2025.1 testing runtime, we need to run at least +# one job on jammy. This job can be removed as per the future testing +# runtime (whenever we start testing Ubuntu 26.04 as default version). 
+- job: + name: tempest-integrated-object-storage-ubuntu-jammy + description: This is integrated object-storage job testing on Ubuntu jammy(22.04) + parent: tempest-integrated-object-storage + nodeset: openstack-single-node-jammy + +- project-template: + name: swift-jobs-arm64 + description: | + Runs tests for an OpenStack Python project under the CPython + version 3 releases designated for testing on top of ARM64 architecture. + check-arm64: + jobs: + - swift-tox-py312-arm64 + experimental: + jobs: + - swift-tox-py38-arm64 + - swift-tox-func-encryption-py312-arm64 + - swift-tox-func-py312-arm64 + +- project: + templates: + - publish-openstack-docs-pti + - periodic-stable-jobs + - check-requirements + - release-notes-jobs-python3 + - integrated-gate-object-storage + - swift-jobs-arm64 + check: + jobs: + - swift-tox-func-py39-centos-9-stream-fips: + irrelevant-files: &functest-irrelevant-files + - ^(api-ref|doc|releasenotes)/.*$ + - ^test/(cors|probe|s3api)/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG|.*\.rst)$ + - swift-tox-func-encryption-py39-centos-9-stream-fips: + irrelevant-files: *functest-irrelevant-files + - swift-tox-func-ec-py39-centos-9-stream-fips: + irrelevant-files: *functest-irrelevant-files + - swift-build-image: + irrelevant-files: &docker-irrelevant-files + - ^(api-ref|doc|releasenotes)/.*$ + - ^test/(functional|probe)/.*$ + + # Unit tests + - swift-tox-py37: + irrelevant-files: &unittest-irrelevant-files + - ^(api-ref|doc|releasenotes)/.*$ + - ^test/(cors|functional|probe|s3api)/.*$ + - swift-tox-py39: + irrelevant-files: *unittest-irrelevant-files + - swift-tox-py312: + irrelevant-files: *unittest-irrelevant-files + - swift-tox-py313: + irrelevant-files: *unittest-irrelevant-files + + # Functional tests + - swift-tox-func-py312: + irrelevant-files: *functest-irrelevant-files + - swift-tox-func-encryption-py312: + irrelevant-files: *functest-irrelevant-files + - swift-tox-func-ec-py312: + irrelevant-files: *functest-irrelevant-files + + # Other tests + - swift-func-cors: + irrelevant-files: + - ^(api-ref|releasenotes)/.*$ + # Keep doc/saio -- we use those sample configs in the saio playbooks + - ^doc/(requirements.txt|(manpages|s3api|source)/.*)$ + - ^test/(unit|functional|probe|s3api)/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG)$ + - swift-tox-func-ceph-s3tests-tempauth: + irrelevant-files: + - ^(api-ref|releasenotes)/.*$ + # Keep doc/saio -- we use those sample configs in the saio playbooks + # Also keep doc/s3api -- it holds known failures for these tests + - ^doc/(requirements.txt|(manpages|source)/.*)$ + - ^test/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG|.*\.rst)$ + - swift-tox-func-s3api-compat-tests-tempauth: + irrelevant-files: + - ^(api-ref|releasenotes)/.*$ + # Keep doc/saio -- we use those sample configs in the saio playbooks + - ^doc/(requirements.txt|(manpages|s3api|source)/.*)$ + - ^test/(cors|unit|functional|probe)/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG|.*\.rst)$ + - swift-probetests-centos-9-stream: + irrelevant-files: &probetest-irrelevant-files + - ^(api-ref|releasenotes)/.*$ + # Keep doc/saio -- we use those sample configs in the saio playbooks + - ^doc/(requirements.txt|(manpages|s3api|source)/.*)$ + - ^test/(cors|unit|functional|s3api)/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG|.*\.rst)$ + - swift-dsvm-functional: + irrelevant-files: *functest-irrelevant-files + - swift-dsvm-functional-ipv6: + irrelevant-files: *functest-irrelevant-files + - swift-tox-lower-constraints: + irrelevant-files: *unittest-irrelevant-files + - 
openstack-tox-pep8: + irrelevant-files: &pep8-irrelevant-files + - ^(api-ref|etc|examples|releasenotes)/.*$ + # Keep doc/manpages -- we want to syntax check them + - ^doc/(requirements.txt|(saio|s3api|source)/.*)$ + - swift-multinode-rolling-upgrade: + irrelevant-files: *functest-irrelevant-files + - tempest-integrated-object-storage: + irrelevant-files: &tempest-irrelevant-files + - ^(api-ref|doc|releasenotes)/.*$ + - ^test/.*$ + - ^(.gitreview|.mailmap|AUTHORS|CHANGELOG|.*\.rst)$ + - tempest-integrated-object-storage-ubuntu-jammy: + irrelevant-files: *tempest-irrelevant-files + - tempest-ipv6-only: + irrelevant-files: *tempest-irrelevant-files + - openstacksdk-functional-devstack: + irrelevant-files: *tempest-irrelevant-files + - grenade: + irrelevant-files: *tempest-irrelevant-files + - grenade-skip-level: + irrelevant-files: *tempest-irrelevant-files + - grenade-skip-level-always: + irrelevant-files: *tempest-irrelevant-files + gate: + jobs: + # For gate jobs, err towards running more jobs (so, generally avoid + # using irrelevant-files). Exceptions should mainly be made for + # long-running jobs, like probetests or (once they move to + # in-tree definitions) dsvm jobs. + - swift-upload-image: + irrelevant-files: *docker-irrelevant-files + - swift-tox-py37 + - swift-tox-py39 + - swift-tox-py312 + - swift-tox-py313 + - swift-tox-func-py312 + - swift-tox-func-encryption-py312 + - swift-tox-func-ec-py312 + - swift-func-cors + - swift-tox-func-s3api-compat-tests-tempauth + - swift-probetests-centos-9-stream: + irrelevant-files: *probetest-irrelevant-files + - swift-dsvm-functional: + irrelevant-files: *functest-irrelevant-files + - swift-dsvm-functional-ipv6: + irrelevant-files: *functest-irrelevant-files + - swift-tox-lower-constraints: + irrelevant-files: *unittest-irrelevant-files + - openstack-tox-pep8: + irrelevant-files: *pep8-irrelevant-files + - swift-multinode-rolling-upgrade: + irrelevant-files: *functest-irrelevant-files + - tempest-integrated-object-storage: + irrelevant-files: *tempest-irrelevant-files + - tempest-integrated-object-storage-ubuntu-jammy: + irrelevant-files: *tempest-irrelevant-files + - tempest-ipv6-only: + irrelevant-files: *tempest-irrelevant-files + - openstacksdk-functional-devstack: + irrelevant-files: *tempest-irrelevant-files + - grenade: + irrelevant-files: *tempest-irrelevant-files + - grenade-skip-level: + irrelevant-files: *tempest-irrelevant-files + - grenade-skip-level-always: + irrelevant-files: *tempest-irrelevant-files + experimental: + jobs: + - swift-tox-py38 + - swift-tox-py310 + - swift-tox-py311 + - swift-tox-func-py39-centos-9-stream + - swift-tox-func-encryption-py39-centos-9-stream + - swift-tox-func-ec-py39-centos-9-stream + - swift-multinode-rolling-upgrade-wallaby + - swift-multinode-rolling-upgrade-xena + - swift-multinode-rolling-upgrade-yoga + - swift-multinode-rolling-upgrade-zed + - swift-multinode-rolling-upgrade-antelope + - swift-multinode-rolling-upgrade-bobcat + - swift-multinode-rolling-upgrade-caracal + - swift-multinode-rolling-upgrade-dalmatian + - swift-multinode-rolling-upgrade-epoxy + - swift-multinode-rolling-upgrade-flamingo + - swift-multinode-rolling-upgrade-master: + branches: master + + post: + jobs: + - publish-openstack-python-branch-tarball + promote: + jobs: + - swift-promote-image diff --git a/AUTHORS b/AUTHORS index e123d379a9..1b3f4faf28 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,7 +1,7 @@ Maintainer ---------- -OpenStack, LLC. 
-IRC: #openstack on irc.freenode.net +OpenStack Foundation +IRC: #openstack on irc.oftc.net Original Authors ---------------- @@ -13,82 +13,494 @@ Jay Payne (letterj@gmail.com) Will Reese (wreese@gmail.com) Chuck Thier (cthier@gmail.com) +Core Emeritus +------------- +Chmouel Boudjnah (chmouel@enovance.com) +Florian Hines (syn@ronin.io) +Greg Holt (gholt@rackspace.com) +Paul Luse (paul.e.luse@intel.com) +Donagh McCabe (donagh.mccabe@gmail.com) +Hisashi Osanai (osanai.hisashi@gmail.com) +Jay Payne (letterj@gmail.com) +Peter Portante (peter.portante@redhat.com) +Will Reese (wreese@gmail.com) +Chuck Thier (cthier@gmail.com) +Darrell Bishop (darrell@swiftstack.com) +David Goetz (david.goetz@rackspace.com) +Greg Lange (greglange@gmail.com) +Janie Richling (jrichli@us.ibm.com) +Michael Barton (mike@weirdlooking.com) +Mahati Chamarthy (mahati.chamarthy@gmail.com) +Samuel Merritt (smerritt@nvidia.com) +Romain Le Disez (romain.ledisez@ovh.net) +Pete Zaitcev (zaitcev@yahoo.com) + Contributors ------------ -Jesse Andrews (anotherjesse@gmail.com) -Joe Arnold (joe@swiftstack.com) -Ionuț Arțăriși (iartarisi@suse.cz) -Darrell Bishop (darrell@swiftstack.com) -James E. Blair (james.blair@rackspace.com) -Chmouel Boudjnah (chmouel@chmouel.com) -Clark Boylan (clark.boylan@gmail.com) -Russell Bryant (rbryant@redhat.com) -Devin Carlen (devin.carlen@gmail.com) -Thierry Carrez (thierry@openstack.org) -François Charlier (francois.charlier@enovance.com) -Ray Chen (oldsharp@163.com) -Brian Cline (bcline@softlayer.com) -Julien Danjou (julien.danjou@enovance.com) -Dan Dillinger (dan.dillinger@sonian.net) -Tom Fifield (fifieldt@unimelb.edu.au) -Gaurav B. Gangalwar (gaurav@gluster.com) +Aaron Rosen (arosen@nicira.com) +Ade Lee (alee@redhat.com) +Adrian Smith (adrian_f_smith@dell.com) +Adrien Pensart (adrien.pensart@corp.ovh.com) +afariasa (afariasa@redhat.com) +Akihiro Motoki (amotoki@gmail.com) +Akihito Takai (takaiak@nttdata.co.jp) +Alex Gaynor (alex.gaynor@gmail.com) +Alex Holden (alex@alexjonasholden.com) +Alex Pecoraro (alex.pecoraro@emc.com) +Alex Szarka (szarka@inf.u-szeged.hu) +Alex Yang (alex890714@gmail.com) +Alexander Fadeev (fadeevab.com@gmail.com) +Alexandra Settle (asettle@suse.com) +Alexandre Lécuyer (alexandre.lecuyer@corp.ovh.com) +Alfredo Moralejo (amoralej@redhat.com) +Alistair Coles (alistairncoles@gmail.com) +Andreas Jaeger (aj@suse.de) +Andrew Clay Shafer (acs@parvuscaptus.com) +Andrew Hale (andy@wwwdata.eu) +Andrew Welleck (awellec@us.ibm.com) +Andy McCrae (andy.mccrae@gmail.com) +Anh Tran (anhtt@vn.fujitsu.com) +Anish Kachinthaya (anishk23733@gmail.com) +Ankur Gupta (ankur.gupta@intel.com) Anne Gentle (anne@openstack.org) +aolivo (aolivo@blizzard.com) +Arnaud JOST (arnaud.jost@ovh.net) +arzhna (arzhna@gmail.com) +Ashwin Nair (nairashwin952013@gmail.com) +Atsushi Sakai (sakaia@jp.fujitsu.com) +Aymeric Ducroquetz (aymeric.ducroquetz@ovhcloud.com) +Azhagu Selvan SP (tamizhgeek@gmail.com) +Azmain Adib (adib1905@gmail.com) +baiwenteng (baiwenteng@inspur.com) +Ben Keller (bjkeller@us.ibm.com) +Ben Martin (blmartin@us.ibm.com) +bhavani.cr (bhavani.r@nectechnologies.in) +Bill Huber (wbhuber@us.ibm.com) +Bob Ball (bob.ball@citrix.com) +Brent Roskos (broskos@internap.com) +Brian Cline (bcline@softlayer.com) +Brian Curtin (brian.curtin@rackspace.com) +Brian D. Burns (iosctr@gmail.com) +Brian K. 
Jones (bkjones@gmail.com) +Brian Ober (bober@us.ibm.com) +Brian Reitz (brian.reitz@oracle.com) +Bryan Keller (kellerbr@us.ibm.com) +Béla Vancsics (vancsics@inf.u-szeged.hu) +Caleb Tennis (caleb.tennis@gmail.com) +Callum Dickinson (callum.dickinson@catalystcloud.nz) +Cao Xuan Hoang (hoangcx@vn.fujitsu.com) +Carlos Cavanna (ccavanna@ca.ibm.com) +Catherine Northcott (catherine@northcott.nz) +Cedric Dos Santos (cedric.dos.sant@gmail.com) +Changbin Liu (changbin.liu@gmail.com) +ChangBo Guo(gcb) (eric.guo@easystack.cn) +Chaozhe Chen (chaozhe.chen@easystack.cn) +Charles Hsu (charles0126@gmail.com) +chenaidong1 (chen.aidong@zte.com.cn) +cheng (li.chenga@h3c.com) +Cheng Li (shcli@cn.ibm.com) +chengebj5238 (chengebj@inspur.com) +chenxiangui (chenxiangui@inspur.com) +Chetan Mishra (chetan.s115@gmail.com) +Chinemerem Chigbo (chinemeremchigbo@Outlook.com) +Chmouel Boudjnah (chmouel@enovance.com) +Chris Smart (distroguy@gmail.com) +Chris Wedgwood (cw@f00f.org) +Christian Berendt (berendt@b1-systems.de) +Christian Hugo (hugo.christian@web.de) +Christian Ohanaja (christianohanaja431@gmail.com) +Christian Schwede (cschwede@redhat.com) +Christopher Bartz (bartz@dkrz.de) +Christopher MacGown (chris@pistoncloud.com) +Chuck Short (chuck.short@canonical.com) +Clark Boylan (clark.boylan@gmail.com) Clay Gerrard (clay.gerrard@gmail.com) -Mark Gius (launchpad@markgius.com) +Clément Contini (ccontini@cloudops.com) +Colin Nicholson (colin.nicholson@iomart.com) +Colleen Murphy (colleen.murphy@suse.com) +Conrad Weidenkeller (conrad.weidenkeller@rackspace.com) +Constantine Peresypkin (constantine.peresypk@rackspace.com) +Corey Bryant (corey.bryant@canonical.com) +Cory Wright (cory.wright@rackspace.com) +Cristian A Sanchez (cristian.a.sanchez@intel.com) +CY Chiang (cychiang@cht.com.tw) +Cyril Roelandt (cyril@redhat.com) +Daanish Khan (daanish1337@gmail.com) +Dae S. Kim (dae@velatum.com) +Daisuke Morita (morita.daisuke@ntti3.com) +Dan Dillinger (dan.dillinger@sonian.net) +Dan Hersam (dan.hersam@hp.com) +Dan Prince (dprince@redhat.com) +dangming (dangming@unitedstack.com) +Daniele Pizzolli (dpizzolli@fbk.eu) +Daniele Valeriani (daniele@dvaleriani.net) +Darrell Bishop (darrell@swiftstack.com) +Darryl Tam (dtam@swiftstack.com) David Goetz (david.goetz@rackspace.com) -Jonathan Gonzalez V (jonathan.abdiel@gmail.com) -David Hadas (david.hadas@gmail.com) -Soren Hansen (soren@linux2go.dk) -Doug Hellmann (doug.hellmann@dreamhost.com) +David Hadas (davidh@il.ibm.com) +David Liu (david.liu@cn.ibm.com) +David Moreau Simard (dmsimard@iweb.com) +David Rabel (rabel@b1-systems.de) +Dean Troyer (dtroyer@gmail.com) +Denis V. Meltsaykin (dmeltsaykin@mirantis.com) Derek Higgins (derekh@redhat.com) +Devin Carlen (devin.carlen@gmail.com) +Dharmendra Kushwaha (dharmendra.kushwaha@nectechnologies.in) +Dhriti Shikhar (dhrish20@gmail.com) +Dieter Plaetinck (dieter@vimeo.com) +Dirk Mueller (dirk@dmllr.de) +Dmitriy Ukhlov (dukhlov@mirantis.com) +Dmitry Ukov (dukov@mirantis.com) +Dolph Mathews (dolph.mathews@gmail.com) +Donagh McCabe (donagh.mccabe@gmail.com) +dongu (gmj03003@gmail.com) +Doron Chen (cdoron@il.ibm.com) +Doug Hellmann (doug@doughellmann.com) +Doug Weimer (dweimer@gmail.com) +Dr. 
Jens Harbott (harbott@osism.tech) +Dragos Manolescu (dragosm@hp.com) +Drew Balfour (andrew.balfour@oracle.com) +Eamonn O'Toole (eamonn.otoole@hpe.com) +Ed Leafe (ed.leafe@rackspace.com) +Edward Hope-Morley (opentastic@gmail.com) +Ehud Kaldor (ehud@unfairfunction.org) +Ellen Leahy (ellen.mar.leahy@hpe.com) +Elod Illes (elod.illes@est.tech) +Emett Speer (speer.emett@gmail.com) +Emile Snyder (emile.snyder@gmail.com) +Emmanuel Cazenave (contact@emcaz.fr) +Eohyung Lee (liquidnuker@gmail.com) +Eran Rom (eranr@il.ibm.com) +Eugene Kirpichov (ekirpichov@gmail.com) +Ewan Mellor (ewan.mellor@citrix.com) +Fabien Boucher (fabien.boucher@enovance.com) +Falk Reimann (falk.reimann@sap.com) +FatemaKhalid (fatemakhalid96@gmail.com) +Felipe Reyes (freyes@tty.cl) +Ferenc Horváth (hferenc@inf.u-szeged.hu) +Filippo Giunchedi (fgiunchedi@wikimedia.org) +Flavio Percoco (flaper87@gmail.com) +Florent Flament (florent.flament-ext@cloudwatt.com) +Florent Vennetier (florent.vennetier@ovhcloud.com) Florian Hines (syn@ronin.io) +François Charlier (francois.charlier@enovance.com) +Fujita Tomonori (fujita.tomonori@lab.ntt.co.jp) +Félix Cantournet (felix.cantournet@cloudwatt.com) +Gage Hugo (gh159m@att.com) +Ganesh Maharaj Mahalingam (ganesh.mahalingam@intel.com) +gaobin (gaobin@inspur.com) +gaofei (gao.fei@inspur.com) +Gaurav B. Gangalwar (gaurav@gluster.com) +gecong1973 (ge.cong@zte.com.cn) +gengchc2 (geng.changcai2@zte.com.cn) +Gerard Gine (ggine@swiftstack.com) +Gerry Drudy (gerry.drudy@hpe.com) +Ghanshyam Mann (gmann@ghanshyammann.com) +Gil Vernik (gilv@il.ibm.com) +Gilles Biannic (gilles.biannic@corp.ovh.com) +Gleb Samsonov (sams-gleb@yandex.ru) +Gonéri Le Bouder (goneri.lebouder@enovance.com) +Graham Hayes (graham.hayes@hpe.com) +Gregory Haynes (greg@greghaynes.net) +Grzegorz Grasza (xek@redhat.com) +Guang Yee (guang.yee@hpe.com) +guotao (guotao.bj@inspur.com) +Gábor Antal (antal@inf.u-szeged.hu) +Ha Van Tu (tuhv@vn.fujitsu.com) +Hamdi Roumani (roumani@ca.ibm.com) +Hanxi Liu (hanxi.liu@easystack.cn) +Harshada Mangesh Kakad (harshadak@metsi.co.uk) +Harshit Chitalia (harshit@acelio.com) +HCLTech-SSW (hcl_ss_oss@hcl.com) +Hervé Beraud (hberaud@redhat.com) +hgangwx (hgangwx@cn.ibm.com) +Hisashi Osanai (osanai.hisashi@gmail.com) +Hodong Hwang (hodong.hwang@kt.com) +Hou Ming Wang (houming.wang@easystack.cn) +houweichao (houwch@gohighsec.com) +Hu Bing (hubingsh@cn.ibm.com) +Hua Zhang (zhuadl@cn.ibm.com) +Hugo Kuo (tonytkdk@gmail.com) +Ilya Kharin (ikharin@mirantis.com) +Ionuț Arțăriși (iartarisi@suse.cz) Iryoung Jeong (iryoung@gmail.com) -Paul Jimenez (pj@place.org) -Brian K. Jones (bkjones@gmail.com) -Morita Kazutaka (morita.kazutaka@gmail.com) +its-not-a-bug-its-a-feature (david.cole@sohonet.com) +Ivan Anfimov (lazekteam@gmail.com) +Jaivish Kothari (jaivish.kothari@nectechnologies.in) +Jake Yip (jake.yip@ardc.edu.au) +James E. Blair (jeblair@openstack.org) +James Nguyen (ngmqng@gmail.com) +James Page (james.page@ubuntu.com) +Jamie Lennox (jlennox@redhat.com) +Jan Zerebecki (jan.openstack@zerebecki.de) +Janie Richling (jrichli@us.ibm.com) +Jason Johnson (jajohnson@softlayer.com) +Jay S. Bryant (jsbryant@us.ibm.com) +Jens Harbott (j.harbott@x-ion.de) +Jeremy Stanley (fungi@yuggoth.org) +Jesse Andrews (anotherjesse@gmail.com) +Ji-Wei (ji.wei3@zte.com.cn) +Jian Zhang (jian.zhang@intel.com) +Jiangmiao Gao (tolbkni@gmail.com) +Jianjian Huo (jhuo@nvidia.com) +jiaqi07 (wangjiaqi07@inspur.com) +Jing Liuqing (jing.liuqing@99cloud.net) +jinyuanliu (liujinyuan@inspur.com) +Joanna H. 
Huang (joanna.huitzu.huang@gmail.com) +Joe Arnold (joe@swiftstack.com) +Joe Gordon (jogo@cloudscaling.com) +Joe Yang (jyang@swiftstack.com) +Joel Wright (joel.wright@sohonet.com) +John Leach (john@johnleach.co.uk) +Jola Mirecka (jola.mirecka@hp.com) +Jon Snitow (otherjon@swiftstack.com) +Jonathan Gonzalez V (jonathan.abdiel@gmail.com) +Jonathan Hinson (jlhinson@us.ibm.com) Josh Kearney (josh@jk0.org) -Ed Leafe (ed.leafe@rackspace.com) -Tong Li (litong01@us.ibm.com) -Victor Lowther (victor.lowther@gmail.com) -Zhong Yue Luo (lzyeval@gmail.com) -Dragos Manolescu (dragosm@hp.com) Juan J. Martinez (juan@memset.com) +Julien Danjou (julien@danjou.info) +junboli (junbo85.li@gmail.com) +Kai Zhang (zakir.exe@gmail.com) +Kapil Thangavelu (kapil.foss@gmail.com) +karen chan (karen@karen-chan.com) +Kato Tomoyuki (kato.tomoyuki@jp.fujitsu.com) +Kazuhiro Miyahara (miyahara.kazuhiro@lab.ntt.co.jp) +Ke Liang (ke.liang@easystack.cn) +Kenichiro Matsuda (matsuda_kenichi@jp.fujitsu.com) +Keshava Bharadwaj (kb.sankethi@gmail.com) +kim woo seok (rladntjr4@gmail.com) +Kiyoung Jung (kiyoung.jung@kt.com) +Koert van der Veer (koert@cloudvps.com) +Konrad Kügler (swamblumat-eclipsebugs@yahoo.de) +Kota Tsuyuzaki (bloodeagle40234@gmail.com) +Ksenia Demina (kdemina@mirantis.com) +Kuan-Lin Chen (kuanlinchen@synology.com) +Kun Huang (gareth@unitedstack.com) +Larry Rensing (lr699s@att.com) +Leah Klearman (lklrmn@gmail.com) +Li Riqiang (lrqrun@gmail.com) +Liang Jingtao (liang.jingtao@zte.com.cn) +lijunbo (lijunbo@fiberhome.com) +likui (likui@yovole.com) +Lin Yang (lin.a.yang@intel.com) +Lingxian Kong (anlin.kong@gmail.com) +lingyongxu (lyxu@fiberhome.com) +Liu Siqi (meizu647@gmail.com) +liujiong (liujiong@gohighsec.com) +liuyamin (liuyamin@fiberhome.com) +Lokesh S (lokesh.s@hp.com) +Lorcan Browne (lorcan.browne@hpe.com) +Luciano Lo Giudice (luciano.logiudice@canonical.com) +Luis de Bethencourt (luis@debethencourt.com) +Luong Anh Tuan (tuanla@vn.fujitsu.com) +lvxianguo (lvxianguo@inspur.com) +M V P Nitesh (m.nitesh@nectechnologies.in) +Madhuri Kumari (madhuri.rai07@gmail.com) +Mahati Chamarthy (mahati.chamarthy@gmail.com) +Mandell Degerness (mdegerness@nvidia.com) +manuvakery1 (manu.km@idrive.com) +maoshuai (fwsakura@163.com) Marcelo Martins (btorch@gmail.com) -Donagh McCabe (donagh.mccabe@hp.com) -Andy McCrae (andy.mccrae@gmail.com) +Maria Malyarova (savoreux69@gmail.com) +Mark Gius (launchpad@markgius.com) +Mark Seger (mark.seger@hpe.com) +Martin Geisler (martin@geisler.net) +Martin Kletzander (mkletzan@redhat.com) +Maru Newby (mnewby@internap.com) +Masaki Tsukuda (tsukuda.masaki@po.ntts.co.jp) +Mathias Bjoerkqvist (mbj@zurich.ibm.com) +Matt Kassawara (mkassawara@gmail.com) +Matt Riedemann (mriedem@us.ibm.com) +Matthew Oliver (matt@oliver.net.au) +Matthew Vernon (mvernon@wikimedia.org) +Matthieu Huin (mhu@enovance.com) +Mauro Stettler (mauro.stettler@gmail.com) +Mehdi Abaakouk (sileht@redhat.com) +melissaml (ma.lei@99cloud.net) +Michael Matur (michael.matur@gmail.com) +Michael Shuler (mshuler@gmail.com) +Michele Valsecchi (mvalsecc@redhat.com) +Mike Fedosin (mfedosin@mirantis.com) +Mingyu Li (li.mingyu@99cloud.net) +Minwoo Bae (minwoob@us.ibm.com) +Mitsuhiro SHIGEMATSU (shigematsu.mitsuhiro@lab.ntt.co.jp) +mmcardle (mark.mcardle@sohonet.com) +Mohamed Hassaneen (mohammedashoor89@gmail.com) +Mohammed Al-Jawaheri (mjawaheri02@gmail.com) +Mohit Motiani (mohit.motiani@intel.com) +Monty Taylor (mordred@inaugust.com) +Morgan Fainberg (morgan.fainberg@gmail.com) +Morita Kazutaka (morita.kazutaka@gmail.com) +Motonobu Ichimura 
(motonobu@gmail.com) +Nada El-Mestkawy (nadamaged05@gmail.com) +Nadeem Syed (snadeem.hameed@gmail.com) +Nakagawa Masaaki (nakagawamsa@nttdata.co.jp) +Nakul Dahiwade (nakul.dahiwade@intel.com) +Nam Nguyen Hoai (namnh@vn.fujitsu.com) +Nandini Tata (nandini.tata@intel.com) +Naoto Nishizono (nishizono.naoto@po.ntts.co.jp) +Nassim Babaci (nassim.babaci@cloudwatt.com) +Nathan Kinder (nkinder@redhat.com) +nathang15 (nguyennathan1502@gmail.com) +Nelson Almeida (nelsonmarcos@gmail.com) +Newptone (xingchao@unitedstack.com) +ngcjny (noguchi.junya@fujitsu.com) +Ngo Quoc Cuong (cuongnq@vn.fujitsu.com) +Nguyen Hai (nguyentrihai93@gmail.com) +Nguyen Hung Phuong (phuongnh@vn.fujitsu.com) +Nguyen Phuong An (AnNP@vn.fujitsu.com) +Nguyen Quoc Viet (nguyenqviet98@gmail.com) +Nicholas Njihia (nicholas.njihia@canonical.com) +Nicolas Helgeson (nh202b@att.com) +Nicolas Trangez (ikke@nicolast.be) +Ning Zhang (ning@zmanda.com) +Nirmal Thacker (nirmalthacker@gmail.com) +niuke (niuke19970315@163.com) +npraveen35 (npraveen35@gmail.com) +Olga Saprycheva (osapryc@us.ibm.com) +Ondrej Novy (ondrej.novy@firma.seznam.cz) +Or Ozeri (oro@il.ibm.com) +Oshrit Feder (oshritf@il.ibm.com) +Paul Dardeau (paul.dardeau@intel.com) +Paul Jimenez (pj@place.org) +Paul Luse (paul.e.luse@intel.com) Paul McMillan (paul.mcmillan@nebula.com) -Ewan Mellor (ewan.mellor@citrix.com) +Pavel Kvasnička (pavel.kvasnicka@firma.seznam.cz) +Pawel Palucki (pawel.palucki@gmail.com) +Pearl Yajing Tan (pearl.y.tan@seagate.com) +pengyuesheng (pengyuesheng@gohighsec.com) +Peter Lisák (peter.lisak@gmail.com) +Peter Portante (peter.portante@redhat.com) +Petr Kovar (pkovar@redhat.com) +Philippe SERAPHIN (philippe.seraphin@infomaniak.com) +Pradeep Kumar Singh (pradeep.singh@nectechnologies.in) +Prashanth Pai (ppai@redhat.com) +Pádraig Brady (pbrady@redhat.com) +Qiaowei Ren (qiaowei.ren@intel.com) +Rafael Rivero (rafael@cloudscaling.com) +Rainer Toebbicke (Rainer.Toebbicke@cern.ch) +rajat29 (rajat.sharma@nectechnologies.in) +Ray Chen (oldsharp@163.com) +Rebecca Finn (rebeccax.finn@intel.com) +Renich Bon Ćirić (renich@cloudsigma.com) +Ricardo Ferreira (ricardo.sff@gmail.com) +Richard Hawkins (richard.hawkins@rackspace.com) +ricolin (ricolin@ricolky.com) +Robert Francis (robefran@ca.ibm.com) +Robin Naundorf (r.naundorf@fh-muenster.de) +Romain de Joux (romain.de-joux@ovhcloud.com) +Russ Nelson (russ@crynwr.com) +Russell Bryant (rbryant@redhat.com) +Sachin Patil (psachin@redhat.com) +Sam Morrison (sorrison@gmail.com) Samuel Merritt (sam@swiftstack.com) +Sarafraj Singh (Sarafraj.Singh@intel.com) +Sarvesh Ranjan (saranjan@cisco.com) +Sascha Peilicke (saschpe@gmx.de) +Saverio Proto (saverio.proto@switch.ch) +Scott Simpson (sasimpson@gmail.com) +Sean McGinnis (sean.mcginnis@gmail.com) +Sean Mooney (work@seanmooney.info) +SeongSoo Cho (ppiyakk2@printf.kr) +Sergey Kraynev (skraynev@mirantis.com) +Sergey Lukjanov (slukjanov@mirantis.com) +Shane Wang (shane.wang@intel.com) +shangxiaobj (shangxiaobj@inspur.com) +shaofeng_cheng (chengsf@winhong.com) +Shashank Kumar Shankar (shashank.kumar.shankar@intel.com) +Shashirekha Gundur (shashirekha.j.gundur@intel.com) +Shilla Saebi (shilla.saebi@gmail.com) +Shreeya Deshpande (shreeyad@nvidia.com) +Shri Javadekar (shrinand@maginatics.com) +Simeon Gourlin (simeon.gourlin@infomaniak.com) +Sivasathurappan Radhakrishnan (siva.radhakrishnan@intel.com) +Soren Hansen (soren@linux2go.dk) +Stefan Majewsky (stefan.majewsky@sap.com) Stephen Milton (milton@isomedia.com) -Russ Nelson (russ@crynwr.com) -Maru Newby (mnewby@internap.com) -Colin 
Nicholson (colin.nicholson@iomart.com) -Eamonn O'Toole (eamonn.otoole@hp.com) -Constantine Peresypkin (constantine@litestack.com) -Dan Prince (dprince@redhat.com) -Felipe Reyes (freyes@tty.cl) -Li Riqiang (lrqrun@gmail.com) +Steve Kowalik (steven@wedontsleep.org) +Steve Martinelli (stevemar@ca.ibm.com) +Steven Lang (Steven.Lang@hgst.com) +Sushil Kumar (sushil.kumar2@globallogic.com) +Takashi Kajinami (kajinamit@oss.nttdata.com) +Takashi Natsume (takanattie@gmail.com) +TheSriram (sriram@klusterkloud.com) +Thiago da Silva (thiagodasilva@gmail.com) +Thibault Person (thibault.person@ovhcloud.com) +Thierry Carrez (thierry@openstack.org) +Thomas Goirand (thomas@goirand.fr) +Thomas Herve (therve@redhat.com) +Thomas Leaman (thomas.leaman@hp.com) +Tiago Primini (primini@gmail.com) +Tim Burke (tim.burke@gmail.com) +Timothy Okwii (tokwii@cisco.com) +Timur Alperovich (timur.alperovich@gmail.com) +Tin Lam (tinlam@gmail.com) +Tobias Stevenson (tstevenson@vbridges.com) +Tom Fifield (tom@openstack.org) +Tomas Matlocha (tomas.matlocha@firma.seznam.cz) +tone-zhang (tone.zhang@linaro.org) +Tong Li (litong01@us.ibm.com) +Tovin Seven (vinhnt@vn.fujitsu.com) +Tra Bui (trabui.0517@gmail.com) +Travis McPeak (tmcpeak@us.ibm.com) +Tushar Gohad (tushar.gohad@intel.com) +Van Hung Pham (hungpv@vn.fujitsu.com) +venkatamahesh (venkatamaheshkotha@gmail.com) +Venkateswarlu Pallamala (p.venkatesh551@gmail.com) +Victor Lowther (victor.lowther@gmail.com) Victor Rodionov (victor.rodionov@nexenta.com) -Brent Roskos (broskos@internap.com) -Michael Shuler (mshuler@rackspace.com) -Andrew Clay Shafer (acs@parvuscaptus.com) -Scott Simpson (sasimpson@gmail.com) -Adrian Smith (adrian_f_smith@dell.com) -Monty Taylor (mordred@inaugust.com) -Caleb Tennis (caleb.tennis@gmail.com) -Rainer Toebbicke (Rainer.Toebbicke@cern.ch) -Fujita Tomonori (fujita.tomonori@lab.ntt.co.jp) -Kapil Thangavelu (kapil.foss@gmail.com) -Dean Troyer (dtroyer@gmail.com) -Kota Tsuyuzaki (tsuyuzaki.kota@lab.ntt.co.jp) +Victor Stinner (vstinner@redhat.com) +Viktor Varga (vvarga@inf.u-szeged.hu) +Vil Surkin (mail@vills.me) Vincent Untz (vuntz@suse.com) -Daniele Valeriani (daniele@dvaleriani.net) -Chris Wedgwood (cw@f00f.org) -Conrad Weidenkeller (conrad.weidenkeller@rackspace.com) -Doug Weimer (dweimer@gmail.com) -Cory Wright (cory.wright@rackspace.com) +Vitaly Bordyug (vbordug@gmail.com) +Vladimir Vechkanov (vvechkanov@mirantis.com) +Vu Cong Tuan (tuanvc@vn.fujitsu.com) +vxlinux (yan.wei7@zte.com.cn) +Walter Doekes (walter+github@wjd.nu) +wangdequn (wangdequn@inspur.com) +wanghongtaozz (wanghongtaozz@inspur.com) +wanghui (wang_hui@inspur.com) +wangqi (wang.qi@99cloud.net) +Wei LingFei (weilingfei@uniontech.com) +whoami-rajat (rajatdhasmana@gmail.com) +wu.shiming (wushiming@yovole.com) +Wu Wenxiang (wu.wenxiang@99cloud.net) +Wyllys Ingersoll (wyllys.ingersoll@evault.com) +xhancar (pavel.hancar@gmail.com) +XieYingYun (smokony@sina.com) +Yaguang Wang (yaguang.wang@intel.com) +Yan Xiao (yanxiao@nvidia.com) +yanghuichan (yanghc@fiberhome.com) +Yatin Kumbhare (yatinkumbhare@gmail.com) Ye Jia Xu (xyj.asmy@gmail.com) -Alex Yang (alex890714@gmail.com) -Pete Zaitcev (zaitcev@kotori.zaitcev.us) -Ning Zhang (ning@zmanda.com) +Yee (mail.zhang.yee@gmail.com) +Yu Yafei (yu.yafei@zte.com.cn) Yuan Zhou (yuan.zhou@intel.com) +yuhui_inspur (yuhui@inspur.com) +Yummy Bian (yummy.bian@gmail.com) +Yuriy Taraday (yorik.sar@gmail.com) +Yushiro FURUKAWA (y.furukawa_2@jp.fujitsu.com) +Yuxin Wang (wang.yuxin@ostorage.com.cn) +Zack M. 
Davis (zdavis@swiftstack.com) +Zap Chang (zapchang@gmail.com) +zengjia (zengjia@awcloud.com) +Zhang Guoqing (zhang.guoqing@99cloud.net) +Zhang Jinnan (ben.os@99cloud.net) +zhang.lei (zhang.lei@99cloud.net) +zhangboye (zhangboye@inspur.com) +zhangdebo1987 (zhangdebo@inspur.com) +zhangyanxian (zhangyanxianmail@163.com) +Zhao Lei (zhaolei@cn.fujitsu.com) +zhaoleilc (15247232416@163.com) +Zheng Yao (zheng.yao1@zte.com.cn) +zheng yin (yin.zheng@easystack.cn) +Zhenguo Niu (zhenguo@unitedstack.com) +zhengwei6082 (zhengwei6082@fiberhome.com) +ZhijunWei (wzj334965317@outlook.com) +ZhiQiang Fan (aji.zqfan@gmail.com) +ZhongShengping (chdzsp@163.com) +Zhongyue Luo (zhongyue.nah@intel.com) +zhufl (zhu.fanglei@zte.com.cn) +zhulingjie (easyzlj@gmail.com) +翟小君 (zhaixiaojun@gohighsec.com) diff --git a/CHANGELOG b/CHANGELOG index c5aada64c2..1aff48e4e4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,4555 @@ +swift (2.37.0) + + * The s3token middleware now passes service auth tokens to Keystone + if credentials are provided. This is required to enable S3 API + access for Keystone users when using Keystone >25.0.0, !=26.0.0, + !=26.0.1, !=27.0.0, !=28.0.0. See etc/proxy-server.conf-sample for + configuration details. For more information, see + https://security.openstack.org/ossa/OSSA-2025-002.html and + https://bugs.launchpad.net/keystone/+bug/2119646 + + * The s3token middleware now caches credential secrets for one minute + by default, if credentials are provided. Secret-caching typically + reduces the load on Keystone and is required for Keystone users to + be able to use signed aws-chunked transfers. To return to prior + behavior, explicitly set `secret_cache_duration = 0` in the + `[filter:s3api]` section of your proxy-server.conf. + + * Improved checksum validation for S3 API DeleteObjects requests. + + * POST requests are more likely to receive a 503 response in the + face of backend inconsistencies. + + * The KMS keymaster now supports selecting the endpoint returned in the + Keystone catalog via the `barbican_region_name` configuration option. + This may be useful in multi-region deployments which have multiple + endpoints. + + * The request line-length limit is now configurable for all WSGI servers + via the `max_request_line` option in the `[swift-constraints]` section + of swift.conf. By default, continue to use eventlet's default of 8192 + bytes. + + * Writes to sharded containers are less likely to have their updates + sent to the root container. This uses a new cooperative-token + mechanism to limit the number of concurrent shard range queries + to the root container; see the `[app:proxy-server]` section of + etc/proxy-server.conf-sample for configuration options. + + * The following new metrics were added when using labeled metrics: + + * The proxy-logging middleware may now emit real-time transfer metrics. + See the `statsd_emit_buffer_xfer_bytes_seconds` option in + etc/proxy-server.conf-sample for more information. + + * The proxy-logging middleware now includes an `api` label whose value + may be `swift` or `S3` depending on whether the client request is + serviced by the swift API or S3 API. + + * The s3api middleware now emits a counter recording the usage of + various protocol-related headers. + + * The container-sharder now emits a timing metric for the length of + time between shard range creation and cleaving. + + * `swift-manage-shard-ranges` now defaults to committing pending + updates before looking for shard range boundaries. 
A new option, + `--skip-commits`, may be used to restore previous behavior. + + * Added a `--clobber-hardlink-collisions` option to `swift-object-relinker`. + With this option enabled during the relink phase the relinker will + quarantine the colliding file in the new target part dir and retry the + relink. During the cleanup phase it will ignore the un-matched inode + "collision" and allow the cleanup of the old file in the old part dir + similar to tombstones. + + * Fixed the `swift_dir` option for WSGI servers; the file + `/etc/swift/swift.conf` no longer needs to exist when that option + is set. + + * Fixed an object-server error when there is a part-power increase in + progress and there was an issue marking the file in the new partition + space as durable. + + * Device names are now included in sharded database IDs, similar to + regular databases. This provides more context when examining + incoming/outgoing sync tables or sharding CleaveContexts. + + * Database replicators now clean up temporary files older than + `reclaim_age`. + + * Removed fallback support using netifaces; `getifaddrs` is now always + used to determine available IP addresses. + + * Various other minor bug fixes and improvements. + + +swift (2.36.0, OpenStack Flamingo) + + * S3 API + + * Added support for aws-chunked transfers. Recent AWS clients recently + began defaulting to this mode. See also: + https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html + + * Added support for verifying additional checksums during upload. All + algorithms currently supported by AWS are supported: CRC64NVME, + CRC32, CRC32C, SHA1, and SHA256. See also: + https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + Note that some algorithms require the availability of additional + libraries: ISA-L or anycrc. + + * Added support for create-without-overwrite conditional writes. + + * Fixed HTTP framing issues when returning errors for a request with + `Expect: 100-continue`. + + * Improved various error messages to better imitate AWS responses. + + * Let clients request heartbeats during COPYs by including + the query parameter `heartbeat=on`. + + With heartbeating turned on, the proxy will start its response + immediately with 202 Accepted then send a single whitespace + character periodically until the request completes. At that + point, a final summary chunk will be sent which includes a + "Response Status" key indicating success or failure. + + * Labeled metrics + + * Added support for emitting labeled statsd metrics in a variety of + formats. Middleware authors should see the documentation at + https://docs.openstack.org/swift/latest/misc.html#swift.common.statsd_client.LabeledStatsdClient + for more information. + + * Instrumented proxy-logging with labeled metrics. See + `proxy-server.conf-sample` for more information. + + * Instrumented the object-server with labeled metrics. See + `object-server.conf-sample` for more information. + + * Added `access_user_id` logging field; out-of-tree auth middlewares should + use `environ['swift.access_logging']['user_id']` to populate this field. + + * Introduced an extensible ring format. This allows both more than 65,536 + devices and more data structures to be in a ring. For more information, + see https://docs.openstack.org/swift/latest/overview_ring_format.html#ring-v2. + + * SSYNC connections are now promptly terminated when subrequests timeout. + + * Python 3.13 (with the GIL enabled) is now supported. Free-threaded + builds remain untested. 
+ + * Removed support for Python 3.6. Flamingo (2025.2) will be the final + stable release to support Python 3.7 and 3.8. + + * Removed support for pickled ring files. These have not been written + since Swift 1.7.0. + + * Fixed a recursion error in the account-quota middleware. + + * Fixed an error in `invalidate_hash` when the partition is deleted while + waiting for the partition lock. + + * Various other minor bug fixes and improvements. + + +swift (2.35.0, OpenStack Epoxy) + + * Removed the use of `eval` in the xprofile middleware. Note that this + middleware is only intended for development purposes and is not + intended for use in production systems. + + * Account listings now include storage policy information for the + containers listed. + + * The S3 API no longer requires a `Content-MD5` header for `DeleteObjects` + requests when using v4 signatures. + + * Uploads to containers with object versioning enabled now require a + `Content-Length` or `Transfer-Encoding: chunked` header, similar to + other containers. + + * Fixed a server error when deleting a specific version via the S3 API. + + * `X-Open-Expired` now works properly with `?part-number` requests. + + * Fixed `Content-Type` and `Content-Length` headers in account and + container `HEAD` responses (with eventlet>=0.38.0). + + * Object expiration fixes: + + * The configuration options `expiring_objects_container_divisor` and + `expiring_objects_account_name` are now deprecated. + + If a cluster was deployed with a non-standard account name, operators + should remove the option from all configs so they are using a supported + configuration going forward, but will need to deploy stand-alone expirer + processes with legacy expirer config to clean-up old expiration tasks + from the previously configured account name. + + * Added a new configuration option, `round_robin_task_cache_size`, to + adjust the number of tasks to cache before processing. This may be used + to trade faster expirer start-up for more lumpy container-server load. + + * The object-expirer now better handles errors during listings. + + * The object-expirer now respects the `internal_client_conf_path` + configuration option in legacy `object-expirer.conf` configs. + + * WSGI process management improvements: + + * Added a `stale_worker_timeout` configuration option to the WSGI + servers. Once this time elapses following a reload, the manager + process will issue SIGKILLs to any remaining stale workers. + + * Improved the reliability of `swift-reload`. + + * Object updater observability improvements: + + * Added last start time to recon dumps as `object_updater_last`. + + * Added information (including target account/container) from oldest + failed updates to recon dumps, both per-device and aggregated for the + node. Use the new `async_tracker_max_entries` and + `async_tracker_dump_count` options to adjust how many records to + collect. + + * Separated unlinks-due-to-outdated-updates from + unlinks-due-to-fully-processed-updates in logged stats. + + * Added the option to tune down ETag validation in the object-server + during full-object reads. By default, every full read will continue + to have its ETag validated as bytes are streamed to the proxy-server. + The `etag_validate_pct` option may be used to configure approximately + what percentage of full-object reads should be validated; reducing this + can improve performance when object-servers are CPU-constrained. + + Partial reads continue to never have their ETag validated in the + object-server. 
The object-auditor continues to periodically validate + every object's ETag. + + * The `cooperative_period` option for the object-server now affects PUT + requests in a way similar to GET responses. + + * Fixed an issue with the object-reconstructor that would prevent + reconstruction of objects with non-ASCII header names. + + * Fixed an issue with the container-reconciler that could cause reconciler + databases to have conflicting rows that could not be resolved. + + * Removed use of the deprecated cgi module. + + * Various other minor bug fixes and improvements. + + +swift (2.34.0, OpenStack Dalmatian) + + * Middleware features: + + * The static large object (SLO) middleware now supports including + a `?part-number=` query parameter to request just part of a + large object. This may be used to enable efficient parallel + downloads. For more information, see + https://docs.openstack.org/swift/latest/overview_large_objects.html#retrieving-a-large-object + + * The S3 API middleware now supports the `?partNumber=` query + parameter, enabling parallel downloads. + + * The KMS keymaster now supports overriding the endpoint returned + in the Keystone catalog via the `barbican_endpoint` configuration + option. This may be useful in multi-region deployments which have + multiple endpoints. + + * The backend ratelimiter now supports dynamic reloading of limits. + The new configuration options `backend_ratelimit_conf_path` and + `config_reload_interval` control which file is reloaded and how + frequently, respectively. + + * The backend ratelimiter now supports per-method, per-device + ratelimits. See `etc/backend-ratelimit.conf-sample` for more + information. + + * S3 API improvements: + + * Error response reasons are now logged as part of the `log_info` + field. This can be especially useful when diagnosing HEAD + failures, which necessarily have no response body. + + * Fixed a server error when using non-ASCII access key IDs. + + * Fixed several checksum-related error responses to be more AWS-like. + + * Account quota improvements: + + * Overall account byte quotas should now be set with the + `X-Account-Quota-Bytes` header. The `X-Account-Meta-Quota-Bytes` + header is now deprecated. + + * The account quota middleware now supports object-count quotas + in addition to byte-count quotas, similar to the container + quota middleware. For more information, see + https://docs.openstack.org/swift/latest/middleware.html#module-swift.common.middleware.account_quotas + + * Using an `X-Remove-Account-Quota-Bytes-Policy-` header + now removes the per-policy quota, rather than reducing it to zero. + + * Object expiration improvements: + + * Added per-account and per-container reaping delays. These may be + used to offer some grace period in which to recover expired objects. + + * Added a proxy-server configuration option: `allow_open_expired`. + This defaults to false; if true, clients may intereact with expired + objects by including an `X-Open-Expired: true` header in GET, HEAD, + or POST requests. + + * Expiring object queue entries now include the size of the object to + be expired in the `swift_expirer_bytes` parameter of the queue entry's + content-type. + + * Added metrics to count skipped, delayed, and assigned tasks as + they're enumerated. + + * Proxy-server fixes: + + * Object POSTs now 503 rather than trusting the 404s that might be + returned from handoffs when primaries are overloaded. + + * Client disconnects should always be logged within the context of + the appropriate client request. 
Previously, there were some cases + where logging would occur during general garbage collection, leading + to incorrect or missing transaction IDs in logs. + + * The proxy-logging middleware now emits timing and transfer stats for + more requests such as auth requests. These will be labeled `UNKNOWN` + rather than `account`, `container`, etc. + + * Fixed a server error when the Swift request path has missing account + or container components. + + * Python 3.12 is now supported. + + * `EUCLEAN` errors are better handled on the object server. + + * `swift-account-info` now supports the `--sync` option to display the + contents of the incoming and outgoing sync tables, similar to + `swift-container-info`. + + * The `swift-drive-audit` tool now works with ISO timestamps in kernel + logs. + + * The `swift-recon-cron` tool now better handles missing directories. + + * Dependency update: lxml must be at least 4.2.3. + + * Various other minor bug fixes and improvements. + + +swift (2.33.0, OpenStack Caracal) + + * S3 API fixes: + + * When the `+segments` container's storage policy differs from that of + the primary container, completed manifests are now written with the + correct policy in the primary container. + + * Add basic read support for object locking. This improves + compatibility with an Ansible S3 module. Write support is not + yet implemented, so get-object-lock-configuration will always 404. + + * If there's a conflict deleting the in-progress-upload marker when + completing a multipart-upload, a 503 is now returned to the client, + prompting it to retry. + + * Added `Accept-Ranges: bytes` to object responses. Range requests + have always been supported; now, that support is properly advertised. + + * Static large object fixes: + + * Fixed a server error when handling conditional GET requests. + + * Return an error if the SLO manifest could not be parsed. Previously, + a zero-byte response was returned. + + * Proxy server fixes: + + * Added a new `swift.proxy_logging_status` request environment key that + middlewares may use to override the logged status for a request. + + * Transaction IDs are included in more error responses. + + * Added a counter metric when caching shard ranges. + + * The `recoverable_node_timeout` option no longer applies to + `X-Newest` GET requests. + + * Improved error-handling in multi-part range responses. + + * Sharding fixes: + + * Prevent resets of a shard range's epoch. + + * Cleaned up `X-Backend-*` headers in listing responses. + + * Reduced the frequency of `Reclaimable db stuck waiting for shrinking` + messages when a root DB has been deleted but its shards have not been + shrunk away. + + * The more-efficient shard range structure from the last release is now + used when fetching ranges from the backend. + + * Include more information in shard-replication warnings. + + * Object server fixes: + + * Object POSTs and chunked PUTs are no longer accepted when the target + drive is already past its `fallocate_reserve`. DELETEs are still + allowed. + + * Added the ability to configure cooperative yielding when servicing + GET responses, via the `cooperative_period` option. See the example + config for more information. + + * Invalid `hashes.invalid` entries are now ignored, rather than + causing a complete partition rehash. + + * Prefix-based tempurls may now be used to explore staticweb + listings within that prefix. Note that this opens a new ability + to list containers from tempurls, but only if staticweb's + `X-Container-Meta-Web-Listings` is enabled. 
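+
+    A minimal sketch of the prefix-based tempurl usage described above,
+    assuming a temp-url key has already been set on the container and
+    `X-Container-Meta-Web-Listings: true` is enabled; the key, paths, and
+    parameter values are placeholders (see the tempurl documentation for
+    the exact signing rules):
+
+        import hmac
+        import time
+        from hashlib import sha256
+
+        key = b'container-temp-url-key'   # placeholder secret
+        expires = int(time.time()) + 3600
+        prefix_path = '/v1/AUTH_test/web-container/photos/'
+
+        hmac_body = 'GET\n%d\nprefix:%s' % (expires, prefix_path)
+        sig = hmac.new(key, hmac_body.encode('utf-8'), sha256).hexdigest()
+
+        # Fetching this URL returns the staticweb listing scoped to photos/
+        url = ('%s?temp_url_sig=%s&temp_url_expires=%d&temp_url_prefix=photos/'
+               % (prefix_path, sig, expires))
+        print(url)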
+ + * When generating index pages from listings, staticweb now sends an HTML5 + doctype. This makes them https://peps.python.org/pep-0503/ compliant, + allowing their continued use for simple Python package repositories. + + * Per-service `auto_create_account_prefix` configuration options have + been removed. These options were deprecated in favor of `swift.conf` + configuration in Swift 2.24.0, part of the OpenStack Ussuri release. + + * Added a `swift-reload` command to assist with safely reloading WSGI + servers. + + * Daemons now send `STOPPING` and `RELOADING` systemd notifications + when the service is configured with `Type=notify`. + + * Added more metrics to the container-server, allowing GET and PUT timings + to be broken out for listings, shard range operations, and container + creation. + + * Daemons send object updates via the replication network in more cases. + + * `swift-account-info` and `swift-container-info` now accept a `--sync` + flag to show information from the incoming/outgoing sync tables. + + * Several fixes to prepare for Python 3.12 support. While not yet tested + in the gate, initial manual testing looks promising. + + * Added support for recent versions of eventlet. + + * The dark-data object audit watcher now works with sharded containers. + Previously, it would think that all data files were absent from + listings. + + * Various other minor bug fixes and improvements. + + +swift (2.32.0, OpenStack Bobcat) + + * Python 3 fixes: + + * Python 3 object servers can now read unencrypted non-ASCII metadata + that was written under Python 2. + + * Ssync no longer corrupts unencrypted non-ASCII metadata during + transfers. + + * Fixed an encoding issue when writing non-ASCII object names to sharded + containers and shard range caching is not enabled. + + * Fixed an encoding issue when handling non-ASCII account names. + + * Fixed a `generator already executing` error on client disconnect. + + * Suppressed `RemoteDisconnected` tracebacks. + + * Metrics improvements: + + * Metrics are now emitted for a variety of S3 error responses, in the + form `s3api..[.]` + + * Fixed an issue that prevented proxy servers from emitting metrics and + logs for backend requests made when getting account or container info. + + * Account and container info metrics now include the response status code + when backend requests are made. + + * Added timing metrics to the container sharder for various operations. + + * Python 3.11 is now supported. + + * Added the ability for reseller admins to set per-policy account quotas by + posting metadata of the form `X-Account-Quota-Bytes-Policy-`. + + * Added a `keepalive_timeout` option to the proxy server to limit how long + to wait for a client to initiate a request, separate from the general + `client_timeout` option. Note that this requires eventlet 0.34.0 + (currently unreleased) or later. + + * Added a `keep_cache_slo_manifest` option to the object server to better + control whether SLO manifests are dropped from the page cache. + + * WSGI servers now accept a `--test-config` option that may be used to + validate configuration changes before reloading/restarting the server. + + * The structure of cached shard ranges has changed, improving performance + when listing or writing to sharded containers. Note that immediately + after upgrade, the new structures will all be cache misses, which may + lead to a thundering herd problem. 
To avoid this, upgrade just a few + nodes first, let them service some fraction of traffic to populate the + cache, then upgrade the rest of the cluster. + + * A variety of performance improvements have been made for sharded + container databases. + + * Various logging and metrics improvements when talking to memcache. + + * Fixed ssync's handling of timestamp offsets. Previously, this could cause + ssync to fail with a 409 Conflict, causing the transfer to fail and + preventing handoffs from clearing. + + * Fixed an issue where an erasure-coded PUT could prevent other requests + from being processed when network calls rarely or never blocked. + + * Fixed an issue when downloading an SLO manifest would hit a recoverable + error and attempt to resume from another node. This would manifest as + either a pyeclib decode error or an unexpected empty response. + + * The proxy server now applies error-limiting to the correct node when + handling a recoverable node error. + + * Account, container, and object log fields are now correctly identified + when returning `BadDigest` responses to S3 requests. + + * Previously, under some circumstances, a non-standard config option such + as `RECLAIM_AGE` might get parsed as `reclaim_age` for some processes + but ignored by others. Now, all config parsing is case-sensitive; + non-standard names will always be ignored. + + * Reduced the backend load of making `?versions` requests to a container + that has never had object versioning enabled. + + * Improved formatting of meta and sysmeta for `swift-account-info` and + `swift-container-info`. + + * The standard-library `logging` module is no longer monkey-patched when + importing `swift.common.utils`, making it easier to re-use swift code + in other contexts. + + * Removed the hard dependency on netifaces; it may still be used if the + `getifaddrs` C function is not available. This fallback support may be + removed in a future release. + + * Various other minor bug fixes and improvements. + + +swift (2.31.1, OpenStack Antelope) + + * Sharding fixes + + * Shards no longer report stats to the root database when they are in + the CREATED state. + + * Sharding metadata is no longer cleared when databases are deleted. + This could previously cause deleted shards that still had rows to + become stuck and never move them to the correct database. + + * Fixed a performance regression in the handling of misplaced objects. + + * Swift path and on-disk path are now included with all sharder logging. + + * `s3token` no longer mangles request paths that include the Access Key ID. + + * User metadata is now exposed via CORS when encryption is enabled, + matching the behavior when encryption is not enabled. + + * Fewer backend requests are now required when account or container + information is missing from memcache. + + * Fixed logging of IP and port in the proxy-server; in particular, + internal clients now correctly log about the replication IP/port. + + * Fixed a bug in the object replicator that would cause an under-reporting + of failures. + + * Various other minor bug fixes. + + +swift (2.31.0) + + * S3 API improvements + + * Fixed a security issue in how `s3api` handles XML parsing that allowed + authenticated S3 clients to read arbitrary files from proxy servers. + Refer to CVE-2022-47950 for more information. + + * Fixed a server error when handling malformed CompleteMultipartUpload + requests. + + * Improved error reporting when attempting to set invalid `X-Delete-At` + or `X-Delete-After` values via the S3 API. 
+ + * Sharding improvements + + * Sync more shard ranges from the root database to the shards. This + helps ensure shard range repairs effected at the root make their way + to shards that would otherwise be stuck trying to further divide + into sub-shards. + + * Added a `merge` subcommand to `swift-manage-shard-ranges` to merge + arbitrary shard ranges into a container DB. Minimal safety checks + are performed; it should only be used for emergency shard range + manipulation by expert users. + + * Improved performance of `delimiter` listings for sharded containers. + + * Added more safety checks to the `repair` subcommand of + `swift-manage-shard-ranges`. + + * Better handle `EOFError` and `KeyboardInterrupt` when prompting for + input in `swift-manage-shard-ranges`. + + * Warnings are now emitted when sharding appears to have become stuck. + Use the new `container_sharding_timeout` option to configure the + "stuck" threshold; the default is 48 hours. + + * Stop warning about transient overlaps when auditing shard ranges. + + * Metrics improvements + + * Added timing stats for memcached operations. + + * Renamed and improved the granularity of shard range cache and + backend stats. Metrics dashboards may need to be updated. + + * Emit stats when backend nodes are error-limited. + + * Added support for Python 3.10. + + * Added an optional `backend_ratelimit` middleware for backend servers. + See the backend server sample configuration files for more information. + + * Added the ability to configure a chance to skip checking memcache when + querying account and container information. This allows some fraction + of traffic to go to disk and refresh memcache before the key ages out. + Recommended values for the new `account_existence_skip_cache_pct` and + `container_existence_skip_cache_pct` options are in the range of + 0.0 to 0.01. + + * Static large object segments may now be deleted asynchronously by + default. Operators may return to the old behavior by disabling the + `allow_async_delete` option in the `[filter:slo]` section + in their proxy-server.conf. + + * Absolute-form request targets are now accepted. This enables access for + certain clients and SDKs (including some older versions of rclone that + were using an old version of aws-sdk-go). + + * Fixed a path-rewriting bug introduced in Python 3.7.14, 3.8.14, 3.9.14, + and 3.10.6 that could cause some `domain_remap` requests to be routed to + the wrong object. + + * Fixed a server error when attempting to access data in a deleted + container that had an erasure-coded storage policy. + + * Improved error messages to clients that encounter errors using the + `formpost` middleware. + + * Removed some inappropriate error-suppression when locking account and + container databases. + + * Improved server start-up time when using multiple workers. + + * Removed some unnecessary locking when logging. + + * Added some basic object-metadata validation; invalid diskfiles will be + quarantined via the auditor or reconstructor. + + * Enhanced logging when error-limiting a backend node. + + * Various other minor bug fixes and improvements. + + +swift (2.30.1, zed stable backports) + + * Fixed a security issue in how `s3api` handles XML parsing that allowed + authenticated S3 clients to read arbitrary files from proxy servers. + Refer to CVE-2022-47950 for more information. + + * Fixed a path-rewriting bug introduced in Python 3.7.14, 3.8.14, 3.9.14, + and 3.10.6 that could cause some `domain_remap` requests to be routed to + the wrong object. 
+ + +swift (2.30.0, OpenStack Zed) + + * Sharding improvements + + * The `swift-manage-shard-ranges` tool has a new mode to repair gaps + in the namespace. + + * Misplaced tombstone records are now properly cleaved. + + * Fixed a bug where the sharder could fail to find a device to use for + cleaving. + + * Databases marked deleted are now processed by the sharder. + + * More information is now synced to the fresh database when sharding. + Previously, a database could lose the fact that it had been marked + as deleted. + + * Shard ranges with no rows to cleave could previously be left in the + CREATED state after cleaving. Now, they are advanced to CLEAVED. + + * Metrics are now emitted for whether databases used for cleaving + were created or already existed, allowing a better understanding + of the reason for handoffs in the cluster. + + * Misplaced-record stats are now also emitted to statsd. Previously, + these were only available in logs. + + * S3 API improvements + + * Constant-time string comparisons are now used when checking signatures. + + * Fixed cross-policy object copies. Previously, copied data would + always be written using the source container's policy. Now, the + destination container's policy will be used, avoiding availability + issues and unnecessary container-reconciler work. + + * More headers are now copied from multi-part upload markers to their + completed objects, including `Content-Encoding`. + + * When running with `s3_acl` disabled, `bucket-owner-full-control` and + `bucket-owner-read` canned ACLs will be translated to the same Swift + ACLs as `private`. + + * The S3 ACL and Delete Multiple APIs are now less case-sensitive. + + * Improved the error message when deleting a bucket that's ever had + versioning enabled and still has versions in it. + + * `LastModified` timestamps in listings are now rounded up to whole + seconds, like they are in responses from AWS. + + * Proxy logging for Complete Multipart Upload requests is now more + consistent when requests have been retried. + + * Logging improvements + + * Signal handling is more consistently logged at notice level. + Previously, signal handling would sometimes be logged at info + or error levels. + + * The message template for proxy logging may now include a + `{domain}` field for the client-provided `Host` header. + + * The object-replicator now logs successful rsync transfers at debug + instead of info. + + * Added a `log_rsync_transfers` option to the object-replicator. + Set it to false to disable logging rsync "send" lines; during + large rebalances, such logging can overwhelm log aggregation + while providing little useful information. + + * Transaction IDs are now only included in daemon log lines + in a request/response context. + + * Fixed a socket leak when clients try to delete a non-SLO as though + it were a Static Large Object. + + * The formpost digest algorithm is now configurable via the new + `allowed_digests` option, and support is added for both SHA-256 + and SHA-512. Supported formpost digests are exposed to clients in + `/info`. Additionally, formpost signatures can now be base64 encoded. + + * Added metrics to the formpost and tempurl middlewares to monitor + digest usage in signatures. + + * SHA-1 signatures are now deprecated for the formpost and tempurl + middlewares. At some point in the future, SHA-1 will no longer be + enabled by default; eventually, support for it will be removed + entirely. + + * Improved compatibility with certain FIPS-mode-enabled systems. 
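+
+    A hedged configuration sketch for the formpost `allowed_digests` option
+    noted above; the value list is illustrative only, and the supported
+    names and defaults are those documented in proxy-server.conf-sample:
+
+        [filter:formpost]
+        use = egg:swift#formpost
+        # e.g. drop the deprecated SHA-1 and accept only stronger digests
+        allowed_digests = sha256 sha512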
+ + * Added a `ring_ip` option for various object services. This may be + used to find own devices in the ring in a containerized environment + where the `bind_ip` may not appear in the ring at all. + + * Account and container replicators can now be configured with a + `handoff_delete` option, similar to object replicators and + reconstructors. See the sample config for more information. + + * Developers using Swift's memcache client may now opt in to having + a `MemcacheConnectionError` be raised when no connection succeeded + using a new `raise_on_error` keyword argument to `get`/`set`. + + * The tempurl middleware has been updated to return a 503 if storing a + token in memcache fails. Third party authentication middlewares are + encouraged to also use the new `raise_on_error` keyword argument + when storing ephemeral tokens in memcache. + + * Pickle support has been removed from Swift's memcache client. Support + had been deprecated since Swift 1.7.0. + + * Device names are now included in new database IDs. This provides more + context when examining incoming/outgoing sync tables or sharding + CleaveContexts. + + * Database replication connections are now closed following an error + or timeout. This prevents a traceback in some cases when the replicator + tries to reuse the connection. + + * `ENOENT` and `ENODATA` errors are better handled in the object + replicator and auditor. + + * Improved object update throughput by shifting some shard range + filtering from Python to SQL. + + * Include `Vary: Origin` header when CORS responses vary by origin. + + * The staticweb middleware now allows empty listings at the root of + a container. Previously, this would result in a 404 response. + + * Ring builder output tables better display weights over 1000. + + * Various other minor bug fixes and improvements. + + +swift (2.29.2, yoga stable backports) + + * Fixed a security issue in how `s3api` handles XML parsing that allowed + authenticated S3 clients to read arbitrary files from proxy servers. + Refer to CVE-2022-47950 for more information. + + * Constant-time string comparisons are now used when checking S3 API + signatures. + + * Fixed a path-rewriting bug introduced in Python 3.7.14, 3.8.14, 3.9.14, + and 3.10.6 that could cause some `domain_remap` requests to be routed to + the wrong object. + + * Improved compatibility with certain FIPS-mode-enabled systems. + + +swift (2.29.1, OpenStack Yoga) + + * This is the final stable branch that will support Python 2.7. + + * Fixed s3v4 signature calculation when the client sends an un-encoded + path in the request. + + * Fixed multiple issues in s3api involving Multipart Uploads with + non-ASCII names. + + * The object-updater now defers rate-limited updates to the end of its + cycle; these deferred updates will be processed (at the limited rate) + until the configured `interval` elapses. A new `max_deferred_updates` + option may be used to bound the deferral queue. + + * Empty account and container partition directories are now cleaned up + immediately after replication, rather than needing to wait for an + additional replication cycle. + + * The object-expirer now only cleans up empty containers. Previously, it + would attempt to delete all processed containers, regardless of whether + there were entries which were skipped or had errors. + + * A new `item_size_warning_threshold` option may be used to monitor for + values that are approaching the limit of what can be stored in memcache. + See the memcache sample config for more information. 
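+
+    As a hedged illustration of the option described in the previous bullet
+    (the threshold value is arbitrary; see memcache.conf-sample for the
+    authoritative default and units):
+
+        [memcache]
+        # warn when cached values get close to memcached's item size limit
+        item_size_warning_threshold = 1000000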
+ + * Internal clients now correctly use their configured User-Agent in + backend requests, rather than only using it for logging. + + * Various other minor bug fixes and improvements. + + +swift (2.29.0) + + * S3 API improvements + + * CORS preflights are now allowed for pre-signed URLs. + + * The `storage_domain` option now accepts a comma-separated list of + storage domains. This allows multiple storage domains to configured + for use with virtual-host style addressing. + + * Fixed the types of configured values in /info response. + + * Fixed a server error when trying to copy objects with non-ASCII names. + + * Fixed a server error when uploading objects with very long names. + A KeyTooLongError is now returned. + + * Fixed an error when multi-deleting MPUs when SLO async-deletes + are enabled. + + * Fixed an error that allowed list-uploads and list-parts requests to + return incomplete or out-of-order results. + + * Fixed several bugs when dealing with non-ASCII object names and + multipart uploads. + + * Reduced the overhead of retrieving bucket and object ACLs. + + * Replication, reconstruction, and diskfile improvements + + * The reconstructor now uses the replication network to fetch fragments + for reconstruction. + + * Added the ability to limit how many objects per handoff partition + will be reverted in a reconstructor cycle using the new + `max_objects_per_revert` option. This may be useful to reduce + ssync timeouts and lock contention, ensuring that progress is made + during rebalances. + + * Ensure that non-durable data and .meta files are purged from handoffs + after syncing. + + * Fixed tracebacks when there's a race to mark a file durable or delete it. + + * Improved cooperative multitasking during ssync. + + * Upon detecting a ring change, the reconstructor now only aborts the + jobs for that ring and continues processing jobs for other rings. + + * Fixed a traceback when logging about a lock timeout in the replicator. + + * Object updater improvements + + * Added the ability to ratelimit updates (approximately) per-container + using the new `max_objects_per_container_per_second` option. This may + be used to limit requests to already-overloaded containers while still + making progress on updates to other containers. + + * Added timing stats by response code. + + * Updates are now sent over the replication network. + + * Fixed a race condition where swift would attempt to quarantine + recently-deleted updates. + + * Memcache improvements + + * Added the ability to configure a chance to skip checking memcache when + querying shard ranges. This allows some fraction of traffic to go to + disk and refresh memcache before the key ages out. Recommended values + for the new `container_updating_shard_ranges_skip_cache_pct` and + `container_listing_shard_ranges_skip_cache_pct` options are in the + range of 0.0 to 0.1. + + * Added stats for shard range cache hits, misses, and skips. + + * Improved handling of timeouts and other errors when obtaining a + connection to memcached. + + * Recon improvements + + * Added object-reconstructor stats to recon. + + * Each object-server IP is now queried only once when reporting disk + usage. Previously, each port in the ring would be queried; when using + servers-per-port, this could dramatically overstate the disk capacity + in the cluster. + + * Fixed a security issue where tempurl and s3api signatures were logged in + full. 
This allowed an attacker with access to log data to perform replay + attacks, potentially accessing or overwriting cluster data. Now, such + signatures are redacted in a manner similar to auth tokens; see the + `reveal_sensitive_prefix` option in `proxy-server.conf`. + + See CVE-2017-8761 for more information. + + * Added a new `swift.common.registry` module. This includes helper + functions `register_sensitive_header` and `register_sensitive_param` + which third party middleware authors may use to flag headers and query + parameters for redaction when logging. For more information, see + https://docs.openstack.org/swift/latest/misc.html#module-swift.common.registry + + * Added the ability to configure project-scope read-only roles for + keystoneauth using the new `project_reader_roles` option. + + * The cname_lookup middleware now works with dnspython 2.0 and later. + + * The internal clients used by the container-reconciler, container-sharder, + container-sync, and object-expirer daemons now use a more-descriptive + `-ic` log name, rather than `swift`. If you previously + configured the `log_name` option in `internal-client.conf`, you must + now use the `set log_name = ` syntax to configure it, even if + no value is set in the `[DEFAULT]` section. This may be done prior to + upgrading. + + * Fixed a bug that allowed some statsd metrics to be annotated with the + wrong backend layer. + + * The `StatsdClient.set_prefix` method is now deprecated and + may be removed in a future release; by extension, so is the + `LogAdapter.set_statsd_prefix` method. Middleware developers should + use the `statsd_tail_prefix` argument to `get_logger` instead. + + * Fixed a traceback in the account-server when there's no account + database on disk to receive a container update. The account-server + now correctly 404s. + + * The container-updater will quarantine container databases if all + replicas for the account respond 404. + + * Fixed a proxy-server error when the read-only middleware tried to + handle non-Swift paths (such as may be used by third-party middleware). + + * Some client behaviors that the proxy previously logged at warning have + been lowered to info. + + * Removed translations from most logging. + + * Various other minor bug fixes and improvements. + + +swift (2.28.1, xena stable backports) + + * Fixed a security issue in how `s3api` handles XML parsing that allowed + authenticated S3 clients to read arbitrary files from proxy servers. + Refer to CVE-2022-47950 for more information. + + * Constant-time string comparisons are now used when checking S3 API + signatures. + + * Fixed a path-rewriting bug introduced in Python 3.7.14, 3.8.14, 3.9.14, + and 3.10.6 that could cause some `domain_remap` requests to be routed to + the wrong object. + + * Improved compatibility with certain FIPS-mode-enabled systems. + + * Ensure that non-durable data and .meta files are purged from handoffs + after syncing. + + +swift (2.28.0, OpenStack Xena) + + * Sharding improvements: + + * When building a listing from shards, any failure to retrieve + listings will result in a 503 response. Previously, failures + fetching a particular shard would result in a gap in listings. + + * Container-server logs now include the shard path in the referer + field when receiving stat updates. + + * Added a new config option, `rows_per_shard`, to specify how many + objects should be in each shard when scanning for ranges. The default + is `shard_container_threshold / 2`, preserving existing behavior.
+ + * Added a new config option, `minimum_shard_size`. When scanning + for shard ranges, if the final shard would otherwise contain + fewer than this many objects, the previous shard will instead + be expanded to the end of the namespace (and so may contain up + to `rows_per_shard + minimum_shard_size` objects). This reduces + the number of small shards generated. The default value is + `rows_per_shard / 5`. + + * Added a new config option, `shrink_threshold`, to specify the + absolute size below which a shard will be considered for shrinking. + This overrides the `shard_shrink_point` configuration option, which + expressed this as a percentage of `shard_container_threshold`. + `shard_shrink_point` is now deprecated. + + * Similar to above, `expansion_limit` was added as an absolute-size + replacement for the now-deprecated `shard_shrink_merge_point` + configuration option. + + * The sharder now correctly identifies and fails audits for shard + ranges that overlap exactly. + + * The sharder and swift-manage-shard-ranges now consider total row + count (instead of just object count) when deciding whether a shard + is a candidate for shrinking. + + * If the sharder encounters shard range gaps while cleaving, it will + now log an error and halt sharding progress. Previously, rows may + not have been moved properly, leading to data loss. + + * Sharding cycle time and last-completion time are now available via + swift-recon. + + * Fixed an issue where resolving overlapping shard ranges via shrinking + could prematurely mark created or cleaved shards as active. + + * `swift-manage-shard-ranges` improvements: + + * Exit codes are now applied more consistently: + + - 0 for success + - 1 for an unexpected outcome + - 2 for invalid options + - 3 for user exit + + As a result, some errors that previously resulted in exit code 2 + will now exit with code 1. + + * Added a new 'repair' command to automatically identify and + optionally resolve overlapping shard ranges. + + * Added a new 'analyze' command to automatically identify overlapping + shard ranges and recommend a resolution based on a JSON listing + of shard ranges such as produced by the 'show' command. + + * Added a `--includes` option for the 'show' command to only output + shard ranges that may include a given object name. + + * Added a `--dry-run` option for the 'compact' command. + + * The 'compact' command now outputs the total number of compactible + sequences. + + * S3 API improvements: + + * Added an option, `ratelimit_as_client_error`, to return 429s for + rate-limited responses. Several clients/SDKs have seem to support + retries with backoffs on 429, and having it as a client error + cleans up logging and metrics. By default, Swift will respond 503, + matching AWS documentation. + + * Fixed a server error in bucket listings when `s3_acl` is enabled + and staticweb is configured for the container. + + * Fixed a server error when a client exceeds `client_timeout` during an + upload. Now, a `RequestTimeout` error is correctly returned. + + * Fixed a server error when downloading multipart uploads/static large + objects that have missing or inaccessible segments. This is a state + that cannot arise in AWS, so a new `BrokenMPU` error is returned, + indicating that retrying the request is unlikely to succeed. + + * Fixed several issues with the prefix, marker, and delimiter + parameters that would be mirrored back to clients when listing + buckets. 
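+
+    A hedged configuration sketch for the `ratelimit_as_client_error`
+    option mentioned at the top of this S3 API list; the section layout is
+    illustrative and follows proxy-server.conf-sample:
+
+        [filter:s3api]
+        use = egg:swift#s3api
+        # default (false) keeps the AWS-documented 503; set true for
+        # clients/SDKs that retry with backoff on 429
+        ratelimit_as_client_error = true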
+ + * Partition power increase improvements: + + * The relinker now spawns multiple subprocesses to process disks + in parallel. By default, one worker is spawned per disk; use the + new `--workers` option to control how many subprocesses are used. + Use `--workers=0` to maintain the previous behavior. + + * The relinker now performs eventlet-hub selection the same way as + other daemons. In particular, `epolls` will no longer be selected, + as it seemed to cause occasional hangs. + + * The relinker can now target specific storage policies or + partitions by using the new `--policy` and `--partition` + options. + + * Partitions that encountered errors during relinking are no longer + marked as completed in the relinker state file. This ensures that + a subsequent relink will retry the failed partitions. + + * Partition cleanup is more robust, decreasing the likelihood of + leaving behind mostly-empty partitions from the old partition + power. + + * Improved relinker progress logging, and started collecting + progress information for swift-recon. + + * Cleanup is more robust to files and directories being deleted by + another process. + + * The relinker better handles data found from earlier partition power + increases. + + * The relinker better handles tombstones found for the same object + but with different inodes. + + * The reconciler now defers working on policies that have a partition + power increase in progress to avoid issues with concurrent writes. + + * Erasure coding fixes: + + * Added the ability to quarantine EC fragments that have no (or few) + other fragments in the cluster. A new configuration option, + `quarantine_threshold`, in the reconstructor controls the point at + which the fragment will be quarantined; the default (0) will never + quarantine. Only fragments older than `quarantine_age` (default: + `reclaim_age`) may be quarantined. Before quarantining, the + reconstructor will attempt to fetch fragments from handoff nodes + in addition to the usual primary nodes; a new `request_node_count` + option (default `2 * replicas`) limits the total number of nodes to + contact. + + * Added a delay before deleting non-durable data. A new configuration + option, `commit_window` in the `[DEFAULT]` section of + object-server.conf, adjusts this delay; the default is 60 seconds. This + improves the durability of both back-dated PUTs (from the reconciler or + container-sync, for example) and fresh writes to handoffs by preventing + the reconstructor from deleting data that the object-server was still + writing. + + * Improved proxy-server and object-reconstructor logging when data + cannot be reconstructed. + + * Fixed an issue where some but not all fragments having metadata + applied could prevent reconstruction of missing fragments. + + * Server-side copying of erasure-coded data to a replicated policy no + longer copies EC sysmeta. The previous behavior had no material + effect, but could confuse operators examining data on disk. + + * Python 3 fixes: + + * Fixed a server error when performing a PUT authorized via + tempurl with some proxy pipelines. + + * Fixed a server error during GET of a symlink with some proxy + pipelines. + + * Fixed an issue with logging setup when /dev/log doesn't exist + or is not a UNIX socket. + + * The container-reconciler now scales out better with new `processes`, + `process`, and `concurrency` options, similar to the object-expirer. + + * The dark-data audit watcher now skips objects younger than a new + configurable `grace_age` period.
This avoids issues where data + could be flagged, quarantined, or deleted because of listing + consistency issues. The default is one week. + + * The dark-data audit watcher now requires that all primary locations + for an object's container agree that the data does not appear in + listings to consider data "dark". Previously, a network partition + that left an object node isolated could cause it to quarantine or + delete all of its data. + + * More daemons now support systemd notify sockets. + + * `EPIPE` errors no longer log tracebacks. + + * The account and container auditors now log and update recon before + going to sleep. + + * The object-expirer logs fewer client disconnects. + + * `swift-recon-cron` now includes the last time it was run in the recon + information. + + * `EIO` errors during read now cause object diskfiles to be quarantined. + + * The formpost middleware now properly supports uploading multiple files + with different content-types. + + * Various other minor bug fixes and improvements. + + +swift (2.27.0, OpenStack Wallaby) + + * Added "audit watcher" hooks to allow operators to run arbitrary code + against every diskfile in a cluster. For more information, see + https://docs.openstack.org/swift/latest/development_watchers.html + + * Added support for system-scoped "reader" roles when authenticating using + Keystone. Operators may configure this using the `system_reader_roles` + option in the `[filter:keystoneauth]` section of their proxy-server.conf. + + A comparable group, `.reseller_reader`, is now available for development + purposes when authenticating using tempauth. + + * Allow static large object segments to be deleted asynchronously. + Operators may opt into this new behavior by enabling the new + `allow_async_delete` option in the `[filter:slo]` section + in their proxy-server.conf. For more information, see + https://docs.openstack.org/swift/latest/overview_large_objects.html#deleting-a-large-object + + * Added the ability to connect to memcached over TLS. See the + `tls_*` options in etc/memcache.conf-sample + + * The proxy-server now caches 'listing' shards, improving listing + performance for sharded containers. A new config option, + `recheck_listing_shard_ranges`, controls the cache time and defaults to + 10 minutes; set it to 0 to disable caching (the previous behavior). + + * Added a new optional proxy-logging field `{wire_status_int}` for the + status code returned to the client. For more information, see + https://docs.openstack.org/swift/latest/logs.html#proxy-logs + + * Errors downloading a Static Large Object that cause a shorter-than-expected + response are now logged as 500s. + + * Memcache client error-limiting is now configurable. See the + `error_suppression_*` options in etc/memcache.conf-sample + + * Added `tasks_per_second` option to rate-limit the object-expirer. + + * Added `usedforsecurity` annotations for use on FIPS-compliant systems. + + * Added an option to write EC fragments with legacy CRC to ensure a smooth + upgrade from liberasurecode<=1.5.0 to >=1.6.2. For more information, see + https://bugs.launchpad.net/liberasurecode/+bug/1886088 + + * **Known Issue**: Operators should verify that encryption is not enabled + in their reconciler pipelines; having it enabled there may harm data + durability. For more information, see https://launchpad.net/bugs/1910804 + + * S3 API improvements: + + * Fixed a bug that prevented the s3api pipeline validation described in + proxy-server.conf-sample from being performed. 
As documented, operators + can disable this via the `auth_pipeline_check` option if proxy startup + fails with validation errors. + + * Make allowable clock skew configurable, with a default value of + 15 minutes to match AWS. Note that this was previously hardcoded at + 5 minutes; operators may want to preserve the prior behavior by setting + `allowable_clock_skew = 300` in the `[filter:s3api]` section of their + proxy-server.conf. + + * Fixed an issue where SHA mismatches in client XML payloads would cause + a server error. Swift now correctly responds with a client error about + the bad digest. + + * Fixed an issue where non-base64 signatures would cause a server error. + Swift now correctly responds with a client error about the invalid + digest. + + * Container ACLs are now cloned to the `+segments` container when it is + created. + + * The correct storage policy is now logged for S3 requests. + + * Added the ability to configure auth region in s3token middleware. + + * CORS-related headers are now passed through appropriately when using + the S3 API. Note that allowed origins and other container metadata + must still be configured through the Swift API as documented at + https://docs.openstack.org/swift/latest/cors.html + + Preflight requests do not contain enough information to map a + bucket to an account/container pair; a new cluster-wide option + `cors_preflight_allow_origin` may be configured for such OPTIONS + requests. The default (blank) rejects all S3 preflight requests. + + * Sharding improvements: + + * Prevent shard databases from losing track of their root database when + deleted. + + * Prevent sharded root databases from being reclaimed to ensure that + shards can detect that they have been deleted. + + * A `--no-auto-shard` option has been added to `swift-container-sharder`. + + * The sharder daemon has been enhanced to better support the shrinking + of shards that are no longer required. Shard containers will now + discover from their root container if they should be shrinking. They + will also discover the shards into which they should shrink, which may + include the root container itself. + + * A 'compact' command has been added to `swift-manage-shard-ranges` that + enables sequences of contiguous shards with low object counts to be + compacted into another existing shard, or into the root container. + + * `swift-manage-shard-ranges` can now accept a config file; this + may be used to ensure consistency of threshold values with the + container-sharder config. + + * Overlapping shrinking shards no longer generate audit warnings; these + are expected to sometimes overlap. + + * The sharding progress reports in recon cache now continue to be included + for a period of time after sharding has completed. The time period + may be configured using the `recon_sharded_timeout` option in the + `[container-sharder]` section of container-server.conf, and defaults + to 12 hours. + + * Add root containers with compactible ranges to recon cache. + + * Expose sharding statistics in the backend recon middleware. + + * Replication improvements: + + * Fixed a race condition in ssync that could lead to a loss of data + durability (or even loss of data, for two-replica policies) when some + object servers have outdated rings. Replication via rsync is likely + still affected by a similar bug. + + * Non-durable fragments can now be reverted from handoffs. + + * The post-rsync REPLICATE call no longer recalculates hashes immediately. 
+ + * Hashes are no longer invalidated after a successful ssync; they were + already invalidated during the data transfer. + + * Reduced log noise for common ssync errors. + + * Python 3 fixes: + + * Added support for Python 3.9. + + * Staticweb correctly handles listings when paths include non-ASCII + characters. + + * S3 API now allows multipart uploads with non-ASCII characters in the + object name. + + * Fixed an import-ordering issue in `swift-dispersion-populate`. + + * Partition power increase improvements: + + * Fixed a bug where stale state files would cause misplaced data during + multiple partition power increases. + + * Removed a race condition that could cause newly-written data to not be + linked into the new partition for the new partition power. + + * Improved safety during cleanup to ensure files have been relinked + appropriately before unlinking. + + * Added an option to drop privileges when running the relinker as root. + + * Added an option to rate-limit how quickly data files are relinked or + cleaned up. This may be used to reduce I/O load during partition power + increases, improving end-user performance. + + * Rehash partitions during the partition power increase. Previously, we + relied on the replication engine to perform the rehash, which could + cause an unexpected I/O spike after a partition power increase. + + * Warn when relinking/cleaning up and any disks are unmounted. + + * Log progress per partition when relinking/cleaning up. + + * During clean-up, stop warning about tombstones that got reaped from + the new location but not the old. + + * Added the ability to read options from object-server.conf, similar to + background daemons. + + * Turned off thread-logging when monkey-patching with eventlet. This + addresses a potential hang in the proxy-server while logging client + disconnects. + + * Fixed a bug that could cause EC GET responses to return a server error. + + * Fixed an issue with `swift-drive-audit` when run around New Year's. + + * Server errors encountered when validating the first segment of a Static or + Dynamic Large Object now return a 503 to the client, rather than a 409. + + * Errors when setting keys in memcached are now logged. This helps + operators detect when shard ranges for caching have gotten too large to + be stored, for example. + + * Various other minor bug fixes and improvements. + + +swift (2.26.0, OpenStack Victoria) + + * Extend concurrent reads to erasure coded policies. Previously, the + options `concurrent_gets` and `concurrency_timeout` only applied to + replicated policies. + + * Add a new `concurrent_ec_extra_requests` option to allow the proxy to + make some extra backend requests immediately. The proxy will respond as + soon as there are enough responses available to reconstruct. + + * The concurrent read options (`concurrent_gets`, `concurrency_timeout`, + and `concurrent_ec_extra_requests`) may now be configured per + storage-policy. + + * Replication servers can now handle all request methods. This allows + ssync to work with a separate replication network. + + * All background daemons now use the replication network. This allows + better isolation between external, client-facing traffic and internal, + background traffic. Note that during a rolling upgrade, replication + servers may respond with `405 Method Not Allowed`. To avoid this, + operators should remove the config option `replication_server = true` + from their replication servers; this will allow them to handle all + request methods before upgrading. 
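+
+    A hedged sketch of the concurrent-read options described at the start
+    of this release's notes; the values are illustrative only, and
+    per-policy overrides follow the syntax shown in proxy-server.conf-sample:
+
+        [app:proxy-server]
+        use = egg:swift#proxy
+        concurrent_gets = true
+        concurrency_timeout = 0.5
+        concurrent_ec_extra_requests = 1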
+ + * S3 API improvements: + + * Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK. + + * Add basic read support for object tagging. This improves + compatibility with AWS CLI version 2. Write support is not + yet implemented, so the tag set will always be empty. + + * CompleteMultipartUpload requests may now be safely retried. + + * Improved quota-exceeded error messages. + + * Improved logging and statsd metrics. Be aware that this will cause + an increase in the proxy-logging statsd metrics emitted for S3 + responses. However, this should more accurately reflect the state + of the system. + + * S3 requests are now less demanding on the container layer. + + * Python 3 bug fixes: + + * Fixed an error when reading encrypted data that was written while + running Python 2 for a path that includes non-ASCII characters. This + was caused by a difference in string types that resulted in + ambiguity when decrypting. To prevent the ambiguity for new data, set + `meta_version_to_write = 3` in your keymaster configuration after + upgrading all proxy servers. + + If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set + `meta_version_to_write = 1` in your keymaster configuration prior + to upgrading. + + * Object expiration respects the `expiring_objects_container_divisor` + config option. + + * `fallocate_reserve` may be specified as a percentage in more places. + + * The ETag-quoting middleware no longer raises TypeErrors. + + * Sharding improvements: + + * Prevent object updates from auto-creating shard containers. This + ensures more consistent listings for sharded containers during + rebalances. + + * Deleted shard containers are no longer considered root containers. + This prevents unnecessary sharding audit failures and allows the + deleted shard database to actually be unlinked. + + * `swift-container-info` now summarizes shard range information. + Pass `-v`/`--verbose` if you want to see all of them. + + * Improved container-sharder stat reporting to reduce load on root + container databases. + + * Don't inject shard ranges when user quits. + + * Servers now open one listen socket per worker, ensuring each worker + serves roughly the same number of concurrent connections. + + * Server workers may now be gracefully terminated via `SIGHUP` or + `SIGUSR1`. The parent process will then spawn a fresh worker. + + * During rebalances, clients should no longer get 404s for data that + exists but whose replicas are overloaded. + + * Improved cache management for account and container responses. + + * Allow proxy-logging middlewares to be configured more independently. + + * Allow operators to pass either raw or URL-quoted paths to + swift-get-nodes. Notably, this allows swift-get-nodes to work with + the reserved namespace used for object versioning. + + * Container read ACLs now work with object versioning. This only + allows access to the most-recent version via an unversioned URL. + + * Improved how containers reclaim deleted rows to reduce locking and object + update throughput. + + * Large object reads log fewer client disconnects. + + * Allow ratelimit to be placed multiple times in a proxy pipeline, + such as both before s3api and auth (to handle swift requests without + needing to make an auth decision) and after (to limit S3 requests). + + * Shuffle object-updater work. This somewhat reduces the impact a + single overloaded database has on other containers' listings.
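+
+    An illustrative (and deliberately simplified) pipeline showing the
+    ratelimit placement noted above, both before s3api/auth and after; a
+    real pipeline will carry more middleware, as in proxy-server.conf-sample:
+
+        [pipeline:main]
+        pipeline = catch_errors proxy-logging cache ratelimit s3api s3token keystoneauth ratelimit proxy-logging proxy-server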
+ + * Fix a proxy-server error when retrieving erasure coded data when + there are durable fragments but not enough to reconstruct. + + * Fix an error in the proxy server when finalizing data. + + * Improve performance when increasing partition power. + + * Various other minor bug fixes and improvements. + + +swift (2.25.1, ussuri stable backports) + + * Python 3 bug fixes: + + * Fixed an error when reading encrypted data that was written while + running Python 2 for a path that includes non-ASCII characters. This + was caused by a difference in string types that resulted in + ambiguity when decrypting. To prevent the ambiguity for new data, set + `meta_version_to_write = 3` in your keymaster configuration after + upgrading all proxy servers. + + If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set + `meta_version_to_write = 1` in your keymaster configuration prior + to upgrading. + + * Object expiration respects the `expiring_objects_container_divisor` + config option. + + * `fallocate_reserve` may be specified as a percentage in more places. + + * The ETag-quoting middleware no longer raises TypeErrors. + + * Improved how containers reclaim deleted rows to reduce locking and object + update throughput. + + * Fix a proxy-server error when retrieving erasure coded data when + there are durable fragments but not enough to reconstruct. + + * Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK. + + * Region name config option is now respected when configuring S3 credential + caching. + + +swift (2.25.0, OpenStack Ussuri) + + * WSGI server processes can now notify systemd when they are ready. + + * Added `ttfb` (Time to First Byte) and `pid` (Process ID) to the set + of available proxy-server log fields. For more information, see + https://docs.openstack.org/swift/latest/logs.html + + * Improved proxy-server performance by reducing unnecessary locking, + memory copies, and eventlet scheduling. + + * Reduced object-replicator and object-reconstructor CPU usage by only + checking that the device list is current when rings change. + + * Improved performance of sharded container listings when performing + prefix listings. + + * Improved container-sync performance when data has already been + deleted or overwritten. + + * Account quotas are now enforced even on empty accounts. + + * Getting an SLO manifest with `?format=raw` now responds with an ETag + that matches the MD5 of the generated body rather than the MD5 of + the manifest stored on disk. + + * Provide useful status codes in logs for some versioning and symlink + subrequests that were previously logged as 499. + + * Fixed 500 from cname_lookup middleware. Previously, if the looked-up + domain was used by domain_remap to update the request path, the + server would respond Internal Error. + + * On Python 3, fixed an issue when reading or writing objects with a + content-type like `message/*`. Previously, Swift would fail to respond. + + * On Python 3, fixed a RecursionError in swift-dispersion-report when + using TLS. + + * Fixed a bug in the new object versioning API that would cause more + than `limit` results to be returned when listing. + + * Various other minor bug fixes and improvements. + + +swift (2.24.0) + + * Added a new object versioning mode, with APIs for querying and + accessing old versions. 
For more information, see the documentation + at https://docs.openstack.org/swift/latest/middleware.html#module-swift.common.middleware.versioned_writes.object_versioning + + * Added support for S3 versioning using the above new mode. + + * Added a new middleware to allow accounts and containers to opt-in to + RFC-compliant ETags. This may be useful when using Swift as an origin + for some content delivery networks. For more information, see the + documentation at https://docs.openstack.org/swift/latest/middleware.html#module-swift.common.middleware.etag_quoter + Clients should be aware of the fact that ETags may be quoted for RFC + compliance; this may become the default behavior in some future release. + + * Proxy, account, container, and object servers now support "seamless + reloads" via `SIGUSR1`. This is similar to the existing graceful + restarts but keeps the server socket open the whole time, reducing + service downtime. + + * New buckets created via the S3 API will now store multi-part upload + data in the same storage policy as other data rather than the + cluster's default storage policy. + + * Device region and zone can now be changed via `swift-ring-builder`. + Note that this may cause a lot of data movement on the next rebalance + as the builder tries to reach full dispersion. + + * Added support for Python 3.8. + + * The container sharder can now handle containers with special + characters in their names. + + * Internal client no longer logs object DELETEs as status 499. + + * Objects with an `X-Delete-At` value in the far future no longer cause + backend server errors. + + * The bulk extract middleware once again allows clients to specify metadata + (including expiration timestamps) for all objects in the archive. + + * Container sync now synchronizes static symlinks in a way similar to + static large objects. + + * `swift_source` is set for more sub-requests in the proxy-server. See + https://docs.openstack.org/swift/latest/logs.html#swift-source + + * Errors encountered while validating static symlink targets no longer + cause BadResponseLength errors in the proxy-server. + + * On Python 3, the KMS keymaster now works with secrets stored + in Barbican with a text/plain payload-content-type. + + * On Python 3, the formpost middleware now works with unicode file names. + + * Several utility scripts now work better on Python 3: + + * swift-account-audit + + * swift-dispersion-populate + + * swift-drive-recon + + * swift-recon + + * On Python 3, certain S3 API headers are now lower case as they + would be coming from AWS. + + * Per-service `auto_create_account_prefix` settings are now deprecated + and may be ignored in a future release; if you need to use this, please + set it in the `[swift-constraints]` section of /etc/swift/swift.conf. + + * Various other minor bug fixes and improvements. + + +swift (2.23.3, train stable backports) + + * Sharding improvements: + + * Prevent object updates from auto-creating shard containers. This + ensures more consistent listings for sharded containers during + rebalances. + + * Deleted shard containers are no longer considered root containers. + This prevents unnecessary sharding audit failures and allows the + deleted shard database to actually be unlinked. + + * The sharder daemon has been enhanced to better support the shrinking + of shards that are no longer required. Shard containers will now + discover from their root container if they should be shrinking. 
They + will also discover the shards into which they should shrink, which may + include the root container itself. + + * Improved performance of sharded container listings when performing + prefix listings. + + * Improved container-sharder stat reporting to reduce load on root + container databases. + + * The container sharder can now handle containers with special + characters in their names. + + * `swift-container-info` now summarizes shard range information. + Pass `-v`/`--verbose` if you want to see all of them. + + * Don't inject shard ranges when user quits. + + * Various other minor bug fixes and improvements. + + * Python 3 bug fixes: + + * Fixed a potential server error when uploading data via a tempurl. + + * Fixed a potential server error when getting symlink details. + + * Added the ability to connect to memcached over TLS. See the + `tls_*` options in etc/memcache.conf-sample + + +swift (2.23.2, train stable backports) + + * Python 3 bug fixes: + + * Fixed an error when reading encrypted data that was written while + running Python 2 for a path that includes non-ASCII characters. This + was caused by a difference in string types that resulted in + ambiguity when decrypting. To prevent the ambiguity for new data, set + `meta_version_to_write = 3` in your keymaster configuration after + upgrading all proxy servers. + + If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set + `meta_version_to_write = 1` in your keymaster configuration prior + to upgrading. + + * Fixed an issue when reading or writing objects with a content-type + like `message/*`. Previously, Swift would fail to respond. + + * Object expiration respects the `expiring_objects_container_divisor` + config option. + + * `fallocate_reserve` may be specified as a percentage in more places. + + * The formpost middleware now works with unicode file names. + + * Certain S3 API headers are now lower case as they would be coming + from AWS. + + * Improved how containers reclaim deleted rows to reduce locking and object + update throughput. + + * Fix a proxy-server error when retrieving erasure coded data when + there are durable fragments but not enough to reconstruct. + + * Fixed 500 from cname_lookup middleware. Previously, if the looked-up + domain was used by domain_remap to update the request path, the + server would respond Internal Error. + + * The bulk extract middleware once again allows clients to specify metadata + (including expiration timestamps) for all objects in the archive. + + * Errors encountered while validating static symlink targets no longer + cause BadResponseLength errors in the proxy-server. + + * Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK. + + * Various other minor bug fixes and improvements. + + +swift (2.23.1, train stable backports) + + * On Python 3, the KMS keymaster now works with secrets stored + in Barbican with a text/plain payload-content-type. + + * Several utility scripts now work better on Python 3: + + * swift-account-audit + + * swift-dispersion-populate + + * swift-drive-recon + + * swift-recon + + +swift (2.23.0, OpenStack Train) + + * Python 3.6 and 3.7 are now fully supported. Several py3-related + fixes are included: + + * Removed a request-smuggling vector when running a mixed + py2/py3 cluster. + + * Allow fallocate_reserve to be specified as a percentage. + + * Fixed listings for sharded containers. + + * Fixed non-ASCII account metadata handling. + + * Fixed rsync output parsing. + + * Fixed some title-casing of headers. 
+ + If you've been testing Swift on Python 3, upgrade at your earliest + convenience. + + * Added "static symlinks", which perform some validation as they + follow redirects and include more information about their target + in container listings. + + * Multi-character strings may now be used as delimiters in account + and container listings. + + * Sharding improvements + + * Container metadata related to sharding are now removed when no + longer needed. + + * Empty container databases (such as might be created on handoffs) + now shard much more quickly. + + * The proxy-server now ignores 404 responses from handoffs that have + no data when deciding on the correct response for object requests, + similar to what it already does for account and container requests. + + * Static Large Object sizes in listings for versioned containers are + now more accurate. + + * When refetching Static Large Object manifests, non-manifest responses + are now handled better. + + * S3 API now translates 503 Service Unavailable responses to a more + S3-like response instead of raising an error. + + * Improved proxy-to-backend requests to be more RFC-compliant. + + * Dependency update: eventlet must be at least 0.25.0. This also + dragged forward minimum-supported versions of dnspython (1.15.0), + greenlet (0.3.2), and six (1.10.0). + + * Various other minor bug fixes and improvements. + + +swift (2.22.0) + + * Experimental support for Python 3.6 and 3.7 is now available. + Note that this requires eventlet>=0.25.0. All unit tests pass, + and running functional tests under Python 2 will pass against + services running under Python 3. Expect full support in the + next minor release. + + * Log formats are now more configurable and include support for + anonymization. See the log_msg_template option in proxy-server.conf + and https://docs.openstack.org/swift/latest/logs.html#proxy-logs + for more information. + + * Added an operator tool, swift-container-deleter, to asynchronously + delete some or all objects in a container using the object expirers. + + * Swift-all-in-one Docker images are now built and published to + https://hub.docker.com/r/openstackswift/saio. These are intended + for use as development targets, but will hopefully be useful as a + starting point for other work involving containerizing Swift. + + * The object-expirer may now be configured in object-server.conf. + This is in anticipation of a future change to allow the + object-expirer to be deployed on all nodes that run object-servers. + + * Correctness improvements + + * The proxy-server now ignores 404 responses from handoffs without + databases when deciding on the correct response for account and + container requests. + + * Object writes to a container whose existence cannot be verified + now 503 instead of 404. + + * Sharding improvements + + * The container-replicator now only attempts to fetch shard ranges if + the remote indicates that it has shard ranges. Further, it does so + with a timeout to prevent the process from hanging in certain cases. + + * The proxy-server now caches 'updating' shards, improving write + performance for sharded containers. A new config option, + `recheck_updating_shard_ranges`, controls the cache time; set it to + 0 to disable caching. + + * The container-replicator now correctly enqueues container-reconciler + work for sharded containers. + + * S3 API improvements + + * Unsigned payloads work with v4 signatures once more. + + * Multipart upload parts may now be copied from other multipart uploads. 
+ + * CompleteMultipartUpload requests with a Content-MD5 now work. + + * Content-Type can now be updated when copying an object. + + * Fixed v1 listings that end with a non-ASCII object name. + + * Background corruption-detection improvements + + * Detect and remove invalid entries from hashes.pkl + + * When object path is not a directory, just quarantine it, + rather than the whole suffix. + + * Dependency updates: we've increased our minimum supported version + of cryptography to 2.0.2 and netifaces to 0.8. This is largely due + to the difficulty of continuing to test with the old versions. + + If running Swift under Python 3, eventlet must be at least 0.25.0. + + * Various other minor bug fixes and improvements. + + +swift (2.21.1, stein stable backports) + + * Sharding improvements + + * The container-replicator now only attempts to fetch shard ranges if + the remote indicates that it has shard ranges. Further, it does so + with a timeout to prevent the process from hanging in certain cases. + + * The container-replicator now correctly enqueues container-reconciler + work for sharded containers. + + * Container metadata related to sharding are now removed when no + longer needed. + + * S3 API improvements + + * Unsigned payloads work with v4 signatures once more. + + * Multipart upload parts may now be copied from other multipart uploads. + + * CompleteMultipartUpload requests with a Content-MD5 now work. + + * Content-Type can now be updated when copying an object. + + * Fixed v1 listings that end with a non-ASCII object name. + + * Background corruption-detection improvements + + * Detect and remove invalid entries from hashes.pkl + + * When object path is not a directory, just quarantine it, + rather than the whole suffix. + + * Static Large Object sizes in listings for versioned containers are + now more accurate. + + * When refetching Static Large Object manifests, non-manifest responses + are now handled better. + + * Cross-account symlinks now store correct account information in + container listings. This was previously fixed in 2.22.0. + + * Requesting multiple ranges from a Dynamic Large Object now returns the + entire object instead of incorrect data. This was previously fixed in + 2.23.0. + + * When making backend requests, the proxy-server now ensures query + parameters are always properly quoted. Previously, the proxy would + encounter an error on Python 2.7.17 if the client included non-ASCII + query parameters in object requests. This was previously fixed in + 2.23.0. + + +swift (2.21.0, OpenStack Stein) + + * Change the behavior of the EC reconstructor to perform a + fragment rebuild to a handoff node when a primary peer responds + with 507 to the REPLICATE request. This changes EC to match the + existing behavior of replication when drives fail. After a + rebalance of EC rings (potentially removing unmounted/failed + devices), it's most IO efficient to run in handoffs_only mode to + avoid unnecessary rebuilds. + + * O_TMPFILE support is now detected by attempting to use it + instead of looking at the kernel version. This allows older + kernels with backported patches to take advantage of the + O_TMPFILE functionality. + + * Add slo_manifest_hook callback to allow other middlewares to + impose additional constraints on or make edits to SLO manifests + before being written. For example, a middleware could enforce + minimum segment size or insert data segments. 
+ + * Fixed an issue with multi-region EC policies that caused the EC + reconstructor to constantly attempt cross-region rebuild + traffic. + + * Fixed an issue where S3 API v4 signatures would not be validated + against the body of the request, allowing a replay attack if + request headers were captured by a malicious third party. + + * Display crypto data/metadata details in swift-object-info. + + * formpost can now accept a content-encoding parameter. + + * Fixed an issue where multipart uploads with the S3 API would + sometimes report an error despite all segments being uploaded + successfully. + + * Multipart object segments are now actually deleted when the + multipart object is deleted via the S3 API. + + * Swift now returns a 503 (instead of a 500) when an account + auto-create fails. + + * Fixed a bug where encryption would store the incorrect key + metadata if the object name starts with a slash. + + * Fixed an issue where an object server failure during a client + download could leave an open socket between the proxy and + client. + + * Fixed an issue where deleted EC objects didn't have their + on-disk directories cleaned up. This would cause extra resource + usage on the object servers. + + * Fixed an issue where bulk requests using XML and Expect: + 100-continue would return a malformed HTTP response. + + * Various other minor bug fixes and improvements. + + +swift (2.20.0) + + * S3 API compatibility updates + + * Swift can now cache the S3 secret from Keystone to use for + subsequent requests. This functionality is disabled by default but + can be enabled by setting the `secret_cache_duration` in the s3token + section of the proxy server config to a number greater than 0 (a + minimal sketch appears further down this list). + + * s3api now mimics the AWS S3 behavior of periodically sending + whitespace characters on a Complete Multipart Upload request to keep + the connection from timing out. Note that since a request could fail + after the initial 200 OK response has been sent, it is important to + check the response body to determine if the request succeeded. + + * s3api now properly handles x-amz-metadata-directive headers on + COPY operations. + + * s3api now uses concurrency (default 2) to handle multi-delete + requests. This allows multi-delete requests to be processed much + more quickly. + + * s3api now mimics some forms of AWS server-side encryption + based on whether Swift's at-rest encryption functionality is enabled. + Note that S3 API users are now able to know more about how the + cluster is configured than they were previously, i.e. whether + at-rest encryption functionality is enabled or not. + + * s3api responses now include a '-' in multipart ETags. + + For new multipart uploads via the S3 API, the ETag that is + stored will be calculated in the same way that AWS uses. This + ETag will be used in GET/HEAD responses, bucket listings, and + conditional requests via the S3 API. Accessing the same object + via the Swift API will use the SLO Etag; however, in JSON + container listings the multipart upload etag will be exposed + in a new "s3_etag" key. Previously, some S3 clients would complain + about download corruption when the ETag did not have a '-'. + + * S3 ETags for SLOs now include a '-'. + + Ordinary objects in S3 use the MD5 of the object as the ETag, + just like Swift. Multipart Uploads follow a different format, notably + including a dash followed by the number of segments. To that end + (and for S3 API requests *only*), SLO responses via the S3 API have a + literal '-N' added on the end of the ETag.
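    A minimal, hedged sketch of the S3 secret caching described at the top of this list. The section name and entry point follow the sample proxy config; the duration shown is arbitrary, and the s3token filter still needs its usual Keystone/auth settings alongside it:

        [filter:s3token]
        use = egg:swift#s3token
        # cache Keystone-provided S3 secrets for ten minutes; 0 disables caching
        secret_cache_duration = 600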
+ + * The default location is now set to "us-east-1". This is more likely + to be the default region that a client will try when using v4 + signatures. + + Deployers with clusters that relied on the old implicit default + location of "US" should explicitly set `location = US` in the + `[filter:s3api]` section of proxy-server.conf before upgrading. + + * Add basic support for ?versions bucket listings. We still do not + have support for toggling S3 bucket versioning, but we can at least + support getting the latest versions of all objects. + + * Fixed an issue with SSYNC requests to ensure that only one request + can be running on a partition at a time. + + * Data encryption updates + + * The kmip_keymaster middleware can now be configured directly in the + proxy-server config file. The existing behavior of using an external + config file is still supported. + + * Multiple keymaster middlewares are now supported. This allows + migration from one key provider to another. + + Note that secret_id values must remain unique across all keymasters + in a given pipeline. If they are not unique, the right-most keymaster + will take precedence. + + When looking for the active root secret, only the right-most + keymaster is used. + + * Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. + Previously, some versions of PyKMIP would include all wire + data when the root logger was configured to log at DEBUG; this + could expose key material in logs. Only the kmip_keymaster was + affected. + + * Fixed an issue where a failed drive could prevent the container sharder + from making progress. + + * Storage policy definitions in swift.conf can now define the diskfile + to use to access objects. See the included swift.conf-sample file for + a description of usage. + + * The EC reconstructor will now attempt to remove empty directories + immediately, while the inodes are still cached, rather than waiting + until the next run. + + * Added a keep_idle config option to configure KEEPIDLE time for TCP + sockets. The default value is the old constant of 600. + + * Add databases_per_second to the account-replicator, + container-replicator, and container-sharder. This prevents them from + using a full CPU core when they are not IO limited. + + * Allow direct_client users to overwrite the X-Timestamp header. + + * Various other minor bug fixes and improvements. + + +swift (2.19.2, rocky stable backports) + + * Sharding improvements + + * The container-replicator now only attempts to fetch shard ranges if + the remote indicates that it has shard ranges. Further, it does so + with a timeout to prevent the process from hanging in certain cases. + + * The container-replicator now correctly enqueues container-reconciler + work for sharded containers. + + * S3 API improvements + + * Fixed an issue where v4 signatures would not be validated against + the body of the request, allowing a replay attack if request headers + were captured by a malicious third party. Note that unsigned payloads + still function normally. + + * CompleteMultipartUpload requests with a Content-MD5 now work. + + * Fixed v1 listings that end with a non-ASCII object name. + + * Multipart object segments are now actually deleted when the + multipart object is deleted via the S3 API. + + * Fixed an issue that caused Delete Multiple Objects requests with + large bodies to 400. This was previously fixed in 2.20.0. + + * Fixed an issue where non-ASCII Keystone EC2 credentials would not get + mapped to the correct account. 
This was previously fixed in 2.20.0. + + * Background corruption-detection improvements + + * Detect and remove invalid entries from hashes.pkl + + * When object path is not a directory, just quarantine it, + rather than the whole suffix. + + + * Fixed a bug where encryption would store the incorrect key + metadata if the object name starts with a slash. + + * Fixed an issue where an object server failure during a client + download could leave an open socket between the proxy and + client. + + * Static Large Object sizes in listings for versioned containers are + now more accurate. + + * When refetching Static Large Object manifests, non-manifest responses + are now handled better. + + * Cross-account symlinks now store correct account information in + container listings. This was previously fixed in 2.22.0. + + * Requesting multiple ranges from a Dynamic Large Object now returns the + entire object instead of incorrect data. This was previously fixed in + 2.23.0. + + * When making backend requests, the proxy-server now ensures query + parameters are always properly quoted. Previously, the proxy would + encounter an error on Python 2.7.17 if the client included non-ASCII + query parameters in object requests. This was previously fixed in + 2.23.0. + + +swift (2.19.1, rocky stable backports) + + * Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. + Previously, some versions of PyKMIP would include all wire + data when the root logger was configured to log at DEBUG; this + could expose key material in logs. Only the kmip_keymaster was + affected. + + * Fixed an issue where a failed drive could prevent the container sharder + from making progress. + + * Fixed a bug in how Swift uses eventlet that was exposed under high + concurrency. + + +swift (2.19.0, OpenStack Rocky) + + * TempURLs now support IP range restrictions. Please see + https://docs.openstack.org/swift/latest/middleware.html#client-usage + for more information on how to use this additional restriction. + + * Add support for multiple root encryption secrets for the trivial + and KMIP keymasters. This allows operators to rotate encryption + keys over time without needing to re-encrypt all existing data + in the cluster. Please see the included sample config files for + instructions on how to configure multiple encryption keys. + + * The object updater now supports two configuration settings: + "concurrency" and "updater_workers". The latter controls how many + worker processes are spawned, while the former controls how many + concurrent container updates are performed by each worker + process. This should speed the processing of async_pendings. + + On upgrade, a node configured with concurrency=N will still handle + async updates N-at-a-time, but will do so using only one process + instead of N. + + If you have a config file like this: + + [object-updater] + concurrency = <N> + + and you want to take advantage of faster updates, then do this: + + [object-updater] + concurrency = 8 # the default; you can omit this line + updater_workers = <N> + + If you want updates to be processed exactly as before, do this: + + [object-updater] + concurrency = 1 + updater_workers = <N> + + * When listing objects in a container in json format, static large + objects (SLOs) will now include an additional new "slo_etag" key + that matches the etag returned when requesting the SLO. The + existing "hash" key remains unchanged as the MD5 of the SLO + manifest. Text and XML listings are unaffected by this change. + + * Log deprecation warnings for `run_pause`.
This setting was + deprecated in Swift 2.4.0 and is replaced by `interval`. + It may be removed in a future release. + + * Object reconstructor logs are now prefixed with information + about the specific worker process logging the message. This + makes reading the logs and understanding the messages much simpler. + + * Lower bounds of dependencies have been updated to reflect what + is actually tested. + + * SSYNC replication mode now removes as much of the directory + structure as possible as soon as it observes that the directory + is empty. This reduces the work needed for subsequent replication + passes. + + * The container-updater now reports zero objects and bytes used for + child DBs in sharded containers. This prevents double-counting in + utilization reports. + + * Add fallocate_reserve to account and container servers. This + allows disks shared between account/container and object rings to + avoid getting 100% full. The default value of 1% matches the + existing default on object servers. + + * Added an experimental `swift-ring-composer` CLI tool to build + composite rings. + + * Added an optional `read_only` middleware to make an entire cluster + or individual accounts read only. + + * Fixed a bug where zero-byte PUTs would not work properly + with "If-None-Match: *" conditional requests. + + * ACLs now work with unicode in user/account names. + + * COPY now works with unicode account names. + + * Improved S3 API compatibility. + + * Lock timeouts in the container updater are now logged at INFO + level, not ERROR. + + * Various other minor bug fixes and improvements. + + +swift (2.18.0) + + * Added container sharding, an operator-controlled feature that + may be used to shard very large container databases into a + number of smaller shard containers. This mitigates the issues + with one large DB by distributing the data across multiple + smaller databases throughout the cluster. Please read the full + overview at + https://docs.openstack.org/swift/latest/overview_container_sharding.html + + * Provide an S3 API compatibility layer. The external "swift3" + project has been imported into Swift's codebase as the "s3api" + middleware. + + * Added "emergency mode" hooks in the account and container replicators. + These options may be used to prioritize moving handoff + partitions to primary locations more quickly. This helps when + adding capacity to a ring. + + - Added `-d <devices>` and `-p <partitions>` command line options. + + - Added a handoffs-only mode. + + * Add a multiprocess mode to the object replicator. Setting the + "replicator_workers" setting to a positive value N will result + in the replicator using up to N worker processes to perform + replication tasks. At most one worker per disk will be spawned. + + Worker process logs will have a bit of information prepended so + operators can tell which messages came from which worker. The + prefix is "[worker M/N pid=P] ", where M is the worker's index, + N is the total number of workers, and P is the process ID. Every + message from the replicator's logger will have the prefix. + + * The object reconstructor will now fork all available worker + processes when operating on a subset of local devices. + + * Add support for PROXY protocol v1 to the proxy server. This + allows the Swift proxy server to log accurate client IP + addresses when there is a proxy or SSL-terminator between the + client and the Swift proxy server. Example servers supporting + this PROXY protocol include stunnel, haproxy, hitch, and + varnish.
See the sample proxy server config file for the + appropriate config setting to enable or disable this + functionality. + + * In the ratelimit middleware, account whitelist and blacklist + settings have been deprecated and may be removed in a future + release. When found, a deprecation message will be logged. + Instead of these config file values, set + X-Account-Sysmeta-Global-Write-Ratelimit:WHITELIST and + X-Account-Sysmeta-Global-Write-Ratelimit:BLACKLIST on the particular accounts that need + to be whitelisted or blacklisted. System metadata cannot be added + or modified by standard clients. Use the internal client to set sysmeta. + + * Add a --drop-prefixes flag to swift-account-info, + swift-container-info, and swift-object-info. This makes the + output between the three more consistent. + + * statsd error messages correspond to 5xx responses only. This + makes monitoring more useful because actual errors (5xx) will + not be hidden by common user requests (4xx). Previously, some 4xx + responses would be included in timing information in the statsd + error messages. + + * Truncate error logs to prevent the log handler from running out of buffer space. + + * Updated requirements.txt to match global exclusions and formatting. + + * tempauth user names now support unicode characters. + + * Various other minor bug fixes and improvements. + + +swift (2.17.1, queens stable backports) + + * Fix SLO delete for accounts with non-ASCII names. + + * Fixed an issue in COPY where concurrent requests may have copied the + wrong data. + + * Fixed a bug in how Swift uses eventlet that was exposed under high + concurrency. + + +swift (2.17.0, OpenStack Queens) + + * Added symlink objects support. + + Symlink objects reference one other object. They are created by + creating an empty object with an X-Symlink-Target header. The value of + the header is of the format <container>/<object>, and the target does + not need to exist at the time of symlink creation. Cross-account + symlinks can be created by including the + X-Symlink-Target-Account header. + + GET and HEAD requests to a symlink will operate on the + referenced object and require appropriate permission in the + target container. DELETE and PUT requests will operate on the + symlink object itself. POST requests are not forwarded to the + referenced object. POST requests sent to a symlink will result + in a 307 Temporary Redirect response. + + * Added support for inline data segments in SLO manifests. + + Upgrade impact: during a rolling upgrade, an updated proxy server + may write a manifest that an out-of-date proxy server will not be + able to read. This will resolve itself once the upgrade completes + on all nodes. + + * The tempurl digest algorithm is now configurable, and Swift added + support for both SHA-256 and SHA-512. Supported tempurl digests + are exposed to clients in `/info`. Additionally, tempurl signatures + can now be base64 encoded. + + * Object expiry improvements + + - Disallow X-Delete-At header values equal to the X-Timestamp header. + + - X-Delete-At computation now uses X-Timestamp instead of + system time. This prevents clock skew causing inconsistent + expiry data. + + - Deleting an expiring object will now cause less work in the system. + The number of async pending files written has been reduced for all + objects and greatly reduced for erasure-coded objects. This + dramatically reduces the burden on container servers. + + - Stopped logging tracebacks when receiving an unexpected response.
+ + - Allow the expirer to gracefully move past updating stale work items. + + * When the object auditor examines an object, it will now add any + missing metadata checksums. + + * `swift-ring-builder` improvements + + - Save the ring when dispersion improves, even if balance + doesn't improve. + + - Improved the granularity of the ring dispersion metric so that + small improvements after a rebalance can show changes in the + dispersion number. Dispersion in existing and new rings can be + recalculated using the new '--recalculate' option to + `swift-ring-builder`. + + - Display more info on empty rings. + + * Fixed rare socket leak on range requests to erasure-coded objects. + + * The number of container updates on object PUTs (ie to update listings) + has been recomputed to be far more efficient while maintaining + durability guarantees. Specifically, object PUTs to erasure-coded + policies will now normally result in far fewer container updates. + + * Moved Zuul v3 tox jobs into the Swift code repo. + + * Changed where liberasurecode-devel for CentOS 7 is referenced and + installed as a dependency. + + * Added container/object listing with prefix to InternalClient. + + * Added '--swift-versions' to `swift-recon` CLI to compare installed + versions in the cluster. + + * Stop logging tracebacks in the `object-replicator` when it runs + out of handoff locations. + + * Send ETag header in 206 Partial Content responses to SLO reads. + + * Now `swift-recon-cron` works with conf.d configs. + + * Improved `object-updater` stats logging. It now tells you all of + its stats (successes, failures, quarantines due to bad pickles, + unlinks, and errors), and it tells you incremental progress every + five minutes. The logging at the end of a pass remains and has + been expanded to also include all stats. + + * If a proxy server is configured to autocreate accounts and the + account create fails, it will now return a server error (500) + instead of Not Found (404). + + * Fractional replicas are no longer allowed for erasure code policies. + + * Various other minor bug fixes and improvements. + + +swift (2.16.0) + + * Add checksum to object extended attributes. + + * Let clients request heartbeats during SLO PUTs by including + the query parameter `heartbeat=on`. + + With heartbeating turned on, the proxy will start its response + immediately with 202 Accepted then send a single whitespace + character periodically until the request completes. At that + point, a final summary chunk will be sent which includes a + "Response Status" key indicating success or failure and (if + successful) an "Etag" key indicating the Etag of the resulting + SLO. + + * Added support for retrieving the encryption root secret from an + external key management system. In practice, this is currently limited + to Barbican. + + * Move listing formatting out to a new proxy middleware named + `listing_formats`. `listing_formats` should be just right of the + first proxy-logging middleware, and left of most other + middlewares. If it is not already present, it will be + automatically inserted for you. + + Note: if you have a custom middleware that makes account or + container listings, it will only receive listings in JSON format. + + * Log deprecation warning for `allow_versions` in the container + server config. Configure the `versioned_writes` middleware in + the proxy server instead. This option will be ignored in a + future release. 
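    To make the migration in the preceding item concrete, a hedged sketch follows; the option names track the sample configs, and `versioned_writes` must also appear in the proxy's [pipeline:main] for this to take effect:

        # proxy-server.conf: configure versioning in the proxy instead
        [filter:versioned_writes]
        use = egg:swift#versioned_writes
        allow_versioned_writes = true

        # container-server.conf: the deprecated equivalent this replaces
        # allow_versions = true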
+ + * Replaced `replication_one_per_device` by custom count defined by + `replication_concurrency_per_device`. The original config value + is deprecated, but continues to function for now. If both values + are defined, the old `replication_one_per_device` is ignored. + + * Fixed a rare issue where multiple backend timeouts could result + in bad data being returned to the client. + + * Cleaned up logged tracebacks when talking to memcached servers. + + * Account and container replication stats logs now include + `remote_merges`, the number of times a whole database was sent + to another node. + + * Respond 400 Bad Request when Accept headers fail to parse + instead of returning 406 Not Acceptable. + + * The `domain_remap` middleware now supports the + `mangle_client_paths` option. Its default "false" value changes + `domain_remap` parsing to stop stripping the `path_root` value + from URL paths. If users depend on this path mangling, operators + should set `mangle_client_paths` to "True" before upgrading. + + * Remove `swift-temp-url` script. The functionality has been in + swiftclient for a long time and this script has been deprecated + since 2.10.0. + + * Removed all `post_as_copy` related code and configs. The option + has been deprecated since 2.13.0. + + * Fixed XML responses (eg on bulk extractions and SLO upload + failures) to be more correct. The enclosing "delete" tag was + removed where it doesn't make sense and replaced with "extract" + or "upload" depending on the context. + + * Static Large Object (SLO) manifest may now (again) have zero-byte + last segments. + + * Fixed an issue where background consistency daemon child + processes would deadlock waiting on the same file descriptor. + + * Removed a race condition where a POST to an SLO could modify the + X-Static-Large-Object metadata. + + * Accept a trade off of dispersion for balance in the ring builder + that will result in getting to balanced rings much more quickly + in some cases. + + * Fixed using `swift-ring-builder set_weight` with more than one + device. + + * When requesting objects, return 404 if a tombstone is found and + is newer than any data found. Previous behavior was to return + stale data. + + * Various other minor bug fixes and improvements. + + +swift (2.15.2, pike stable backports) + + * Fixed a cache invalidation issue related to GET and PUT requests to + containers that would occasionally cause object PUTs to a container to + 404 after the container had been successfully created. + + * Removed a race condition where a POST to an SLO could modify the + X-Static-Large-Object metadata. + + * Fixed rare socket leak on range requests to erasure-coded objects. + + * Fix SLO delete for accounts with non-ASCII names. + + * Fixed an issue in COPY where concurrent requests may have copied the + wrong data. + + * Fixed time skew when using X-Delete-After. + + * Send ETag header in 206 Partial Content responses to SLO reads. + + +swift (2.15.1, OpenStack Pike) + + * Fixed a bug introduced in 2.15.0 where the object reconstructor + would exit with a traceback if no EC policy was configured. + + * Fixed deadlock when logging from a tpool thread. + + The object server runs certain IO-intensive methods outside the + main pthread for performance. Previously, if one of those methods + tried to log, this can cause a crash that eventually leads to an + object server with hundreds or thousands of greenthreads, all + deadlocked. The fix is to use a mutex that works across different + greenlets and different pthreads. 
+ + * The object reconstructor can now rebuild an EC fragment for an + expired object. + + * Various other minor bug fixes and improvements. + + +swift (2.15.0) + + * Add Composite Ring Functionality + + A composite ring comprises two or more component rings that are + combined to form a single ring with a replica count equal to the + sum of the component rings. The component rings are built + independently, using distinct devices in distinct regions, which + means that the dispersion of replicas between the components can + be guaranteed. + + Composite rings can be used for explicit replica placement and + "replicated EC" for global erasure codes policies. + + Composite rings support 'cooperative' rebalance which means that + during rebalance all component rings will be consulted before a + partition is moved in any component ring. This avoids the same + partition being simultaneously moved in multiple components. + + We do not yet have CLI tools for creating composite rings, but + the functionality has been enabled in the ring modules to + support this advanced functionality. CLI tools will be delivered + in a subsequent release. + + For further information see the docs at + + + * The EC reconstructor process has been dramatically improved by + adding support for multiple concurrent workers. Multiple + processes are required to get high concurrency, and this change + results in much faster rebalance times on servers with many + drives. + + Currently the default is still only one process, and no workers. + Set `reconstructor_workers` in the `[object-reconstructor]` + section to some whole number <= the number of devices on a node + to get that many reconstructor workers. + + * Add support to increase object ring partition power transparently + to end users and with no cluster downtime. Increasing the ring + partition power allows for incremental adjustment to the upper bound + of the cluster size. Please review the full docs at + . + + * Added support for per-policy proxy config options. This allows + per-policy affinity options to be set for use with duplicated EC + policies and composite rings. Certain options found in per-policy + conf sections will override their equivalents that may be set + in the [app:proxy-server] section. Currently the options handled that + way are sorting_method, read_affinity, write_affinity, + write_affinity_node_count, and write_affinity_handoff_delete_count. + + * Enabled versioned writes on Dynamic Large Objects (DLOs). + + * Write-affinity aware object deletion + + Previously, when deleting objects in multi-region swift + deployment with write affinity configured, users always get 404 + when deleting object before it's replicated to appropriate nodes. + + Now Swift will use `write_affinity_handoff_delete_count` to + define how many local handoff nodes should swift send request to + get more candidates for the final response. The default value + "auto" means Swift will calculate the number automatically based + on the number of replicas and current cluster topology. + + * Require that known-bad EC schemes be deprecated + + Erasure-coded storage policies using isa_l_rs_vand and nparity + >= 5 must be configured as deprecated, preventing any new + containers from being created with such a policy. This + configuration is known to harm data durability. Any data in such + policies should be migrated to a new policy. 
See + https://bugs.launchpad.net/swift/+bug/1639691 for more + information. + + * Optimize the Erasure Code reconstructor protocol to reduce IO + load on servers. + + * Fixed a bug where SSYNC would fail to replicate unexpired objects. + + * Fixed a bug in domain_remap when the object name starts or ends with a slash. + + * Fixed a socket leak in copy middleware when a large object was copied. + + * Fixed a few areas where the `swiftdir` option was not respected. + + * `swift-recon` now respects storage policy aliases. + + * cname_lookup middleware now accepts a `nameservers` config + variable that, if defined, will be used for DNS lookups instead of + the system default. + + * Make mount_check option usable in containerized environments by + adding a check for an ".ismount" file at the root directory of + a device. + + * Remove deprecated `vm_test_mode` option. + + * The object and container server config option `slowdown` has been + deprecated in favor of the new `objects_per_second` and + `containers_per_second` options. + + * The output of devices from `swift-ring-builder` has been reordered + by region, zone, ip, and device. + + * Imported docs content from the openstack-manuals project. + + * Various other minor bug fixes and improvements. + + +swift (2.14.0) + + * Fixed an error where a container drive error resulted in double space + usage on the remaining drives. When a drive holding a container or account database + is unmounted, the bug would create handoff replicas on all remaining + drives, increasing the drive space used and filling the cluster. + + * Fixed a UnicodeDecodeError in the object reconstructor that would + prevent objects with non-ASCII names from being reconstructed and + caused the reconstructor process to hang. + + * EC Fragment Duplication - Foundational Global EC Cluster Support. + + * Fixed an encoding issue in ssync where a mix of ASCII and non-ASCII + metadata values would cause an error. + + * `name_check` and `cname_lookup` keys have been added to `/info`. + + * Add Vary: headers for CORS responses. + + * Always set Swift processes to use UTC. + + * Prevent logged traceback in object-server on client disconnect for + chunked transfers to replicated policies. + + * Removed per-device reconstruction stats. Now that the reconstructor + is shuffling parts before going through them, those stats no longer + make sense. + + * Log correct status code for conditional requests. + + * Drop support for auth-server from common/manager.py and `swift-init`. + + * Include received fragment index in reconstructor log warnings. + + * Fixed a race condition in updating hashes.pkl where a partition + suffix invalidation may have been skipped. + + * `domain_remap` now accepts a list of domains in "storage_domain". + + * Do not follow CNAME when host is in storage_domain. + + * Enable cluster-wide CORS Expose-Headers setting via + "cors_expose_headers". + + * Cache all answers from nameservers in cname_lookup. + + * Log the correct request type of a subrequest downstream of copy. + + * Various other minor bug fixes and improvements. + + +swift (2.13.0, OpenStack Ocata) + + * Improvements in key parts of the consistency engine + + - Improved performance by eliminating an unneeded directory + structure hash. + + - Optimized the common case for hashing filesystem trees, thus + eliminating a lot of extraneous disk I/O. + + - Updated the `hashes.pkl` file format to include timestamp information + for race detection. Also simplified hashing logic to prevent race + conditions and optimize for the common case.
+ + - The erasure code reconstructor will now shuffle work jobs across all + disks instead of going disk-by-disk. This eliminates single-disk I/O + contention and allows continued scaling as concurrency is increased. + + - Erasure code reconstruction handles moving data from handoff nodes + better. Instead of moving the data to another handoff, it waits + until it can be moved to a primary node. + + Upgrade Impact: If you upgrade and roll back, you must delete all + `hashes.pkl` files. + + * If using erasure coding with ISA-L in rs_vand mode and 5 or more parity + fragments, Swift will emit a warning. This is a configuration that is + known to harm data durability. In a future release, this warning will be + upgraded to an error unless the policy is marked as deprecated. All data + in an erasure code storage policy using isa_l_rs_vand with 5 or more + parity should be migrated as soon as possible. Please see + https://bugs.launchpad.net/swift/+bug/1639691 for more information. + + * The erasure code reconstructor `handoffs_first` option has been + deprecated in favor of `handoffs_only`. `handoffs_only` is far more + useful, and just like `handoffs_first` mode in the replicator, it gives + the operator the option of forcing the consistency engine to focus + solely on revert (handoff) jobs, thus improving the speed of + rebalances. The `handoffs_only` behavior is somewhat consistent with + the replicator's `handoffs_first` option (any error on any handoff in + the replicator will make it essentially handoff only forever) but the + `handoff_only` option does what you want and is named correctly in the + reconstructor. + + * The default for `object_post_as_copy` has been changed to False. The + option is now deprecated and will be removed in a future release. If + your cluster is still running with post-as-copy enabled, please update + it to use the "fast-post" method. Future versions of Swift will not + support post-as-copy, and future features will not be supported under + post-as-copy. ("Fast-post" is where `object_post_as_copy` is false). + + * Temporary URLs now support one common form of ISO 8601 timestamps in + addition to Unix seconds-since-epoch timestamps. The ISO 8601 format + accepted is '%Y-%m-%dT%H:%M:%SZ'. This makes TempURLs more + user-friendly to produce and consume. + + * Listing containers in accounts with json or xml now includes a + `last_modified` time. This does not change any on-disk data, but simply + exposes the value to offer consistency with the object listings on + containers. + + * Fixed a bug where the ring builder would not allow removal of a device + when min_part_seconds_left was greater than zero. + + * PUT subrequests generated from a client-side COPY will now properly log + the SSC (server-side copy) Swift source field. See + https://docs.openstack.org/swift/latest/logs.html#swift-source for + more information. + + * Fixed a bug where an SLO download with a range request may have resulted + in a 5xx series response. + + * SLO manifest PUT requests can now be properly validated by sending an + ETag header of the md5 sum of the concatenated md5 sums of the + referenced segments. + + * Fixed the stats calculation in the erasure code reconstructor. + + * Rings with min_part_hours set to zero will now only move one partition + replica per rebalance, thus matching behavior when min_part_hours is + greater than zero. + + * I/O priority is now supported on AArch64 architecture. + + * Various other minor bug fixes and improvements. 
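    For the `handoffs_only` reconstructor mode described in the 2.13.0 notes above, a minimal sketch of a temporary, rebalance-focused configuration (assuming the usual [object-reconstructor] section of object-server.conf):

        [object-reconstructor]
        # focus solely on revert (handoff) jobs during a rebalance;
        # switch back to false once handoffs have drained
        handoffs_only = true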
+ + +swift (2.12.0) + + * Ring files now include byteorder information about the endian of + the machine used to generate the file, and the values are + appropriately byteswapped if deserialized on a machine with a + different endianness. + + Newly created ring files will be byteorder agnostic, but + previously generated ring files will still fail on different + endian architectures. Regenerating older ring files will cause + them to become byteorder agnostic. The regeneration of the ring + files will not cause any new data movement. Newer ring files + will still be usable by older versions of Swift (on machines + with the same endianness--this maintains existing behavior). + + * All 416 responses will now include a Content-Range header with + an unsatisfied-range value. This allows the caller to know the + valid range request value for an object. + + * TempURLs now support a validation against a common prefix. A + prefix-based signature grants access to all objects which share the + same prefix. This avoids the creation of a large amount of signatures, + when a whole container or pseudofolder is shared. + + * Correctly handle deleted files with if-none-match requests. + + * Correctly send 412 Precondition Failed if a user sends an + invalid copy destination. Previously Swift would send a 500 + Internal Server Error. + + * In SLO manifests, the `etag` and `size_bytes` keys are now fully + optional and not required. Previously, the keys needed to exist + but the values were optional. The only required key is `path`. + + * Fixed a rare infinite loop in `swift-ring-builder` while placing parts. + + * Ensure update of the container by object-updater, removing a rare + possibility that objects would never be added to a container listing. + + * Fixed non-deterministic suffix updates in hashes.pkl where a partition + may be updated much less often than expected. + + * Fixed regression in consolidate_hashes that occurred when a new + file was stored to new suffix to a non-empty partition. This bug + was introduced in 2.7.0 and could cause an increase in rsync + replication stats during and after upgrade, due to inconsistent + hashing of partition suffixes. + + * Account and container databases will now be quarantined if the + database schema has been corrupted. + + * Removed "in-process-" from func env tox name to work with + upstream CI. + + * Respect server type for --md5 check in swift-recon. + + * Remove empty db hash and suffix directories if a db gets quarantined. + + * Various other minor bug fixes and improvements. + + +swift (2.11.0) + + * We have made significant improvements and changes to the erasure + code implementation. + + - Instead of using a separate .durable file to indicate the + durable status of an EC fragment archive, we rename the .data + to include a durable marker in the filename. This saves one + inode for every EC .data file. Existing .durable files will not + be removed, and they will continue to work just fine. + + Note that after writing EC data with Swift 2.11.0 or later, that + data will not be accessible to earlier versions of Swift. + + - Closed a bug where ssync may have written bad fragment data in + some circumstances. A check was added to ensure the correct number + of bytes is written for a fragment before finalizing the write. + Also, erasure coded fragment metadata will now be validated on read + requests and, if bad data is found, the fragment will be quarantined. 
+ + - The improvements to EC reads made in Swift 2.10.0 have also been + applied to the reconstructor. This allows fragments to be rebuilt + in more circumstances, resulting in faster recovery from failures. + + - WARNING: If you are using the ISA-L library for erasure codes, + please upgrade to liberasurecode 1.3.1 (or later) as soon as + possible. If you are using isa_l_rs_vand with more than 4 parity, + please read https://bugs.launchpad.net/swift/+bug/1639691 and take + necessary action. + + - Updated the PyECLib dependency to 1.3.1. + + * Added a configurable URL base to staticweb. + + * Support multi-range GETs for static large objects. + + * TempURLs using the "inline" parameter can now also set the + "filename" parameter. Both are used in the Content-Disposition + response header. + + * Mirror X-Trans-Id to X-Openstack-Request-Id. + + * SLO will now concurrently HEAD segments, resulting in much faster + manifest validation and object creation. By default, two HEAD requests + will be done at a time, but this can be changed by the operator via + the new `concurrency` setting in the "[filter:slo]" section of + the proxy server config. + + * Suppressed the KeyError message when auditor finds an expired object. + + * Daemons using InternalClient can now be properly killed with SIGTERM. + + * Added a "user" option to the drive-audit config file. Its value is + used to set the owner of the drive-audit recon cache. + + * Throttle update_auditor_status calls so it updates no more than once + per minute. + + * Suppress unexpected-file warnings for rsync temp files. + + * Various other minor bug fixes and improvements. + + +swift (2.10.0, OpenStack Newton) + + * Object versioning now supports a "history" mode in addition to + the older "stack" mode. The difference is in how DELETE requests + are handled. For full details, please read + https://docs.openstack.org/swift/latest/overview_object_versioning.html. + + * New config variables to change the schedule priority and I/O + scheduling class. Servers and daemons now understand + `nice_priority`, `ionice_class`, and `ionice_priority` to + schedule their relative importance. Please read + https://docs.openstack.org/swift/latest/admin_guide.html + for full config details. + + * On newer kernels (3.15+ when using xfs), Swift will use the O_TMPFILE + flag when opening a file instead of creating a temporary file + and renaming it on commit. This makes the data path simpler and + allows the filesystem to more efficiently optimize the files on + disk, resulting in better performance. + + * Erasure code GET performance has been significantly + improved in clusters that are not completely healthy. + + * Significant improvements to the api-ref doc available at + https://docs.openstack.org/api-ref/object-store/. + + * A PUT or POST to a container will now update the container's + Last-Modified time, and that value will be included in a + GET/HEAD response. + + * Include object sysmeta in POST responses. Sysmeta is still + stripped from the response before being sent to the client, but + this allows middleware to make use of the information. + + * Fixed a bug where a container listing delimiter wouldn't work + with encryption. + + * Fixed a bug where some headers weren't being copied correctly + in a COPY request. + + * Container sync can now copy SLOs more efficiently by allowing + the manifest to be synced before all of the referenced segments. + This fixes a bug where container sync would not copy SLO manifests. 
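    Returning to the scheduling options introduced earlier in this 2.10.0 section, a hedged illustration of what such a stanza might look like; the daemon section and values are purely illustrative, and the admin guide remains the authority on valid classes and ranges:

        [object-auditor]
        nice_priority = 10
        ionice_class = IOPRIO_CLASS_IDLE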
+ + * Fixed a bug where some tombstone files might never be reclaimed. + + * Update dnspython dependency to 1.14, removing the need to have + separate dnspython dependencies for Py2 and Py3. + + * Deprecate swift-temp-url and call python-swiftclient's + implementation instead. This adds python-swiftclient as an + optional dependency of Swift. + + * Moved other-requirements.txt to bindep.txt. bindep.txt lists + non-python dependencies of Swift. + + * Various other minor bug fixes and improvements. + + +swift (2.9.0) + + * Swift now supports at-rest encryption. This feature encrypts all + object data and user-set object metadata as it is sent to the cluster. + This feature is designed to prevent information leaks if a hard drive + leaves the cluster. The encryption is transparent to the end-user. + + At-rest encryption in Swift is enabled on the proxy server by + adding two middlewares to the pipeline. The `keymaster` middleware + is responsible for managing the encryption keys and the `encryption` + middleware does the actual encryption and decryption. + + Existing clusters will continue to work without enabling + encryption. Although enabling this feature on existing clusters + is supported, best practice is to enable this feature on new + clusters when the cluster is created. + + For more information on the details of the at-rest encryption + feature, please see the docs at + https://docs.openstack.org/swift/latest/overview_encryption.html. + + * `swift-recon` can now be called with more than one server type. + + * Fixed a bug where non-ascii names could cause an error in logging + and cause a 5xx response to the client. + + * The install guide and API reference have been moved into Swift's + source code repository. + + * Various other minor bug fixes and improvements. + + +swift (2.8.0) + + * Allow concurrent bulk deletes for server-side deletes of static + large objects. Previously this would be single-threaded and each + DELETE executed serially. The new `delete_concurrency` value + (default value is 2) in the `[filter:slo]` and `[filter:bulk]` + sections of the proxy server config controls the concurrency + used to perform the DELETE requests for referenced segments. The + default value is recommended, but setting the value to 1 + restores previous behavior. + + * Refactor server-side copy as middleware + + The COPY verb is now implemented in the `copy` middleware instead + of in the proxy server code. If not explicitly added, the server + side copy middleware is auto-inserted to the left of `dlo`, `slo` + and `versioned_writes` middlewares in the proxy server pipeline. + As a result, dlo and slo `copy_hooks` are no longer required. SLO + manifests are now validated when copied so when copying a + manifest to another account the referenced segments must be + readable in that account for the manifest copy to succeed + (previously this validation was not made, meaning the manifest + was copied but could be unusable if the segments were not + readable). + + With this change, there should be no change in functionality or + existing behavior. + + * `fallocate_reserve` can now be a percentage (a value ending in "%"), + and the default has been adjusted to "1%". + + * Now properly require account/container metadata be valid UTF-8 + + * TempURL responses now include an `Expires` header with the + expiration time embedded in the URL. + + * Non-Python dependencies are now listed in other-requirements.txt. 
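    As a hedged sketch of the `delete_concurrency` option described at the start of this 2.8.0 section (the value shown is the documented default, and both filters accept the same option):

        [filter:slo]
        use = egg:swift#slo
        delete_concurrency = 2

        [filter:bulk]
        use = egg:swift#bulk
        delete_concurrency = 2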
+ + * `swift-ring-builder` now supports a `--yes` option to assume a + yes response to all questions. This is useful for scripts. + + * Write requests to a replicated storage policy with an even number + of replicas now have a quorum size of half the replica count + instead of half-plus-one. + + * Container sync now logs per-container stat information so operators + can track progress. This is logged at INFO level. + + * `swift-dispersion-*` now allows region to be specified when there + are multiple Swift regions served by the same Keystone instance + + * Fix infinite recursion during logging when syslog is down. + + * Fixed a bug where a backend failure during a read could result in + a missing byte in the response body. + + * Stop `staticweb` revealing container existence to unauth'd requests. + + * Reclaim isolated .meta files if they are older than the `reclaim_age`. + + * Make `rsync` ignore its own temporary files instead of spreading + them around the cluster, wasting space. + + * The object auditor now ignores files in the devices directory when + auditing objects. + + * The deprecated `threads_per_disk` setting has been removed. Deployers + are encouraged to use `servers_per_port` instead. + + * Fixed an issue where a single-replica configuration for account or + container DBs could result in the DB being inadvertently deleted if + it was placed on a handoff node. + + * `disable_fallocate` now also correctly disables `fallocate_reserve`. + + * Fixed a bug where the account-reaper did not delete all containers + in a reaped account. + + * Correctly handle delimiter queries where results start with the + delimiter and no prefix is given. + + * Changed the recommended ports for Swift services from ports + 6000-6002 to unused ports 6200-6202 so they do not conflict with + X-Windows or other services. Since these config values must be + explicitly set in the config file, this doesn't impact existing + deployments. + + * Fixed an instance where REPLICATE requests would not use + `replication_ip`. + + * Various other minor bug fixes and improvements. + + +swift (2.7.0, OpenStack Mitaka) + + * Bump PyECLib requirement to >= 1.2.0 + + * Update container on fast-POST + + "Fast-POST" is the mode where `object_post_as_copy` is set to + `False` in the proxy server config. This mode now allows for + fast, efficient updates of metadata without needing to fully + recopy the contents of the object. While the default still is + `object_post_as_copy` as True, the plan is to change the default + to False and then deprecate post-as-copy functionality in later + releases. Fast-POST now supports container-sync functionality. + + * Add concurrent reads option to proxy. + + This change adds 2 new parameters to enable and control concurrent + GETs in Swift, these are `concurrent_gets` and `concurrency_timeout`. + + `concurrent_gets` allows you to turn on or off concurrent + GETs; when on, it will set the GET/HEAD concurrency to the + replica count. And in the case of EC HEADs it will set it to + ndata. The proxy will then serve only the first valid source to + respond. This applies to all account, container, and replicated + object GETs and HEADs. For EC only HEAD requests are affected. + The default for `concurrent_gets` is off. + + `concurrency_timeout` is related to `concurrent_gets` and is + the amount of time to wait before firing the next thread. 
A + value of 0 will fire at the same time (fully concurrent), but + setting another value will stagger the firing allowing you the + ability to give a node a short chance to respond before firing + the next. This value is a float and should be somewhere between + 0 and `node_timeout`. The default is `conn_timeout`, meaning by + default it will stagger the firing. + + * Added an operational procedures guide to the docs. It can be + found at https://docs.openstack.org/swift/latest/ops_runbook/index.html and + includes information on detecting and handling day-to-day + operational issues in a Swift cluster. + + * Make `handoffs_first` a more useful mode for the object replicator. + + The `handoffs_first` replication mode is used during periods of + problematic cluster behavior (e.g. full disks) when replication + needs to quickly drain partitions from a handoff node and move + them to a primary node. + + Previously, `handoffs_first` would sort that handoff work before + "normal" replication jobs, but the normal replication work could + take quite some time and result in handoffs not being drained + quickly enough. + + In order to focus on getting handoff partitions off the node + `handoffs_first` mode will now abort the current replication + sweep before attempting any primary suffix syncing if any of the + handoff partitions were not removed for any reason - and start + over with replication of handoffs jobs as the highest priority. + + Note that `handoffs_first` being enabled will emit a warning on + start up, even if no handoff jobs fail, because of the negative + impact it can have during normal operations by dog-piling on a + node that was temporarily unavailable. + + * By default, inbound `X-Timestamp` headers are now disallowed + (except when in an authorized container-sync request). This + header is useful for allowing data migration from other storage + systems to Swift and keeping the original timestamp of the data. + If you have this migration use case (or any other requirement on + allowing the clients to set an object's timestamp), set the + `shunt_inbound_x_timestamp` config variable to False in the + gatekeeper middleware config section of the proxy server config. + + * Requesting a SLO manifest file with the query parameters + "?multipart-manifest=get&format=raw" will return the contents of + the manifest in the format as was originally sent by the client. + The "format=raw" is new. + + * Static web page listings can now be rendered with a custom + label. By default listings are rendered with a label of: + "Listing of /v1///". This change adds + a new custom metadata key/value pair + `X-Container-Meta-Web-Listings-Label: My Label` that when set, + will cause the following: "Listing of My Label/" to be + rendered instead. + + * Previously, static large objects (SLOs) had a minimum segment + size (default to 1MiB). This limit has been removed, but small + segments will be ratelimited. The config parameter + `rate_limit_under_size` controls the definition of "small" + segments (1MiB by default), and `rate_limit_segments_per_sec` + controls how many segments per second can be served (default is 1). + With the default values, the effective behavior is identical to the + previous behavior when serving SLOs. + + * Container sync has been improved to perform a HEAD on the remote + side of the sync for each object being synced. 
If the object + exists on the remote side, container-sync will no longer + transfer the object, thus significantly lowering the network + requirements to use the feature. + + * The object auditor will now clean up any old, stale rsync temp + files that it finds. These rsync temp files are left if the + rsync process fails without completing a full transfer of an + object. Since these files can be large, the temp files may end + up filling a disk. The new auditor functionality will reap these + rsync temp files if they are old. The new object-auditor config + variable `rsync_tempfile_timeout` is the number of seconds old a + tempfile must be before it is reaped. By default, this variable + is set to "auto" or the rsync_timeout plus 900 seconds (falling + back to a value of 1 day). + + * The Erasure Code reconstruction process has been made more + efficient by not syncing data files when only the durable commit + file is missing. + + * Fixed a bug where 304 and 416 response may not have the right + Etag and Accept-Ranges headers when the object is stored in an + Erasure Coded policy. + + * Versioned writes now correctly stores the date of previous versions + using GMT instead of local time. + + * The deprecated Keystone middleware option is_admin has been removed. + + * Fixed log format in object auditor. + + * The zero-byte mode (ZBF) of the object auditor will now properly + observe the `--once` option. + + * Swift keeps track, internally, of "dirty" parts of the partition + keyspace with a "hashes.pkl" file. Operations on this file no + longer require a read-modify-write cycle and use a new + "hashes.invalid" file to track dirty partitions. This change + will improve end-user performance for PUT and DELETE operations. + + * The object replicator's succeeded and failed counts are now logged. + + * `swift-recon` can now query hosts by storage policy. + + * The log_statsd_host value can now be an IPv6 address or a hostname + which only resolves to an IPv6 address. + + * Erasure coded fragments now properly call fallocate to reserve disk + space before being written. + + * Various other minor bug fixes and improvements. + + +swift (2.6.0) + + * Dependency changes + - Updated minimum version of eventlet to 0.17.4 to support IPv6. + + - Updated the minimum version of PyECLib to 1.0.7. + + * The ring rebalancing algorithm was updated to better handle edge cases + and to give better (more balanced) rings in the general case. New rings + will have better initial placement, capacity adjustments will move less + data for better balance, and existing rings that were imbalanced should + start to become better balanced as they go through rebalance cycles. + + * Added container and account reverse listings. + + A GET request to an account or container resource with a "reverse=true" + query parameter will return the listing in reverse order. When + iterating over pages of reverse listings, the relative order of marker + and end_marker are swapped. + + * Storage policies now support having more than one name. + + This allows operators to fix a typo without breaking existing clients, + or, alternatively, have "short names" for policies. This is implemented + with the "aliases" config key in the storage policy config in + swift.conf. The aliases value is a list of names that the storage + policy may also be identified by. The storage policy "name" is used to + report the policy to users (eg in container headers). The aliases have + the same naming restrictions as the policy's primary name. 
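+
+    As an illustration of the "aliases" key described above (the policy
+    name and alias values here are made up for the example), an aliased
+    policy in swift.conf might look like:
+
+        [storage-policy:1]
+        name = silver
+        aliases = canada, ca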
+ + * The object auditor learned the "interval" config value to control the + time between each audit pass. + + * `swift-recon --all` now includes the config checksum check. + + * `swift-init` learned the --kill-after-timeout option to force a service + to quit (SIGKILL) after a designated time. + + * `swift-recon` now correctly shows timestamps in UTC instead of local + time. + + * Fixed bug where `swift-ring-builder` couldn't select device id 0. + + * Documented the previously undocumented + `swift-ring-builder pretend_min_part_hours_passed` command. + + * The "node_timeout" config value now accepts decimal values. + + * `swift-ring-builder` now properly removes devices with zero weight. + + * `swift-init` return codes are updated via "--strict" and "--non-strict" + options. Please see the usage string for more information. + + * `swift-ring-builder` now reports the min_part_hours lockout time + remaining + + * Container sync has been improved to more quickly find and iterate over + the containers to be synced. This reduced server load and lowers the + time required to see data propagate between two clusters. Please see + https://docs.openstack.org/swift/latest/overview_container_sync.html for more details + about the new on-disk structure for tracking synchronized containers. + + * A container POST will now update that container's put-timestamp value. + + * TempURL header restrictions are now exposed in /info. + + * Error messages on static large object manifest responses have been + greatly improved. + + * Closed a bug where an unfinished read of a large object would leak a + socket file descriptor and a small amount of memory. (CVE-2016-0738) + + * Fixed an issue where a zero-byte object PUT with an incorrect Etag + would return a 503. + + * Fixed an error when a static large object manifest references the same + object more than once. + + * Improved performance of finding handoff nodes if a zone is empty. + + * Fixed duplication of headers in Access-Control-Expose-Headers on CORS + requests. + + * Fixed handling of IPv6 connections to memcache pools. + + * Continued work towards python 3 compatibility. + + * Various other minor bug fixes and improvements. + + +swift (2.5.0, OpenStack Liberty) + + * Added the ability to specify ranges for Static Large Object (SLO) + segments. + + * Replicator configs now support an "rsync_module" value to allow + for per-device rsync modules. This setting gives operators the + ability to fine-tune replication traffic in a Swift cluster and + isolate replication disk IO to a particular device. Please see + the docs and sample config files for more information and + examples. + + * Significant work has gone in to testing, fixing, and validating + Swift's erasure code support at different scales. + + * Swift now emits StatsD metrics on a per-policy basis. + + * Fixed an issue with Keystone integration where a COPY request to a + service account may have succeeded even if a service token was not + included in the request. + + * Ring validation now warns if a placement partition gets assigned to the + same device multiple times. This happens when devices in the ring are + unbalanced (e.g. two servers where one server has significantly more + available capacity). + + * Various other minor bug fixes and improvements. + + +swift (2.4.0) + + * Dependency changes + + - Added six requirement. This is part of an ongoing effort to add + support for Python 3. + + - Dropped support for Python 2.6. 
+ + * Config changes + + - Recent versions of Python restrict the number of headers allowed in a + request to 100. This number may be too low for custom middleware. The + new "extra_header_count" config value in swift.conf can be used to + increase the number of headers allowed. + + - Renamed "run_pause" setting to "interval" (current configs with + run_pause still work). Future versions of Swift may remove the + run_pause setting. + + * Versioned writes middleware + + The versioned writes feature has been refactored and reimplemented as + middleware. You should explicitly add the versioned_writes middleware to + your proxy pipeline, but do not remove or disable the existing container + server config setting ("allow_versions"), if it is currently enabled. + The existing container server config setting enables existing + containers to continue being versioned. Please see + https://docs.openstack.org/swift/latest/middleware.html#how-to-enable-object-versioning-in-a-swift-cluster + for further upgrade notes. + + * Allow 1+ object-servers-per-disk deployment + + Enabled by a new > 0 integer config value, "servers_per_port" in the + [DEFAULT] config section for object-server and/or replication server + configs. The setting's integer value determines how many different + object-server workers handle requests for any single unique local port + in the ring. In this mode, the parent swift-object-server process + continues to run as the original user (i.e. root if low-port binding + is required), binds to all ports as defined in the ring, and forks off + the specified number of workers per listen socket. The child, per-port + servers drop privileges and behave pretty much how object-server workers + always have, except that because the ring has unique ports per disk, the + object-servers will only be handling requests for a single disk. The + parent process detects dead servers and restarts them (with the correct + listen socket), starts missing servers when an updated ring file is + found with a device on the server with a new port, and kills extraneous + servers when their port is found to no longer be in the ring. The ring + files are stat'ed at most every "ring_check_interval" seconds, as + configured in the object-server config (same default of 15s). + + In testing, this deployment configuration (with a value of 3) lowers + request latency, improves requests per second, and isolates slow disk + IO as compared to the existing "workers" setting. To use this, each + device must be added to the ring using a different port. + + * Do container listing updates in another (green)thread + + The object server has learned the "container_update_timeout" setting + (with a default of 1 second). This value is the number of seconds that + the object server will wait for the container server to update the + listing before returning the status of the object PUT operation. + + Previously, the object server would wait up to 3 seconds for the + container server response. The new behavior dramatically lowers object + PUT latency when container servers in the cluster are busy (e.g. when + the container is very large). Setting the value too low may result in a + client PUT'ing an object and not being able to immediately find it in + listings. Setting it too high will increase latency for clients when + container servers are busy. + + * TempURL fixes (closes CVE-2015-5223) + + Do not allow PUT tempurls to create pointers to other data. + Specifically, disallow the creation of DLO object manifests via a PUT + tempurl. 
This prevents discoverability attacks which can use any PUT + tempurl to probe for private data by creating a DLO object manifest and + then using the PUT tempurl to head the object. + + * Ring changes + + - Partition placement no longer uses the port number to place + partitions. This improves dispersion in small clusters running one + object server per drive, and it does not affect dispersion in + clusters running one object server per server. + + - Added ring-builder-analyzer tool to more easily test and analyze a + series of ring management operations. + + - Stop moving partitions unnecessarily when overload is on. + + * Significant improvements and bug fixes have been made to erasure code + support. This feature is suitable for beta testing, but it is not yet + ready for broad production usage. + + * Bulk upload now treats user xattrs on files in the given archive as + object metadata on the resulting created objects. + + * Emit warning log in object replicator if "handoffs_first" or + "handoff_delete" is set. + + * Enable object replicator's failure count in swift-recon. + + * Added storage policy support to dispersion tools. + + * Support keystone v3 domains in swift-dispersion. + + * Added domain_remap information to the /info endpoint. + + * Added support for a "default_reseller_prefix" in domain_remap + middleware config. + + * Allow SLO PUTs to forgo per-segment integrity checks. Previously, each + segment referenced in the manifest also needed the correct etag and + bytes setting. These fields now allow the "null" value to skip those + particular checks on the given segment. + + * Allow rsync to use compression via a "rsync_compress" config. If set to + true, compression is only enabled for an rsync to a device in a + different region. In some cases, this can speed up cross-region + replication data transfer. + + * Added time synchronization check in swift-recon (the --time option). + + * The account reaper now runs faster on large accounts. + + * Various other minor bug fixes and improvements. + + +swift (2.3.0, OpenStack Kilo) + + * Erasure Code support (beta) + + Swift now supports an erasure-code (EC) storage policy type. This allows + deployers to achieve very high durability with less raw capacity as used + in replicated storage. However, EC requires more CPU and network + resources, so it is not good for every use case. EC is great for storing + large, infrequently accessed data in a single region. + + Swift's implementation of erasure codes is meant to be transparent to + end users. There is no API difference between replicated storage and + EC storage. + + To support erasure codes, Swift now depends on PyECLib and + liberasurecode. liberasurecode is a pluggable library that allows for + the actual EC algorithm to be implemented in a library of your choosing. + + As a beta release, EC support is nearly fully feature complete, but it + is lacking support for some features (like multi-range reads) and has + not had a full performance characterization. This feature relies on + ssync for durability. Deployers are urged to do extensive testing and + not deploy production data using an erasure code storage policy. + + Full docs are at https://docs.openstack.org/swift/latest/overview_erasure_code.html + + * Add support for container TempURL Keys. + + * Make more memcache options configurable. connection_timeout, + pool_timeout, tries, and io_timeout are all now configurable. + + * Swift now supports composite tokens. 
This allows another service to + act on behalf of a user, but only with that user's consent. + See https://docs.openstack.org/swift/latest/overview_auth.html for more details. + + * Multi-region replication was improved. When replicating data to a + different region, only one replica will be pushed per replication + cycle. This gives the remote region a chance to replicate the data + locally instead of pushing more data over the inter-region network. + + * Internal requests from the ratelimit middleware now properly log a + swift_source. See https://docs.openstack.org/swift/latest/logs.html for details. + + * Improved storage policy support for quarantine stats in swift-recon. + + * The proxy log line now includes the request's storage policy index. + + * Ring checker has been added to swift-recon to validate if rings are + built correctly. As part of this feature, storage servers have learned + the OPTIONS verb. + + * Add support of x-remove- headers for container-sync. + + * Rings now support hostnames instead of just IP addresses. + + * Swift now enforces that the API version on a request is valid. Valid + versions are configured via the valid_api_versions setting in swift.conf + + * Various other minor bug fixes and improvements. + + +swift (2.2.2) + + * Data placement changes + + This release has several major changes to data placement in Swift in + order to better handle different deployment patterns. First, with an + unbalance-able ring, less partitions will move if the movement doesn't + result in any better dispersion across failure domains. Also, empty + (partition weight of zero) devices will no longer keep partitions after + rebalancing when there is an unbalance-able ring. + + Second, the notion of "overload" has been added to Swift's rings. This + allows devices to take some extra partitions (more than would normally + be allowed by the device weight) so that smaller and unbalanced clusters + will have less data movement between servers, zones, or regions if there + is a failure in the cluster. + + Finally, rings have a new metric called "dispersion". This is the + percentage of partitions in the ring that have too many replicas in a + particular failure domain. For example, if you have three servers in a + cluster but two replicas for a partition get placed onto the same + server, that partition will count towards the dispersion metric. A + lower value is better, and the value can be used to find the proper + value for "overload". + + The overload and dispersion metrics have been exposed in the + swift-ring-build CLI tools. + + See https://docs.openstack.org/swift/latest/overview_ring.html + for more info on how data placement works now. + + * Improve replication of large out-of-sync, out-of-date containers. + + * Added console logging to swift-drive-audit with a new log_to_console + config option (default False). + + * Optimize replication when a device and/or partition is specified. + + * Fix dynamic large object manifests getting versioned. This was not + intended and did not work. Now it is properly prevented. + + * Fix the GET's response code when there is a missing segment in a + large object manifest. + + * Change black/white listing in ratelimit middleware to use sysmeta. + Instead of using the config option, operators can set + "X-Account-Sysmeta-Global-Write-Ratelimit: WHITELIST" or + "X-Account-Sysmeta-Global-Write-Ratelimit: BLACKLIST" on an account to + whitelist or blacklist it for ratelimiting. Note: the existing + config options continue to work. 
+ + * Use TCP_NODELAY on outgoing connections. + + * Improve object-replicator startup time. + + * Implement OPTIONS verb for storage nodes. + + * Various other minor bug fixes and improvements. + + +swift (2.2.1) + + * Swift now rejects object names with Unicode surrogates. + + * Return 403 (instead of 413) on unauthorized upload when over account + quota. + + * Fix a rare condition when a rebalance could cause swift-ring-builder + to crash. This would only happen on old ring files when "rebalance" + was the first command run. + + * Storage node error limits now survive a ring reload. + + * Speed up reading and writing xattrs for object metadata by using larger + xattr value sizes. The change is moving from 254 byte values to 64KiB + values. There is no migration issue with this. + + * Deleted containers beyond the reclaim age are now properly reclaimed. + + * Full Simplified Chinese translation (zh_CN locale) for errors and logs. + + * Container quota is now properly enforced during cross-account COPY. + + * ssync replication now properly uses the configured replication_ip. + + * Fixed issue where ssync did not replicate custom object headers. + + * swift-drive-audit now has the 'unmount_failed_device' config option + (default to True) that controls if the process will unmount failed + drives or not. + + * swift-drive-audit will now dump drive error rates to a recon file. + The file location is controlled by the 'recon_cache_path' config value + and it includes each drive and its associated number of errors. + + * When a filesystem doesn't support xattr, the object server now returns + a 507 Insufficient Storage error to the proxy server. + + * Clean up empty account and container partitions directories if they + are empty. This keeps the system healthy and prevents a large number + of empty directories from slowing down the replication process. + + * Show the sum of every policy's amount of async pendings in swift-recon. + + * Various other minor bug fixes and improvements. + + +swift (2.2.0, OpenStack Juno) + + * Added support for Keystone v3 auth. + + Keystone v3 introduced the concept of "domains" and user names + are no longer unique across domains. Swift's Keystone integration + now requires that ACLs be set on IDs, which are unique across + domains, and further restricts setting new ACLs to only use IDs. + + Please see https://docs.openstack.org/swift/latest/overview_auth.html for + more information on configuring Swift and Keystone together. + + * Swift now supports server-side account-to-account copy. Server- + side copy in Swift requires the X-Copy-From header (on a PUT) + or the Destination header (on a COPY). To initiate an account-to- + account copy, the existing header value remains the same, but the + X-Copy-From-Account header (on a PUT) or the Destination-Account + (on a COPY) are used to indicate the proper account. + + * Limit partition movement when adding a new placement tier. + + When adding a new placement tier (server, zone, or region), Swift + previously attempted to move all placement partitions, regardless + of the space available on the new tier, to ensure the best possible + durability. Unfortunately, this could result in too many partitions + being moved all at once to a new tier. Swift's ring-builder now + ensures that only the correct number of placement partitions are + rebalanced, and thus makes adding capacity to the cluster more + efficient. + + * Per storage policy container counts are now reported in + account response headers.
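+
+    For example (the policy name and count below are hypothetical), an
+    account HEAD response for a cluster with a policy named "gold" might
+    now include a header such as:
+
+        X-Account-Storage-Policy-Gold-Container-Count: 3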
+ + * Swift will now reject, with a 4xx series response, GET requests + with more than 50 ranges, more than 3 overlapping ranges, or more + than 8 non-increasing ranges. + + * The bind_port config setting is now required to be explicitly set. + + * The object server can now use splice() for a zero-copy GET + response. This feature is enabled with the "splice" config variable + in the object server config and defaults to off. Also, this feature + only works on recent Linux kernels (AF_ALG sockets must be + supported). A zero-copy GET response can significantly reduce CPU + requirements for object servers. + + * Added "--no-overlap" option to swift-dispersion populate so that + multiple runs of the tool can add coverage without overlapping + existing monitored partitions. + + * swift-recon now supports filtering by region. + + * Various other minor bug fixes and improvements. + + +swift (2.1.0) + + * swift-ring-builder placement was improved to allow gradual addition + of new regions without causing a massive migration of data to the new + region. The change was to prefer device weight first, then look at + failure domains. + + * Logging updates + + - Eliminated "Handoff requested (N)" log spam. + + - Added process pid to the end of storage node log lines. + + - Container auditor now logs a warning if the devices path contains a + non-directory. + + - Object daemons now send a user-agent string with their full name. + + * 412 and 416 responses are no longer tracked as errors in the StatsD + messages from the backend servers. + + * Parallel object auditor + + The object auditor can now be controlled with a "concurrency" config + value that allows multiple auditor processes to run at once. Using + multiple parallel auditor processes can speed up the overall auditor + cycle time. + + * The object updater will now concurrently update each necessary node + in a new greenthread. + + * TempURL updates + + - The default allowed methods have changed to also allow POST and + DELETE. The new default list is "GET HEAD PUT POST DELETE". + + - TempURLs for POST now also allow HEAD, matching existing GET and PUT + functionality. + + - Added filename*= support to TempURL Content-Disposition response + header. + + * X-Delete-At/After can now be used with the FormPost middleware. + + * Make swift-form-signature output a sample form. + + * Add v2 API to list endpoints middleware + + The new API adds better support for storage policies and changes the + response from a list of backend urls to a dictionary with the keys + "endpoints" and "headers". The endpoints key contains a list of the + backend urls, and the headers key is a dictionary of headers to send + along with the backend request. + + * Added allow_account_management and account_autocreate values to /info + responses. + + * Enable object system metadata on PUTs (Note: POST support is ongoing). + + * Various other minor bug fixes and improvements. + + +swift (2.0.0) + + * Storage policies + + Storage policies allow deployers to configure multiple object rings + and expose them to end users on a per-container basis. Deployers + can create policies based on hardware performance, regions, or other + criteria and independently choose different replication factors on + them. A policy is set on a Swift container at container creation + time and cannot be changed. 
+ + Full docs are at https://docs.openstack.org/swift/latest/overview_policies.html + + * Add profiling middleware in Swift + + The profile middleware provides a tool to profile Swift + code on the fly and collects statistical data for performance + analysis. A native simple Web UI is also provided to help + query and visualize the data. + + * Add --quoted option to swift-temp-url + + * swift-recon now supports checking the md5sum of swift.conf, which + helps deployers verify configurations are consistent across a cluster. + + * Users can now set the transaction id suffix by passing in + a value in the X-Trans-Id-Extra header. + + * New log_max_line_length option caps the maximum length of a log line. + + * Support If-[Un]Modified-Since for object HEAD + + * Added missing constraints and ratelimit parameters to /info + + * Add ability to remove subsections from /info + + * Unify logging for account, container, and object server processes + to provide a consistent message format. This change reorders the + fields logged for the account server. + + * Add targeted config loading to swift-init. This allows an easier + and more explicit way to tell swift-init to run specific server + process configurations. + + * Properly quote www-authenticate (CVE-2014-3497) + + * Fix logging issue when services stop on py26. + + * Change the default logged length of the auth token to 16. + + * Explicitly set permissions on generated ring files to 0644 + + * Fix file uploads larger than 2GiB in the formpost feature + + * Fixed issue where large objects would fail to download if the + auth token expired partway through the download + + * Various other minor bug fixes and improvements + + +swift (1.13.1, OpenStack Icehouse) + + * Change the behavior of CORS responses to better match the spec + + A new proxy config variable (strict_cors_mode, default to True) + has been added. Setting it to False keeps the old behavior. For + an overview of old versus new behavior, please see + https://review.opendev.org/#/c/69419/ + + * Invert the responsibility of the two instances of proxy-logging in + the proxy pipeline + + The first proxy_logging middleware instance to receive a request + in the pipeline marks that request as handling it. So now, the + left most proxy_logging middleware handles logging for all + client requests, and the right most proxy_logging middleware + handles all other requests initiated from within the pipeline to + its left. This fixes logging related to large object + requests not properly recording bandwidth. + + * Added swift-container-info and swift-account-info tools + + * Allow specification of object devices for audit + + * Dynamic large object COPY requests with ?multipart-manifest=get + now work as expected + + * When a client is downloading a large object and one of the segment + reads gets bad data, Swift will now immediately abort the request. + + * Fix ring-builder crash when a ring partition was assigned to a + deleted device, zero-weighted device, and normal device + + * Make probetests work with conf.d configs + + * Various other minor bug fixes and improvements. + + +swift (1.13.0) + + * Account-level ACLs and ACL format v2 + + Accounts now have a new privileged header to represent ACLs or + any other form of account-level access control. The value of + the header is a JSON dictionary string to be interpreted by the + auth system. A reference implementation is given in TempAuth. 
+ Please see the full docs at + https://docs.openstack.org/swift/latest/overview_auth.html + + * Added a WSGI environment flag to stop swob from always using + absolute location. This is useful if middleware needs to use + out-of-spec Location headers in a response. + + * Container sync proxies now support simple load balancing + + * Config option to lower the timeout for recoverable object GETs + + * Add a way to ratelimit all writes to an account + + * Allow multiple storage_domain values in cname_lookup middleware + + * Moved all DLO functionality into middleware + + The proxy will automatically insert the dlo middleware at an + appropriate place in the pipeline the same way it does with the + gatekeeper middleware. Clusters will still support DLOs after upgrade + even with an old config file that doesn't mention dlo at all. + + * Remove python-swiftclient dependency + + * Add secondary groups to process user during privilege escalation + + * When logging request headers, it is now possible to specify + specifically which headers should be logged + + * Added log_requests config parameter to account and container servers + to match the parameter in the object server. This allows a deployer + to turn off log messages for these processes. + + * Ensure swift.source is set for DLO/SLO requests + + * Fixed an issue where overwriting segments in a dynamic manifest + could cause issues on pipelined requests. + + * Properly handle COPY verb in container quota middleware + + * Improved StaticWeb 404 error message on web-listings and index + + * Various other minor bug fixes and improvements. + + +swift (1.12.0) + + * Several important pieces of information have been added to /info: + + - Configured constraints are included and allow a client to discover + the limits on names and object sizes that the cluster supports. + + - The supported tempurl methods are now included. + + - Static large object constraints are now included. + + * The Last-Modified header value returned will now be the object's + timestamp rounded up to the next second. This allows subsequent + requests with If-[un]modified-Since to use the Last-Modified + value as expected. + + * Non-integer values for if-delete-at headers will now properly + report a 400 error instead of a 503. + + * Fix object versioning with non-ASCII container names. + + * Bulk delete with POST now works properly. + + * Generic means for persisting system metadata + + Swift now supports system-level metadata on accounts and + containers. System metadata provides a means to store internal + custom metadata with associated Swift resources in a safe and + secure fashion without actually having to plumb custom metadata + through the core swift servers. The new gatekeeper middleware + prevents this system metadata from leaking into the request or + being set by a client. + + * catch_errors and gatekeeper middleware are now forced into the proxy + pipeline if not explicitly referenced. + + * New container sync configuration option, separating the end user + from knowing the required end point and adding more secure + signed requests. See + https://docs.openstack.org/swift/latest/overview_container_sync.html + for full information. + + * bulk middleware now can be configured to retry deleting containers. + + * The default yield_frequency used to keep client connections alive + during slow bulk requests was reduced from 60 seconds to 10 seconds. + While this is a change to a default, it should not affect deployments + and there is no migration process needed. 
+ + * Swift processes will attempt to set RLIMIT_NPROC to 8192. + + * Server processes will now exit with a non-zero error code on config + errors. + + * Warn if read_affinity is configured but not enabled. + + * Fix checkmount error parsing in swift-recon. + + * Log at warn level when an object is quarantined. + + * Fixed CVE-2014-0006 to avoid a potential timing attack with tempurl. + + * Various other minor bug fixes and improvements. + + +swift (1.11.0) + + * Added discoverable capabilities + + A Swift proxy server now by default (although it can be turned off) + will respond to requests to /info. The response to these requests + include information about the cluster and can be used by clients to + determine which features are supported in the cluster. + + * Object replication ssync (an rsync alternative) + + A Swift storage node can now be configured to use Swift primitives + for replication transport instead of rsync. This is an experimental + feature that is not yet considered production ready. + + * If a source times out on an object server read, try another one + of them with a modified range. + + * The proxy now responds to many types of requests as soon as it + has a quorum. This can help speed up responses (without + changing the results), especially when one node is acting up. + There is a post_quorum_timeout config value that can tune how + long to wait for requests to finish after a quorum has been + established. + + * Add accurate timestamps in proxy log lines for the start and + end of a request. These are added as new fields on the end of + the existing log lines, and therefore should not break + existing, well-behaved log processors. + + * Add an "inline" query parameter to tempurl + + By default, temporary URLs add a "Content-Disposition" header + that forces many clients to download the object. Now, temporary + URLs support an optional "inline" query parameter that will + force a "Content-Disposition: inline" header to be added to the + response, overriding the default. + + * Use TCP_NODELAY for created sockets. This can dramatically + lower latency for small object workloads. + + * DiskFile API, with reference implementation + + The DiskFile abstraction for talking to data on disk has been + refactored to allow alternate implementations to be developed. + Included in the codebase is an in-memory reference + implementation. For full documentation, please see the developer + documentation. The DiskFile API is still a work in progress and + is not yet finalized. + + * Removal of swift-bench + + The included benchmarking tool swift-bench has been extracted + from the codebase and is now in its own repository at + https://github.com/openstack/swift-bench. New swift-bench + binaries and packages may be found on PyPI at + https://pypi.org/project/swift-bench + + * Bulk delete now also supports the POST verb, in addition to DELETE + + * Added functionality to the swift-ring-builder to support + limited recreation of ring builder files from the ring file itself. + + * HEAD on account now returns 410 if account was deleted and + not yet reaped. The old behavior was to return a 404. + + * Fixed a bug introduced since the 1.10.0 release that + prevented expired objects from being removed from the system. + This resulted in orphaned expired objects taking up space on + the system but inaccessible to the API. This regression and + fix are only important if you have deployed code since the + 1.10.0 release. 
For a full discussion, including a script that + can be used to clean up orphaned objects, see + https://bugs.launchpad.net/swift/+bug/1257330 + + * Tie socket write buffer size to server chunk size parameter. This + pairs the underlying network buffer size with the size of data + that Swift attempts to read from the connection, thereby + improving efficiency and throughput on connections. + + * Fix 500 from account-quota middleware. If a user had set + X-Account-Meta-Quota-Bytes to something non-integer prior to + the installation of the account-quota middleware, then the + quota check would choke on it. Now a non-integer value is + treated as "no quota". + + * Quarantine objects with busted metadata. Before, if you + encountered an object with corrupt or missing xattrs, the + object server would return a 500 on GET, and wouldn't quarantine + anything. Now the object server returns a 404 for that GET and + the corrupted file is quarantined, thus giving replication a + chance to fix it. + + * Fix quarantine and error counts in audit logs + + * Report transaction ID in failure exception logs + + * Make pbr a build-time only dependency + + * Worked around a bug in eventlet 0.9.16 where the size of the + memcache connection pools would grow unbounded. + + * Tempurl keys are now properly stored as utf8 + + * Fixed an issue where concurrent PUT requests to accounts or + containers may result in errors due to locked databases. + + * Handle copy requests in account and container quota middleware + + * Now ensure that a WWW-Authenticate header is on all 401 responses + + * Various other bug fixes and improvements + + +swift (1.10.0, OpenStack Havana) + + * Added support for pooling memcache connections + + * Added support to replicating handoff partitions first in object + replication. Can also configure how many remote nodes a storage node + must talk to before removing a local handoff partition. + + * Fixed bug where memcache entries would not expire + + * Much faster calculation for choosing handoff nodes + + * Added container listing ratelimiting + + * Fixed issue where the proxy would continue to read from a storage + server even after a client had disconnected + + * Added support for headers that are only visible to the owner of a Swift + account + + * Fixed ranged GET with If-None-Match + + * Fixed an issue where rings may not be balanced after initial creation + + * Fixed internationalization support + + * Return the correct etag for a static large object on the PUT response + + * Allow users to extract archives to containers with ACLs set + + * Fix support for range requests against static large objects + + * Now logs x-copy-from header in a useful place + + * Reverted back to old XML output of account and container listings to + ensure older clients do not break + + * Account quotas now appropriately handle copy requests + + * Fix issue with UTF-8 handling in versioned writes + + * Various other bug fixes and improvements, including support for running + Swift under Pypy and continuing work to support storage policies + + +swift (1.9.1) + + * Disallow PUT, POST, and DELETE requests from creating older tombstone + files, preventing the possibility of filling up the disk and removing + unnecessary container updates. + + * Set default wsgi workers to cpu_count + + Change the default value of wsgi workers from 1 to auto. The new + default value for workers in the proxy, container, account & object + wsgi servers will spawn as many workers per process as you have cpu + cores. 
This will not be ideal for some configurations, but it's much + more likely to produce a successful out of the box deployment. + + * Added reveal_sensitive_prefix config setting to filter the auth token + logged by the proxy server. + + * Ensure Keystone's reseller prefix ends with an underscore. Previously + this was a recommendation--now it is enforced. + + * Added log_file_pattern config to swift-drive-audit for drive errors + + * Add support for telling Swift to detect a content type on a request. + + * Additional object stats are now logged in the object auditor + + * Moved the DiskFile interface into its own module + + * Ensure the SQLite cursors are closed when creating functions + + * Better support for valid Accept headers + + * In Keystone, don't allow users to delete their own account + + * Return a UTC timezone designator in container listings + + * Ensure that users can't remove their account quotas + + * Allow floating point value for dispersion coverage + + * Fix incorrect error page handling in staticweb + + * Add utf-8 charset to multipart-manifest=get response. + + * Allow dispersion tools to use keystone server with insecure certificate + + * Ensure that files are always closed in tests + + * Use OpenStack's "Hacking" guidelines for code formatting + + * Various other minor bug fixes and improvements + + +swift (1.9.0) + + * Global clusters support + + The "region" concept introduced in Swift 1.8.0 has been augmented with + support for using a separate replication network and configuring read + and write affinity. These features combine to offer support for a single + Swift cluster spanning a wide geographic area. + + * Disk performance + + The object server now can be configured to use threadpools to increase + performance and smooth out latency throughout the system. Also, many + disk operations were reordered to increase reliability and improve + performance. + + * Added config file conf.d support + + Allow Swift daemons and servers to optionally accept a directory as the + configuration parameter. This allows different parts of the config file + to be managed separately, eg each middleware could use a separate file + for its particular config settings. + + * Allow two TempURL keys per account + + By adding a second key, a user can safely rotate keys and prevent URLs + already in use from becoming invalid. TempURL middleware has also been + updated to allow a configurable set of allowed methods and to prevent a + bug related to content-disposition names. + + * Added crossdomain.xml middleware. See + https://docs.openstack.org/swift/latest/crossdomain.html for details + + * Added rsync bandwidth limit setting for object replicator + + * Transaction ID updated to include the time and an optional suffix + + * Added x-remove-versions-location header to disable versioned writes + + * Improvements to support for Keystone ACLs + + * Added parallelism to object expirer daemon + + * Added support for ring hash prefix in addition to the existing suffix + + * Allow all headers requested for CORS + + * Stop getting useless bytes on manifest Range requests + + * Improved container-sync resiliency + + * Added example Apache config files.
See + https://docs.openstack.org/swift/latest/apache_deployment_guide.html + for more info + + * If an account is marked as deleted but hasn't been reaped and is still + on disk, responses will include an "X-Account-Status" header + + * Fix 503 on account/container HEAD with invalid format + + * Added extra safety on account-level DELETE when using bulk deletes + + * Made colons quote-safe in logs (mainly for IPv6) + + * Fixed bug with bulk delete max items + + * Fixed static large object manifest range requests + + * Prevent static large objects from containing other static large objects + + * Fixed issue with use of delimiter in container queries where some + objects would not be listed + + * Various other minor bug fixes and improvements + + +swift (1.8.0, OpenStack Grizzly) + + * Make rings' replica count adjustable + + * Added a region tier to the ring above zones + + * Added timing-based sorting of object servers on read requests + + * Added support for auto-extract archive uploads + + * Added support for bulk delete requests + + * Added support for large objects with static manifests + + * Added list_endpoints middleware to provide an API for determining where + the ring places data + + * proxy-logging middleware can now handle logging for other middleware + + proxy-logging should be used twice in the proxy pipeline. The first + handles middleware logs for requests that never made it all the way + to the server. The last handles requests that do make it to the server. + + This is a change that may require an update to your proxy server + config file or custom middleware that you may be using. See the full + docs at https://docs.openstack.org/swift/latest/misc.html. + + * Changed the default sample rate for a few high-traffic requests. + + Added log_statsd_sample_rate_factor to globally tune the StatsD + sample rate. This tunable can be used to reduce StatsD traffic + proportionally for all metrics and is intended to replace + log_statsd_default_sample_rate, which is left alone for + backward-compatibility, should anyone be using it. 
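+
+    As a minimal illustration of the proxy-logging change above (the exact
+    pipeline shown is hypothetical; real deployments place additional
+    middleware between the two instances), a proxy pipeline with two
+    proxy-logging entries might look like:
+
+        pipeline = catch_errors proxy-logging cache tempauth proxy-logging proxy-server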
+ + * Added swift_hash_path_prefix option to swift.conf + + New deployments are advised to set this value to a random secret + to protect against hash collisions + + * Added user-managed container quotas + + * Added support for account-level quotas managed by an auth reseller + + * Added --run-dir option to swift-init + + * Added more options to swift-bench + + * Added support for CORS "actual requests" + + * Added fallocate_reserve option to protect against full drives + + * Allow ring rebalance to take a seed + + * Ring serialization will now produce the same gzip file (Py2.7) + + * Added support to swift-drive-audit for handling rotated logs + + * Added first-byte latency timings for GET requests + + * Added per disk PUT timing monitoring support + + * Added speed limit options for DB auditor + + * Force log entries to be one line + + * Ensure that fsync is used and not just fdatasync + + * Improved handoff node selection + + * Deprecated keystone is_admin feature + + * Fix large objects with unicode in the segment names + + * Update Swift's MemcacheRing to provide API compatibility with + standard Python memcache libraries + + * Various other minor bug fixes and improvements + + +swift (1.7.6) + + * Better tempauth storage URL guessing + + * Added --top option to swift-recon -d + + * Allow optional, temporary healthcheck failure + + * keystoneauth middleware now supports cross-tenant ACLs + + * Add dispersion report flags to limit reports + + * Add config option to turn eventlet debug on/off + + * Added override option for swift-init's KILL_WAIT + + * Added oldest and most recent replication pass to swift-recon + + * Fixed 500 error response when GETing a many-segment manifest + + * Memcached keys now use a delta timeout when possible + + * Refactor DiskFile to hide temp file names and exts + + * Remove IP-based container-sync ACLs from auth middlewares + + * Fixed bug in deleting memcached account info data + + * Fixed lazy-listing of object manifest segments + + * Fixed bug where a ? in the object name caused an error + + * Swift now returns 406 if it can't satisfy Accept + + * Fix infinite recursion bug in object replicator + + * Swift will now reject names with NULL characters + + * Fixed object-auditor logging to use a minimum of unix sockets + + * Various other minor bug fixes and improvements + + swift (1.7.5) * Support OPTIONS verb, including CORS preflight requests @@ -48,29 +4600,32 @@ swift (1.7.5) * Various other minor bug fixes and improvements -swift (1.7.4) + +swift (1.7.4, OpenStack Folsom) * Fix issue where early client disconnects may have caused a memory leak + swift (1.7.2) * Fix issue where memcache serialization was not properly loading the config value + swift (1.7.0) * Use custom encoding for ring data instead of pickle Serialize RingData in a versioned, custom format which is a combination of a JSON-encoded header and .tostring() dumps of the - replica2part2dev_id arrays. This format deserializes hundreds of times + replica2part2dev_id arrays. This format deserializes hundreds of times faster than rings serialized with Python 2.7's pickle (a significant performance regression for ring loading between Python 2.6 and Python - 2.7). Fixes bug 1031954. + 2.7). Fixes bug 1031954. The new implementation is backward-compatible; if a ring does not begin with a new-style magic string, it is assumed to be an - old-style pickle-dumped ring and is handled as before. So new Swift + old-style pickle-dumped ring and is handled as before. 
So new Swift code can read old rings, but old Swift code will not be able to read newly-serialized rings. @@ -127,6 +4682,7 @@ swift (1.7.0) * Various other minor bug fixes and improvements + swift (1.6.0) * Removed bin/swift and swift/common/client.py from the swift repo. These @@ -141,7 +4697,7 @@ swift (1.6.0) substantially affects the JSON output of the dispersion report, and any tools written to consume this output will need to be updated. - * Added Solaris (Illumos) compability + * Added Solaris (Illumos) compatibility * Added -a option to swift-get-nodes to show all handoffs @@ -186,6 +4742,7 @@ swift (1.6.0) * Various other minor bug fixes and improvements + swift (1.5.0) * New option to toggle SQLite database preallocation with account @@ -254,7 +4811,8 @@ swift (1.5.0) * Various other minor bug fixes and improvements -swift (1.4.8) + +swift (1.4.8, OpenStack Essex) * Added optional max_containers_per_account restriction @@ -285,6 +4843,7 @@ swift (1.4.8) * Refactored some ring building functions for clarity and simplicity + swift (1.4.7) * Improvements to account and container replication. @@ -303,6 +4862,7 @@ swift (1.4.7) * Other bug fixes and documentation updates. + swift (1.4.6) * TempURL and FormPost middleware added @@ -327,6 +4887,7 @@ swift (1.4.6) * Other minor bug fixes + swift (1.4.5) * New swift-orphans and swift-oldies command line tools to detect @@ -355,6 +4916,7 @@ swift (1.4.5) * PEP8 Updates. + swift (1.4.4) * Fixes to prevent socket hoarding (memory leak) @@ -397,7 +4959,8 @@ swift (1.4.4) * Query only specific zone via swift-recon. -swift (1.4.3) + +swift (1.4.3, OpenStack Diablo) * Additional quarantine catching code. @@ -421,6 +4984,7 @@ swift (1.4.3) * Fix to the swift tool to strip any leading slashes on file names when uploading. + swift (1.4.2) * Removed stats/logging code from Swift [now in separate slogging project]. @@ -448,6 +5012,7 @@ swift (1.4.2) * This fixes the bug that drop_buffer_cache() doesn't work on systems where off_t isn't 64 bits. + swift (1.4.1) * st renamed to swift @@ -464,6 +5029,7 @@ swift (1.4.1) * Accounts are auto-created if an auth token is valid when the account_autocreate proxy config parameter is set to true. + swift (1.4.0) * swift-bench now cleans up containers it creates. @@ -522,3 +5088,16 @@ swift (1.4.0) * Stats uploaders now allow overrides for source_filename_pattern and new_log_cutoff values. + + +---- + +Changelog entries for previous versions are incomplete + +swift (1.3.0, OpenStack Cactus) + +swift (1.2.0, OpenStack Bexar) + +swift (1.1.0, OpenStack Austin) + +swift (1.0.0, Initial Release) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 85297900c3..0000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,12 +0,0 @@ -If you would like to contribute to the development of OpenStack, -you must follow the steps in the "If you're a developer, start here" -section of this page: [http://wiki.openstack.org/HowToContribute](http://wiki.openstack.org/HowToContribute#If_you.27re_a_developer.2C_start_here:) - -Once those steps have been completed, changes to OpenStack -should be submitted for review via the Gerrit tool, following -the workflow documented at [http://wiki.openstack.org/GerritWorkflow](http://wiki.openstack.org/GerritWorkflow). - -Pull requests submitted through GitHub will be ignored. - -Bugs should be filed [on Launchpad](https://bugs.launchpad.net/swift), -not in GitHub's issue tracker. 
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000000..b2117bf7f5 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,184 @@ +Contributing to OpenStack Swift +=============================== + +Who is a Contributor? +--------------------- + +Put simply, if you improve Swift, you're a contributor. The easiest way to +improve the project is to tell us where there's a bug. In other words, filing +a bug is a valuable and helpful way to contribute to the project. + +Once a bug has been filed, someone will work on writing a patch to fix the +bug. Perhaps you'd like to fix a bug. Writing code to fix a bug or add new +functionality is tremendously important. + +Once code has been written, it is submitted upstream for review. All code, +even that written by the most senior members of the community, must pass code +review and all tests before it can be included in the project. Reviewing +proposed patches is a very helpful way to be a contributor. + +Swift is nothing without the community behind it. We'd love to welcome you to +our community. Come find us in #openstack-swift on OFTC IRC or on the +OpenStack dev mailing list. + +For general information on contributing to OpenStack, please check out the +`contributor guide `_ to get started. +It covers all the basics that are common to all OpenStack projects: the accounts +you need, the basics of interacting with our Gerrit review system, how we +communicate as a community, etc. + +For more project information, feel free to check out the `Swift documentation `__. + +Filing a Bug +~~~~~~~~~~~~ + +Filing a bug is the easiest way to contribute. You can find currently-tracked bugs on our `Launchpad. `__ +Use the `Report a bug `__ link to +file a new bug. + +If you find something in Swift that doesn't match the documentation or doesn't +meet your expectations with how it should work, please let us know. Of course, +if you ever get an error (like a Traceback message in the logs), we definitely +want to know about that. We'll do our best to diagnose any problem and patch +it as soon as possible. + +A bug report, at minimum, should describe what you were doing that caused the +bug. "Swift broke, pls fix" is not helpful. Instead, something like "When I +restarted syslog, Swift started logging traceback messages" is very helpful. +The goal is that we can reproduce the bug and isolate the issue in order to +apply a fix. If you don't have full details, that's ok. Anything you can +provide is helpful. + +You may have noticed that there are many tracked bugs, but not all of them +have been confirmed. If you take a look at an old bug report and you can +reproduce the issue described, please leave a comment on the bug about that. +It lets us all know that the bug is very likely to be valid. + +Reviewing Someone Else's Code +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All code reviews in OpenStack projects are done on our Gerrit instance at +`review.opendev.org `__. +Reviewing patches is one of the most effective ways you can contribute to the community. + +We've written a set of `Review Guidelines `__ to help you +give good reviews. + +To find out which reviews are a priority in the community, you can read `Priority Reviews `__ on our wiki. + +What do I work on? +------------------ + +If you're looking for a way to write and contribute code, but you're not sure +what to work on, check out the `"wishlist" bugs `__ in the bug tracker. These are +normally smaller items that someone took the time to write down but didn't +have time to implement.
+ +And please join #openstack-swift on OFTC IRC to tell us what you're working on. + +Getting Started +--------------- + +https://docs.openstack.org/swift/latest/first_contribution_swift.html + +Once those steps have been completed, changes to OpenStack +should be submitted for review via the Gerrit tool, following our `Development Workflow. `__ + +Gerrit is the review system used in the OpenStack projects. We're sorry, but +we won't be able to respond to pull requests submitted through GitHub. + +Bugs should be filed `on Launchpad `__, +not in GitHub's issue tracker. + +Swift Design Principles +======================= + +- `The Zen of Python `__ +- Simple Scales +- Minimal dependencies +- Re-use existing tools and libraries when reasonable +- Leverage the economies of scale +- Small, loosely coupled RESTful services +- No single points of failure +- Start with the use case +- ... then design from the cluster operator up +- If you haven't argued about it, you don't have the right answer yet + :) +- If it is your first implementation, you probably aren't done yet :) + +Please don't feel offended by difference of opinion. Be prepared to +advocate for your change and iterate on it based on feedback. Reach out +to other people working on the project on +`IRC `__ or +the `mailing +list `__ - we want +to help. + +Recommended workflow +==================== + +- Set up a `Swift All-In-One + VM `__\ (SAIO). + +- Make your changes. Docs and tests for your patch must land before or + with your patch. + +- Run unit tests, functional tests, probe tests ``./.unittests`` + ``./.functests`` ``./.probetests`` + +- Run ``tox`` (no command-line args needed) + +- ``git review`` + +Notes on Testing +================ + +Running the tests above against Swift in your development environment +(ie your SAIO) will catch most issues. Any patch you propose is expected +to be both tested and documented and all tests should pass. + +If you want to run just a subset of the tests while you are developing, +you can use pytest: + +.. code-block:: console + + cd test/unit/common/middleware/ && pytest test_healthcheck.py + +To check which parts of your code are being exercised by a test, you can +run tox and then point your browser to swift/cover/index.html: + +.. code-block:: console + + tox -e py3 -- test.unit.common.middleware.test_healthcheck:TestHealthCheck.test_healthcheck + +Swift's unit tests are designed to test small parts of the code in +isolation. The functional tests validate that the entire system is +working from an external perspective (they are "black-box" tests). You +can even run functional tests against public Swift endpoints. The +probetests are designed to test much of Swift's internal processes. For +example, a test may write data, intentionally corrupt it, and then +ensure that the correct processes detect and repair it. + +When your patch is submitted for code review, it will automatically be +tested on the OpenStack CI infrastructure. In addition to many of the +tests above, it will also be tested by several other OpenStack test +jobs. + +Once your patch has been reviewed and approved by core reviewers and +has passed all automated tests, it will be merged into the Swift source +tree. + +Ideas +===== + +https://wiki.openstack.org/wiki/Swift/ideas + +If you're working on something, it's a very good idea to write down +what you're thinking about. This lets others get up to speed, helps +you collaborate, and serves as a great record for future reference. 
+Write down your thoughts somewhere and put a link to it here. It +doesn't matter what form your thoughts are in; use whatever is best +for you. Your document should include why your idea is needed and your +thoughts on particular design choices and tradeoffs. Please include +some contact information (ideally, your IRC nick) so that people can +collaborate with you. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..f4a652ba40 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,76 @@ +################################################ +# +# Alpine 3.16.2 Swift-All-In-One +# +################################################ + +FROM alpine:3.16.2 +MAINTAINER Openstack Swift + +ENV S6_LOGGING 1 +ENV S6_VERSION 1.21.4.0 +ENV SOCKLOG_VERSION 3.0.1-1 +ENV ARCH amd64 +ENV BUILD_DIR "/tmp" +ENV ENV="/etc/profile" + +#COPY docker/install_scripts /install_scripts +COPY . /opt/swift + +ADD https://github.com/just-containers/s6-overlay/releases/download/v$S6_VERSION/s6-overlay-$ARCH.tar.gz /tmp/ +ADD https://github.com/just-containers/s6-overlay/releases/download/v$S6_VERSION/s6-overlay-$ARCH.tar.gz.sig /tmp/ +ADD https://github.com/just-containers/socklog-overlay/releases/download/v$SOCKLOG_VERSION/socklog-overlay-$ARCH.tar.gz /tmp/ + +RUN mkdir /etc/swift && \ + echo && \ + echo && \ + echo && \ + echo "================ starting swift_needs ===================" && \ + /opt/swift/docker/install_scripts/00_swift_needs.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ starting apk_install_prereqs ===================" && \ + /opt/swift/docker/install_scripts/10_apk_install_prereqs.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ starting apk_install_py3 ===================" && \ + /opt/swift/docker/install_scripts/21_apk_install_py3.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ starting swift_install ===================" && \ + /opt/swift/docker/install_scripts/50_swift_install.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ installing s6-overlay ===================" && \ + gpg --import /opt/swift/docker/s6-gpg-pub-key && \ + gpg --verify /tmp/s6-overlay-$ARCH.tar.gz.sig /tmp/s6-overlay-$ARCH.tar.gz && \ + gunzip -c /tmp/s6-overlay-$ARCH.tar.gz | tar -xf - -C / && \ + gunzip -c /tmp/socklog-overlay-amd64.tar.gz | tar -xf - -C / && \ + rm -rf /tmp/s6-overlay* && \ + rm -rf /tmp/socklog-overlay* && \ + echo && \ + echo && \ + echo && \ + echo "================ starting pip_uninstall_dev ===================" && \ + /opt/swift/docker/install_scripts/60_pip_uninstall_dev.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ starting apk_uninstall_dev ===================" && \ + /opt/swift/docker/install_scripts/99_apk_uninstall_dev.sh && \ + echo && \ + echo && \ + echo && \ + echo "================ clean up ===================" && \ + echo "TODO: cleanup" + #rm -rf /opt/swift + + +# Add Swift required configuration files +COPY docker/rootfs / + +ENTRYPOINT ["/init"] diff --git a/LICENSE b/LICENSE index 75b52484ea..d645695673 100644 --- a/LICENSE +++ b/LICENSE @@ -1,202 +1,202 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
+ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in index 73ea9e2e8b..495d35c967 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,12 +1,13 @@ include AUTHORS LICENSE .functests .unittests .probetests test/__init__.py -include CHANGELOG README.md +include CHANGELOG CONTRIBUTING.rst README.rst include babel.cfg include test/sample.conf include tox.ini +include requirements.txt test-requirements.txt graft doc graft etc -graft locale +graft swift/locale +recursive-include swift/common/middleware/s3api/schema *.rng graft test/functional graft test/probe graft test/unit -graft tools diff --git a/README.md b/README.md deleted file mode 100644 index a839dd6fcc..0000000000 --- a/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Swift - -A distributed object storage system designed to scale from a single machine -to thousands of servers. Swift is optimized for multi-tenancy and high -concurrency. Swift is ideal for backups, web and mobile content, and any other -unstructured data that can grow without bound. - -Swift provides a simple, REST-based API fully documented at -http://doc.openstack.org/. - -Swift was originally developed as the basis for Rackspace's Cloud Files and -was open-sourced in 2010 as part of the OpenStack project. It has since grown -to include contributions from many companies and has spawned a thriving -ecosystem of 3rd party tools. Swift's contributors are listed in the AUTHORS -file. - -## Docs - -To build documentation install sphinx (`pip install sphinx`), run -`python setup.py build_sphinx`, and then browse to /doc/build/html/index.html. -These docs are auto-generated after every commit and available online at -http://docs.openstack.org/developer/swift/. - -## For Developers - -The best place to get started is the ["SAIO - Swift All In One"](http://docs.openstack.org/developer/swift/development_saio.html). -This document will walk you through setting up a development cluster of Swift -in a VM. The SAIO environment is ideal for running small-scale tests against -swift and trying out new features and bug fixes. - -You can run unit tests with `.unittests` and functional tests with -`.functests`. - -### Code Organization - - * bin/: Executable scripts that are the processes run by the deployer - * doc/: Documentation - * etc/: Sample config files - * swift/: Core code - * account/: account server - * common/: code shared by different modules - * middleware/: "standard", officially-supported middleware - * ring/: code implementing Swift's ring - * container/: container server - * obj/: object server - * proxy/: proxy server - * test/: Unit and functional tests - -### Data Flow - -Swift is a WSGI application and uses eventlet's WSGI server. After the -processes are running, the entry point for new requests is the `Application` -class in `swift/proxy/server.py`. From there, a controller is chosen, and the -request is processed. The proxy may choose to forward the request to a back- -end server. For example, the entry point for requests to the object server is -the `ObjectController` class in `swift/obj/server.py`. 
- - -## For Deployers - -Deployer docs are also available at -http://docs.openstack.org/developer/swift/. A good starting point is at -http://docs.openstack.org/developer/swift/deployment_guide.html - -You can run functional tests against a swift cluster with `.functests`. These -functional tests require `/etc/swift/test.conf` to run. A sample config file -can be found in this source tree in `test/sample.conf`. - -## For Client Apps - -For client applications, official Python language bindings are provided at -http://github.com/openstack/python-swiftclient. - -Complete API documentation at -http://docs.openstack.org/api/openstack-object-storage/1.0/content/ - ----- - -For more information come hang out in #openstack-swift on freenode. - -Thanks, - -The Swift Development Team diff --git a/README.rst b/README.rst new file mode 100644 index 0000000000..1afe2b0a37 --- /dev/null +++ b/README.rst @@ -0,0 +1,154 @@ +=============== +OpenStack Swift +=============== + +OpenStack Swift is a distributed object storage system designed to scale +from a single machine to thousands of servers. Swift is optimized for +multi-tenancy and high concurrency. Swift is ideal for backups, web and mobile +content, and any other unstructured data that can grow without bound. + +Swift provides a simple, REST-based API fully documented at +https://docs.openstack.org/swift/latest/. + +Swift was originally developed as the basis for Rackspace's Cloud Files +and was open-sourced in 2010 as part of the OpenStack project. It has +since grown to include contributions from many companies and has spawned +a thriving ecosystem of 3rd party tools. Swift's contributors are listed +in the AUTHORS file. + +Docs +---- + +To build documentation run:: + + pip install -r requirements.txt -r doc/requirements.txt + sphinx-build -W -b html doc/source doc/build/html + +and then browse to doc/build/html/index.html. These docs are auto-generated +after every commit and available online at +https://docs.openstack.org/swift/latest/. + +For Developers +-------------- + +Getting Started +~~~~~~~~~~~~~~~ + +Swift is part of OpenStack and follows the code contribution, review, and +testing processes common to all OpenStack projects. + +If you would like to start contributing, check out these +`notes `__ to help you get started. + +The best place to get started is the +`"SAIO - Swift All In One" `__. +This document will walk you through setting up a development cluster of +Swift in a VM. The SAIO environment is ideal for running small-scale +tests against Swift and trying out new features and bug fixes. + +Tests +~~~~~ + +There are three types of tests included in Swift's source tree. + +#. Unit tests +#. Functional tests +#. Probe tests + +Unit tests check that small sections of the code behave properly. For example, +a unit test may test a single function to ensure that various input gives the +expected output. This validates that the code is correct and regressions are +not introduced. + +Functional tests check that the client API is working as expected. These can +be run against any endpoint claiming to support the Swift API (although some +tests require multiple accounts with different privilege levels). These are +"black box" tests that ensure that client apps written against Swift will +continue to work. + +Probe tests are "white box" tests that validate the internal workings of a +Swift cluster. They are written to work against the +`"SAIO - Swift All In One" `__ +dev environment. 
For example, a probe test may create an object, delete one +replica, and ensure that the background consistency processes find and correct +the error. + +You can run unit tests with ``.unittests``, functional tests with +``.functests``, and probe tests with ``.probetests``. There is an +additional ``.alltests`` script that wraps the other three. + +To fully run the tests, the target environment must use a filesystem that +supports large xattrs. XFS is strongly recommended. For unit tests and in- +process functional tests, either mount ``/tmp`` with XFS or provide another +XFS filesystem via the ``TMPDIR`` environment variable. Without this setting, +tests should still pass, but a very large number will be skipped. + +Code Organization +~~~~~~~~~~~~~~~~~ + +- doc/: Documentation +- etc/: Sample config files +- examples/: Config snippets used in the docs +- swift/: Core code + + - account/: account server + - cli/: code that backs some of the CLI tools + - common/: code shared by different modules + + - middleware/: "standard", officially-supported middleware + - ring/: code implementing Swift's ring + + - container/: container server + - locale/: internationalization (translation) data + - obj/: object server + - proxy/: proxy server + +- test/: Unit, functional, and probe tests + +Data Flow +~~~~~~~~~ + +Swift is a WSGI application and uses eventlet's WSGI server. After the +processes are running, the entry point for new requests is the +``Application`` class in ``swift/proxy/server.py``. From there, a +controller is chosen, and the request is processed. The proxy may choose +to forward the request to a back-end server. For example, the entry +point for requests to the object server is the ``ObjectController`` +class in ``swift/obj/server.py``. + +For Deployers +------------- + +Deployer docs are also available at +https://docs.openstack.org/swift/latest/. A good starting point is at +https://docs.openstack.org/swift/latest/deployment_guide.html +There is an `ops runbook `__ +that gives information about how to diagnose and troubleshoot common issues +when running a Swift cluster. + +You can run functional tests against a Swift cluster with +``.functests``. These functional tests require ``/etc/swift/test.conf`` +to run. A sample config file can be found in this source tree in +``test/sample.conf``. + +For Client Apps +--------------- + +For client applications, official Python language bindings are provided +at https://opendev.org/openstack/python-swiftclient. + +Complete API documentation at +https://docs.openstack.org/api-ref/object-store/ + +There is a large ecosystem of applications and libraries that support and +work with OpenStack Swift. Several are listed on the +`associated projects `__ +page. + +-------------- + +For more information come hang out in #openstack-swift on OFTC. + +Thanks, + +The Swift Development Team diff --git a/REVIEW_GUIDELINES.rst b/REVIEW_GUIDELINES.rst new file mode 100644 index 0000000000..e2bc4c6bbb --- /dev/null +++ b/REVIEW_GUIDELINES.rst @@ -0,0 +1,390 @@ +Review Guidelines +================= + +Effective code review is a skill like any other professional skill you +develop with experience. Effective code review requires trust. No +one is perfect. Everyone makes mistakes. Trust builds over time. + +This document will enumerate behaviors commonly observed and +associated with competent reviews of changes purposed to the Swift +code base. No one is expected to "follow these steps". 
Guidelines +are not *rules*, not all behaviors will be relevant in all situations. + + Code review is collaboration, not judgement. + + -- Alistair Coles + +Checkout the Change +------------------- + +You will need to have a copy of the change in an environment where you +can freely edit and experiment with the code in order to provide a +non-superficial review. Superficial reviews are not terribly helpful. +Always try to be helpful. ;) + +Check out the change so that you may begin. + +Commonly, ``git review -d `` + +Run it +------ + +Imagine that you submit a patch to Swift, and a reviewer starts to +take a look at it. Your commit message on the patch claims that it +fixes a bug or adds a feature, but as soon as the reviewer downloads +it locally and tries to test it, a severe and obvious error shows up. +Something like a syntax error or a missing dependency. + +"Did you even run this?" is the review comment all contributors dread. + +Reviewers in particular need to be fearful merging changes that just +don't work - or at least fail in frequently common enough scenarios to +be considered "horribly broken". A comment in our review that says +roughly "I ran this on my machine and observed ``description of +behavior change is supposed to achieve``" is the most powerful defense +we have against the terrible scorn from our fellow Swift developers +and operators when we accidentally merge bad code. + +If you're doing a fair amount of reviews - you will participate in +merging a change that will break my clusters - it's cool - I'll do it +to you at some point too (sorry about that). But when either of us go +look at the reviews to understand the process gap that allowed this to +happen - it better not be just because we were too lazy to check it out +and run it before it got merged. + +Or be warned, you may receive, the dreaded... + + "Did you even *run* this?" + +I'm sorry, I know it's rough. ;) + +Consider edge cases very seriously +---------------------------------- + + Saying "that should rarely happen" is the same as saying "that + *will* happen" + + -- Douglas Crockford + +Scale is an *amazingly* abusive partner. If you contribute changes to +Swift your code is running - in production - at scale - and your bugs +cannot hide. I wish on all of us that our bugs may be exceptionally +rare - meaning they only happen in extremely unlikely edge cases. For +example, bad things that happen only 1 out of every 10K times an op is +performed will be discovered in minutes. Bad things that happen only +1 out of every one billion times something happens will be observed - +by multiple deployments - over the course of a release. Bad things +that happen 1/100 times some op is performed are considered "horribly +broken". Tests must exhaustively exercise possible scenarios. Every +system call and network connection will raise an error and timeout - +where will that Exception be caught? + +Run the tests +------------- + +Yes, I know Gerrit does this already. You can do it *too*. You might +not need to re-run *all* the tests on your machine - it depends on the +change. But, if you're not sure which will be most useful - running +all of them best - unit - functional - probe. If you can't reliably +get all tests passing in your development environment you will not be +able to do effective reviews. 
Whatever tests/suites you are able to +exercise/validate on your machine against your config you should +mention in your review comments so that other reviewers might choose +to do *other* testing locally when they have the change checked out. + +e.g. + + I went ahead and ran probe/test_object_metadata_replication.py on + my machine with both sync_method = rsync and sync_method = ssync - + that works for me - but I didn't try it with object_post_as_copy = + false + +Maintainable Code is Obvious +---------------------------- + +Style is an important component to review. The goal is maintainability. + +However, keep in mind that generally style, readability and +maintainability are orthogonal to the suitability of a change for +merge. A critical bug fix may be a well written pythonic masterpiece +of style - or it may be a hack-y ugly mess that will absolutely need +to be cleaned up at some point - but it absolutely should merge +because: CRITICAL. BUG. FIX. + +You should comment inline to praise code that is "obvious". You should +comment inline to highlight code that you found to be "obfuscated". + +Unfortunately "readability" is often subjective. We should remember +that it's probably just our own personal preference. Rather than a +comment that says "You should use a list comprehension here" - rewrite +the code as a list comprehension, run the specific tests that hit the +relevant section to validate your code is correct, then leave a +comment that says: + + I find this more readable: + + ``diff with working tested code`` + +If the author (or another reviewer) agrees - it's possible the change will get +updated to include that improvement before it is merged; or it may happen in a +follow-up change. + +However, remember that style is non-material - it is useful to provide (via +diff) suggestions to improve maintainability as part of your review - but if +the suggestion is functionally equivalent - it is by definition optional. + +Commit Messages +--------------- + +Read the commit message thoroughly before you begin the review. + +Commit messages must answer the "why" and the "what for" - more so +than the "how" or "what it does". Commonly this will take the form of +a short description: + +- What is broken - without this change +- What is impossible to do with Swift - without this change +- What is slower/worse/harder - without this change + +If you're not able to discern why a change is being made or how it +would be used - you may have to ask for more details before you can +successfully review it. + +Commit messages need to have a high consistent quality. While many +things under source control can be fixed and improved in a follow-up +change - commit messages are forever. Luckily it's easy to fix minor +mistakes using the in-line edit feature in Gerrit! If you can avoid +ever having to *ask* someone to change a commit message you will find +yourself an amazingly happier and more productive reviewer. + +Also commit messages should follow the OpenStack Commit Message +guidelines, including references to relevant impact tags or bug +numbers. You should hand out links to the OpenStack Commit Message +guidelines *liberally* via comments when fixing commit messages during +review. + +Here you go: `GitCommitMessages `_ + +New Tests +--------- + +New tests should be added for all code changes. Historically you +should expect good changes to have a diff line count ratio of at least +2:1 tests to code. 
Even if a change has to "fix" a lot of *existing* +tests, if a change does not include any *new* tests it probably should +not merge. + +If a change includes a good ratio of test changes and adds new tests - +you should say so in your review comments. + +If it does not - you should write some! + +... and offer them to the patch author as a diff indicating to them that +"something" like these tests I'm providing as an example will *need* to be +included in this change before it is suitable to merge. Bonus points if you +include suggestions for the author as to how they might improve or expand upon +the tests stubs you provide. + +Be *very* careful about asking an author to add a test for a "small change" +before attempting to do so yourself. It's quite possible there is a lack of +existing test infrastructure needed to develop a concise and clear test - the +author of a small change may not be the best person to introduce a large +amount of new test infrastructure. Also, most of the time remember it's +*harder* to write the test than the change - if the author is unable to +develop a test for their change on their own you may prevent a useful change +from being merged. At a minimum you should suggest a specific unit test that +you think they should be able to copy and modify to exercise the behavior in +their change. If you're not sure if such a test exists - replace their change +with an Exception and run tests until you find one that blows up. + +Documentation +------------- + +Most changes should include documentation. New functions and code +should have Docstrings. Tests should obviate new or changed behaviors +with descriptive and meaningful phrases. New features should include +changes to the documentation tree. New config options should be +documented in example configs. The commit message should document the +change for the change log. + +Always point out typos or grammar mistakes when you see them in +review, but also consider that if you were able to recognize the +intent of the statement - documentation with typos may be easier to +iterate and improve on than nothing. + +If a change does not have adequate documentation it may not be suitable to +merge. If a change includes incorrect or misleading documentation or is +contrary to *existing* documentation is probably is not suitable to merge. + +Every change could have better documentation. + +Like with tests, a patch isn't done until it has docs. Any patch that +adds a new feature, changes behavior, updates configs, or in any other +way is different than previous behavior requires docs. manpages, +sample configs, docstrings, descriptive prose in the source tree, etc. + +Reviewers Write Code +-------------------- + +Reviews have been shown to provide many benefits - one of which is shared +ownership. After providing a positive review you should understand how the +change works. Doing this will probably require you to "play with" the change. + +You might functionally test the change in various scenarios. You may need to +write a new unit test to validate the change will degrade gracefully under +failure. You might have to write a script to exercise the change under some +superficial load. You might have to break the change and validate the new +tests fail and provide useful errors. You might have to step through some +critical section of the code in a debugger to understand when all the possible +branches are exercised in tests. 
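As a concrete version of the "offer them as a diff" advice above, the kind of self-contained test stub a reviewer might attach in a comment could look like the following; the function and the behavior it checks are invented stand-ins, not anything in the Swift tree::

    import unittest


    def frobnicate(offset):
        # Invented stand-in for the function under review; swap in the
        # real import and the edge case the change is supposed to handle.
        if offset < 0:
            raise ValueError("offset must be non-negative")
        return offset


    class TestFrobnicate(unittest.TestCase):
        def test_rejects_negative_offsets(self):
            with self.assertRaises(ValueError):
                frobnicate(-1)

        def test_passes_through_valid_offsets(self):
            self.assertEqual(frobnicate(7), 7)


    if __name__ == "__main__":
        unittest.main()

Handing the author something they can copy, adapt, and run is usually far more productive than asking them to guess what you had in mind.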
+ +When you're done with your review an artifact of your effort will be +observable in the piles of code and scripts and diffs you wrote while +reviewing. You should make sure to capture those artifacts in a paste +or gist and include them in your review comments so that others may +reference them. + +e.g. + + When I broke the change like this: + + ``diff`` + + it blew up like this: + + ``unit test failure`` + + +It's not uncommon that a review takes more time than writing a change - +hopefully the author also spent as much time as you did *validating* their +change but that's not really in your control. When you provide a positive +review you should be sure you understand the change - even seemingly trivial +changes will take time to consider the ramifications. + +Leave Comments +-------------- + +Leave. Lots. Of. Comments. + +A popular web comic has stated that +`WTFs/Minute `_ is the +*only* valid measurement of code quality. + +If something initially strikes you as questionable - you should jot +down a note so you can loop back around to it. + +However, because of the distributed nature of authors and reviewers +it's *imperative* that you try your best to answer your own questions +as part of your review. + +Do not say "Does this blow up if it gets called when xyz" - rather try +and find a test that specifically covers that condition and mention it +in the comment so others can find it more quickly. Or if you can find +no such test, add one to demonstrate the failure, and include a diff +in a comment. Hopefully you can say "I *thought* this would blow up, +so I wrote this test, but it seems fine." + +But if your initial reaction is "I don't understand this" or "How does +this even work?" you should notate it and explain whatever you *were* +able to figure out in order to help subsequent reviewers more quickly +identify and grok the subtle or complex issues. + +Because you will be leaving lots of comments - many of which are +potentially not highlighting anything specific - it is VERY important +to leave a good summary. Your summary should include details of how +you reviewed the change. You may include what you liked most, or +least. + +If you are leaving a negative score ideally you should provide clear +instructions on how the change could be modified such that it would be +suitable for merge - again diffs work best. + +Scoring +------- + +Scoring is subjective. Try to realize you're making a judgment call. + +A positive score means you believe Swift would be undeniably better +off with this code merged than it would be going one more second +without this change running in production immediately. It is indeed +high praise - you should be sure. + +A negative score means that to the best of your abilities you have not +been able to your satisfaction, to justify the value of a change +against the cost of its deficiencies and risks. It is a surprisingly +difficult chore to be confident about the value of unproven code or a +not well understood use-case in an uncertain world, and unfortunately +all too easy with a **thorough** review to uncover our defects, and be +reminded of the risk of... regression. + +Reviewers must try *very* hard first and foremost to keep master stable. + +If you can demonstrate a change has an incorrect *behavior* it's +almost without exception that the change must be revised to fix the +defect *before* merging rather than letting it in and having to also +file a bug. + +Every commit must be deployable to production. 
+ +Beyond that - almost any change might be merge-able depending on +its merits! Here are some tips you might be able to use to find more +changes that should merge! + +#. Fixing bugs is HUGELY valuable - the *only* thing which has a + higher cost than the value of fixing a bug - is adding a new + bug - if it's broken and this change makes it fixed (without + breaking anything else) you have a winner! + +#. Features are INCREDIBLY difficult to justify their value against + the cost of increased complexity, lowered maintainability, risk + of regression, or new defects. Try to focus on what is + *impossible* without the feature - when you make the impossible + possible, things are better. Make things better. + +#. Purely test/doc changes, complex refactoring, or mechanical + cleanups are quite nuanced because there's less concrete + objective value. I've seen lots of these kind of changes + get lost to the backlog. I've also seen some success where + multiple authors have collaborated to "push-over" a change + rather than provide a "review" ultimately resulting in a + quorum of three or more "authors" who all agree there is a lot + of value in the change - however subjective. + +Because the bar is high - most reviews will end with a negative score. + +However, for non-material grievances (nits) - you should feel +confident in a positive review if the change is otherwise complete +correct and undeniably makes Swift better (not perfect, *better*). If +you see something worth fixing you should point it out in review +comments, but when applying a score consider if it *need* be fixed +before the change is suitable to merge vs. fixing it in a follow up +change? Consider if the change makes Swift so undeniably *better* +and it was deployed in production without making any additional +changes would it still be correct and complete? Would releasing the +change to production without any additional follow up make it more +difficult to maintain and continue to improve Swift? + +Endeavor to leave a positive or negative score on every change you review. + +Use your best judgment. + +A note on Swift Core Maintainers +-------------------------------- + +Swift Core maintainers may provide positive reviews scores that *look* +different from your reviews - a "+2" instead of a "+1". + +But it's *exactly the same* as your "+1". + +It means the change has been thoroughly and positively reviewed. The +only reason it's different is to help identify changes which have +received multiple competent and positive reviews. If you consistently +provide competent reviews you run a *VERY* high risk of being +approached to have your future positive review scores changed from a +"+1" to "+2" in order to make it easier to identify changes which need +to get merged. + +Ideally a review from a core maintainer should provide a clear path +forward for the patch author. If you don't know how to proceed +respond to the reviewers comments on the change and ask for help. +We'd love to try and help. diff --git a/api-ref/source/conf.py b/api-ref/source/conf.py new file mode 100644 index 0000000000..dea25e9c7b --- /dev/null +++ b/api-ref/source/conf.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# swift documentation build configuration file +# +# This file is execfile()d with the current directory set to +# its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import datetime +import os +import sys +import warnings + + +html_theme = 'openstackdocs' +html_theme_options = { + "sidebar_mode": "toc", +} + +extensions = [ + 'os_api_ref', + 'openstackdocstheme' +] + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('../../')) +sys.path.insert(0, os.path.abspath('../')) +sys.path.insert(0, os.path.abspath('./')) + +# -- General configuration ---------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Object Storage API Reference' +copyright = u'2010-present, OpenStack Foundation' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# The reST default role (used for this markup: `text`) to use +# for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +add_module_names = False + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'native' + +# openstackdocstheme options +openstackdocs_repo_name = 'openstack/swift' +openstackdocs_bug_project = 'swift' +openstackdocs_bug_tag = 'api-ref' + +# -- Options for man page output ---------------------------------------------- + +# Grouping the document tree for man pages. +# List of tuples 'sourcefile', 'target', u'title', u'Authors name', 'manual' + + +# -- Options for HTML output -------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. 
+# html_theme_path = ["."] +# html_theme = '_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_use_modindex = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'swiftdoc' + + +# -- Options for LaTeX output ------------------------------------------------- + +# The paper size ('letter' or 'a4'). +# latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +# latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass +# [howto/manual]). +latex_documents = [ + ('index', 'swift.tex', u'OpenStack Object Storage API Documentation', + u'OpenStack Foundation', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +# latex_preamble = '' + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_use_modindex = True diff --git a/api-ref/source/index.rst b/api-ref/source/index.rst new file mode 100644 index 0000000000..22f40d575c --- /dev/null +++ b/api-ref/source/index.rst @@ -0,0 +1,15 @@ +:tocdepth: 2 + +=================== + Object Storage API +=================== + +.. rest_expand_all:: + +.. 
include:: storage_info.inc +.. include:: storage-account-services.inc +.. include:: storage-container-services.inc +.. include:: storage-object-services.inc +.. include:: storage_endpoints.inc + + diff --git a/api-ref/source/metadata_header_encoding.inc b/api-ref/source/metadata_header_encoding.inc new file mode 100644 index 0000000000..1aee47bf83 --- /dev/null +++ b/api-ref/source/metadata_header_encoding.inc @@ -0,0 +1,6 @@ +.. note:: + + The metadata value must be UTF-8-encoded and then + URL-encoded before you include it in the header. + This is a direct violation of the HTTP/1.1 `basic rules + `_. diff --git a/api-ref/source/metadata_header_syntax.inc b/api-ref/source/metadata_header_syntax.inc new file mode 100644 index 0000000000..771f99dfcf --- /dev/null +++ b/api-ref/source/metadata_header_syntax.inc @@ -0,0 +1,7 @@ +.. note:: + + Metadata keys (the name of the metadata) must be treated as case-insensitive + at all times. These keys can contain ASCII 7-bit characters that are not + control (0-31) characters, DEL, or a separator character, according to + `HTTP/1.1 `_ . + The underscore character is silently converted to a hyphen. diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml new file mode 100644 index 0000000000..01952f58f4 --- /dev/null +++ b/api-ref/source/parameters.yaml @@ -0,0 +1,1270 @@ +# variables in header +Accept: + description: | + Instead of using the ``format`` query parameter, + set this header to ``application/json``, ``application/xml``, or + ``text/xml``. + in: header + required: false + type: string +Accept-Ranges: + description: | + The type of ranges that the object accepts. + in: header + required: true + type: string +Content-Disposition: + description: | + If set, specifies the override behavior for the + browser. For example, this header might specify that the browser + use a download program to save this file rather than show the + file, which is the default. + in: header + required: false + type: string +Content-Disposition_resp: + description: | + If present, specifies the override behavior for the + browser. For example, this header might specify that the browser + use a download program to save this file rather than show the + file, which is the default. If not set, this header is not + returned by this operation. + in: header + required: false + type: string +Content-Encoding: + description: | + If set, the value of the ``Content-Encoding`` + metadata. + in: header + required: false + type: string +Content-Encoding_resp: + description: | + If present, the value of the ``Content-Encoding`` + metadata. If not set, the operation does not return this header. + in: header + required: false + type: string +Content-Length_cud_resp: + description: | + If the operation succeeds, this value is zero + (0) or the length of informational or error + text in the response body. + in: header + required: true + type: string +Content-Length_get_resp: + description: | + The length of the object content in the response + body, in bytes. + in: header + required: true + type: string +Content-Length_listing_resp: + description: | + If the operation succeeds, the length of the response body + in bytes. On error, this is the length of the error text. + in: header + required: true + type: string +Content-Length_obj_head_resp: + description: | + HEAD operations do not return content. The + ``Content-Length`` header value is not the size of the response + body but is the size of the object, in bytes. 
+ in: header + required: true + type: string +Content-Length_put_req: + description: | + Set to the length of the object content (i.e. the length in bytes + of the request body). Do not + set if chunked transfer encoding is being used. + in: header + required: false + type: integer +Content-Type_cud_resp: + description: | + If present, this value is the MIME + type of the informational or error text in the response body. + in: header + required: false + type: string +Content-Type_listing_resp: + description: | + If the operation succeeds, this value is the MIME type of the list + response. The MIME type is determined by the listing format specified by + the request and will be one of ``text/plain``, ``application/json``, + ``application/xml``, or ``text/xml``. If the operation fails, this value is + the MIME type of the error text in the response body. + in: header + required: true + type: string +Content-Type_obj_cu_req: + description: | + Sets the MIME type for the object. + in: header + required: false + type: string +Content-Type_obj_resp: + description: | + If the operation succeeds, this value is the MIME type of the object. If + the operation fails, this value is the MIME type of the error text in the + response body. + in: header + required: true + type: string +Date: + description: | + The date and time the system responded to the request, + using the preferred format of + `RFC 7231 `_ as + shown in this example ``Thu, 16 Jun 2016 15:10:38 GMT``. The time is + always in UTC. + in: header + required: true + type: string +Destination: + description: | + The container and object name of the destination + object in the form of ``/container/object``. You must UTF-8-encode + and then URL-encode the names of the destination container and + object before you include them in this header. + in: header + required: true + type: string +Destination-Account: + description: | + Specifies the account name where the object is copied to. If not + specified, the object is copied to the account which owns the object + (i.e., the account in the path). + in: header + required: false + type: string +ETag_obj_copied: + description: | + The MD5 checksum of the copied object content. + The value is not quoted. + in: header + required: true + type: string +ETag_obj_received: + description: | + The MD5 checksum of the uploaded object content. + The value is not quoted. If it is an SLO, it would + be MD5 checksum of the segments' etags. + in: header + required: true + type: string +ETag_obj_req: + description: | + The MD5 checksum value of the request body. For + example, the MD5 checksum value of the object content. For + manifest objects, this value is the MD5 checksum of the + concatenated string of ETag values for each of the segments in + the manifest. You are strongly recommended to compute + the MD5 checksum value and include it in the request. This + enables the Object Storage API to check the integrity of the + upload. The value is not quoted. + in: header + required: false + type: string +ETag_obj_resp: + description: | + For objects smaller than 5 GB, this value is the + MD5 checksum of the object content. The value is not quoted. For + manifest objects, this value is the MD5 checksum of the + concatenated string of ETag values for each of the + segments in the manifest, and not the MD5 checksum of the content + that was downloaded. Also the value is enclosed in double-quote + characters. 
You are strongly recommended to compute the MD5 + checksum of the response body as it is received and compare this + value with the one in the ETag header. If they differ, the content + was corrupted, so retry the operation. + in: header + required: true + type: string +If-Match: + description: | + See `Request for Comments: 2616 + `_. + in: header + required: false + type: string +If-Modified-Since: + description: | + See `Request for Comments: 2616 + `_. + in: header + required: false + type: string +If-None-Match-get-request: + description: | + A client that has one or more entities previously + obtained from the resource can verify that none of those entities is + current by including a list of their associated entity tags in the + ``If-None-Match header`` field. + See `Request for Comments: 2616 `_ + for details. + in: header + required: false + type: string +If-None-Match-put-request: + description: | + In combination with ``Expect: 100-Continue``, + specify an ``"If-None-Match: *"`` header to query whether the + server already has a copy of the object before any data is sent. + in: header + required: false + type: string +If-Unmodified-Since: + description: | + See `Request for Comments: 2616 + `_. + in: header + required: false + type: string +Last-Modified: + description: | + The date and time when the object was created or its metadata was + changed. The date and time is formatted as shown in this + example: ``Fri, 12 Aug 2016 14:24:16 GMT`` + + The time is always in UTC. + in: header + required: true + type: string +Range: + description: | + The ranges of content to get. You can use the + ``Range`` header to get portions of data by using one or more + range specifications. To specify many ranges, separate the range + specifications with a comma. The types of range specifications + are: - **Byte range specification**. Use FIRST_BYTE_OFFSET to + specify the start of the data range, and LAST_BYTE_OFFSET to + specify the end. You can omit the LAST_BYTE_OFFSET and if you + do, the value defaults to the offset of the last byte of data. + - **Suffix byte range specification**. Use LENGTH bytes to specify + the length of the data range. The following forms of the header + specify the following ranges of data: + + - ``Range: bytes=-5``. The last five bytes. + - ``Range: bytes=10-15``. The six bytes of data after a 10-byte offset. + - ``Range: bytes=10-15,-5``. A multi-part response that contains the + last five bytes and the six + bytes of data after a 10-byte offset. The ``Content-Type`` + response header contains ``multipart/byteranges``. + - ``Range: bytes=4-6``. Bytes 4 to 6 inclusive. + - ``Range: bytes=2-2``. Byte 2, the third byte of the data. + - ``Range: bytes=6-``. Byte 6 and after. + - ``Range: bytes=1-3,2-5``. A multi-part response that + contains bytes 1 to 3 inclusive, and bytes 2 to 5 inclusive. The + ``Content-Type`` response header contains + ``multipart/byteranges``. + in: header + required: false + type: string +Transfer-Encoding: + description: | + Set to ``chunked`` to enable chunked transfer + encoding. If used, do not set the ``Content-Length`` header to a + non-zero value. + in: header + required: false + type: string +X-Account-Access-Control_req: + description: | + **Note**: `X-Account-Access-Control` is not supported by Keystone auth. + + Sets an account access control list (ACL) that grants access to + containers and objects in the account. + See `Account ACLs + `_ + for more information. 
+ in: header + required: false + type: string +X-Account-Access-Control_resp: + description: | + **Note**: `X-Account-Access-Control` is not supported by Keystone auth. + + The account access control list (ACL) that grants access to + containers and objects in the account. + If there is no ACL, this header is not returned by this operation. + See `Account ACLs + `_ + for more information. + in: header + required: false + type: string +X-Account-Bytes-Used: + description: | + The total number of bytes that are stored in + Object Storage for the account. + in: header + required: true + type: integer +X-Account-Container-Count: + description: | + The number of containers. + in: header + required: true + type: integer +X-Account-Meta-name: + description: | + The custom account metadata item, where + ``name`` is the name of the metadata item. One ``X-Account-Meta-name`` + response header appears for each metadata item (for + each ``name``). + in: header + required: false + type: string +X-Account-Meta-name_req: + description: | + The account metadata. The ``name`` is the name + of metadata item that you want to add, update, or delete. To + delete this item, send an empty value in this header. You must + specify an ``X-Account-Meta-name`` header for each metadata + item (for each ``name``) that you want to add, update, or + delete. + in: header + required: false + type: string +X-Account-Meta-Quota-Bytes_resp: + description: | + If present, this is the limit on the total size in bytes of objects stored + in the account. + Typically this value is set by an administrator. + in: header + required: false + type: string +X-Account-Meta-Temp-URL-Key-2_req: + description: | + A second secret key value for temporary URLs. + The second key enables you to rotate keys by having + two active keys at the same time. + in: header + required: false + type: string +X-Account-Meta-Temp-URL-Key-2_resp: + description: | + The second secret key value for temporary URLs. If + not set, this header is not returned in the response. + in: header + required: false + type: string +X-Account-Meta-Temp-URL-Key_req: + description: | + The secret key value for temporary URLs. + in: header + required: false + type: string +X-Account-Meta-Temp-URL-Key_resp: + description: | + The secret key value for temporary URLs. If not + set, this header is not returned in the response. + in: header + required: false + type: string +X-Account-Object-Count: + description: | + The number of objects in the account. + in: header + required: true + type: integer +X-Account-Storage-Policy-name-Bytes-Used: + description: | + The total number of bytes that are stored in + in a given storage policy, where ``name`` is the + name of the storage policy. + in: header + required: true + type: integer +X-Account-Storage-Policy-name-Container-Count: + description: | + The number of containers in the account that use the given + storage policy where ``name`` is the name of the storage policy. + in: header + required: true + type: integer +X-Account-Storage-Policy-name-Object-Count: + description: | + The number of objects in given storage policy where ``name`` is + the name of the storage policy. + in: header + required: true + type: integer +X-Auth-Token: + description: | + Authentication token. If you omit this header, + your request fails unless the account owner has granted you access + through an access control list (ACL). + in: header + required: false + type: string +X-Container-Bytes-Used: + description: | + The total number of bytes used. 
+ in: header + required: true + type: integer +X-Container-Meta-Access-Control-Allow-Origin: + description: | + Originating URLs allowed to make cross-origin + requests (CORS), separated by spaces. This heading applies to the + container only, and all objects within the container with this + header applied are CORS-enabled for the allowed origin URLs. A + browser (user-agent) typically issues a `preflighted request + `_ , which is an OPTIONS call + that verifies the origin is allowed to make the request. The + Object Storage service returns 200 if the originating URL is + listed in this header parameter, and issues a 401 if the + originating URL is not allowed to make a cross-origin request. + Once a 200 is returned, the browser makes a second request to the + Object Storage service to retrieve the CORS-enabled object. + in: header + required: false + type: string +X-Container-Meta-Access-Control-Expose-Headers: + description: | + Headers the Object Storage service exposes to the + browser (technically, through the ``user-agent`` setting), in the + request response, separated by spaces. By default the Object + Storage service returns the following headers: + + - All "simple response headers" as listed on + `http://www.w3.org/TR/cors/#simple-response-header + `_. + - The headers ``etag``, ``x-timestamp``, ``x-trans-id``, + ``x-openstack-request-id``. + - All metadata headers (``X-Container-Meta-*`` for containers and + ``X-Object-Meta-*`` for objects). + - headers listed in ``X-Container-Meta-Access-Control-Expose-Headers``. + in: header + required: false + type: string +X-Container-Meta-Access-Control-Max-Age: + description: | + Maximum time for the origin to hold the preflight + results. A browser may make an OPTIONS call to verify the origin + is allowed to make the request. Set the value to an integer number + of seconds after the time that the request was received. + in: header + required: false + type: string +X-Container-Meta-name: + description: | + The custom container metadata item, where + ``name`` is the name of the metadata item. One ``X-Container-Meta-name`` + response header appears for each metadata item (for + each ``name``). + in: header + required: true + type: string +X-Container-Meta-name_req: + description: | + The container metadata, where ``name`` is the + name of metadata item. You must specify an ``X-Container-Meta-name`` + header for each metadata item (for each ``name``) that + you want to add or update. + in: header + required: false + type: string +X-Container-Meta-Quota-Bytes: + description: | + Sets maximum size of the container, in bytes. + Typically these values are set by an administrator. Returns a 413 + response (request entity too large) when an object PUT operation + exceeds this quota value. + This value does not take effect immediately. see + `Container Quotas + `_ + for more information. + in: header + required: false + type: string +X-Container-Meta-Quota-Bytes_resp: + description: | + The maximum size of the container, in bytes. If not set, this header is not + returned by this operation. + in: header + required: false + type: string +X-Container-Meta-Quota-Count: + description: | + Sets maximum object count of the container. + Typically these values are set by an administrator. Returns a 413 + response (request entity too large) when an object PUT operation + exceeds this quota value. + This value does not take effect immediately. see + `Container Quotas + `_ + for more information. 
+ in: header + required: false + type: string +X-Container-Meta-Quota-Count_resp: + description: | + The maximum object count of the container. If not set, this header is not + returned by this operation. + in: header + required: false + type: string +X-Container-Meta-Temp-URL-Key-2_req: + description: | + A second secret key value for temporary URLs. + The second key enables you to rotate keys by having + two active keys at the same time. + in: header + required: false + type: string +X-Container-Meta-Temp-URL-Key-2_resp: + description: | + The second secret key value for temporary URLs. If + not set, this header is not returned in the response. + in: header + required: false + type: string +X-Container-Meta-Temp-URL-Key_req: + description: | + The secret key value for temporary URLs. + in: header + required: false + type: string +X-Container-Meta-Temp-URL-Key_resp: + description: | + The secret key value for temporary URLs. If not + set, this header is not returned in the response. + in: header + required: false + type: string +X-Container-Meta-Web-Directory-Type: + description: | + Sets the content-type of directory marker + objects. If the header is not set, default is + ``application/directory``. Directory marker objects are 0-byte + objects that represent directories to create a simulated + hierarchical structure. For example, if you set ``"X-Container- + Meta-Web-Directory-Type: text/directory"``, Object Storage treats + 0-byte objects with a content-type of ``text/directory`` as + directories rather than objects. + in: header + required: false + type: string +X-Container-Object-Count: + description: | + The number of objects. + in: header + required: true + type: integer +X-Container-Read: + description: | + Sets a container access control list (ACL) that grants read access. + The scope of the access is specific to the container. The ACL grants + the ability to perform GET or HEAD operations on objects in the container + or to perform a GET or HEAD operation on the container itself. + + The format and scope of the ACL is dependent on the authorization system + used by the Object Storage service. See `Container ACLs + `_ + for more information. + in: header + required: false + type: string +X-Container-Read_resp: + description: | + The ACL that grants read access. If there is no ACL, this + header is not returned by this operation. + See `Container ACLs + `_ + for more information. + in: header + required: false + type: string +X-Container-Sync-Key: + description: | + Sets the secret key for container + synchronization. If you remove the secret key, synchronization is + halted. + For more information, see `Container to Container Synchronization + `_ + in: header + required: false + type: string +X-Container-Sync-Key_resp: + description: | + The secret key for container synchronization. If + not set, this header is not returned by this operation. + in: header + required: false + type: string +X-Container-Sync-To: + description: | + Sets the destination for container + synchronization. Used with the secret key indicated in the ``X + -Container-Sync-Key`` header. If you want to stop a container from + synchronizing, send a blank value for the ``X-Container-Sync-Key`` + header. + in: header + required: false + type: string +X-Container-Sync-To_resp: + description: | + The destination for container synchronization. If + not set, this header is not returned by this operation. 
+ in: header + required: false + type: string +X-Container-Write: + description: | + Sets a container access control list (ACL) that grants write access. + The scope of the access is specific to the container. The ACL grants + the ability to perform PUT, POST and DELETE operations on + objects in the container. It does not grant write access to the container + metadata. + + The format of the ACL is dependent on the authorization system + used by the Object Storage service. See `Container ACLs + `_ + for more information. + + in: header + required: false + type: string +X-Container-Write_resp: + description: + The ACL that grants write access. If there is no ACL, + this header is not returned by this operation. + See `Container ACLs + `_ + for more information. + in: header + required: false + type: string +X-Copied-From: + description: | + For a copied object, shows the container and + object name from which the new object was copied. The value is in + the ``{container}/{object}`` format. + in: header + required: false + type: string +X-Copied-From-Account: + description: | + For a copied object, shows the account + from which the new object was copied. + in: header + required: false + type: string +X-Copied-From-Last-Modified: + description: | + For a copied object, the date and time in `UNIX + Epoch time stamp format + `_ when the container and + object name from which the new object was copied was last + modified. For example, ``1440619048`` is equivalent to ``Mon, + Wed, 26 Aug 2015 19:57:28 GMT``. + in: header + required: false + type: integer +X-Copy-From: + description: | + If set, this is the name of an object used to + create the new object by copying the ``X-Copy-From`` object. The + value is in form ``{container}/{object}``. You must UTF-8-encode + and then URL-encode the names of the container and object before + you include them in the header. Using PUT with ``X-Copy-From`` + has the same effect as using the COPY operation to copy an object. + Using ``Range`` header with ``X-Copy-From`` will create a new + partial copied object with bytes set by ``Range``. + in: header + required: false + type: string +X-Copy-From-Account: + description: | + Specifies the account name where the object is copied from. If not + specified, the object is copied from the account which owns the new + object (i.e., the account in the path). + in: header + required: false + type: string +X-Delete-After: + description: | + The number of seconds after which the system removes the object. The value + should be a positive integer. Internally, the Object Storage system uses + this value to generate an ``X-Delete-At`` metadata item. If both + ``X-Delete-After`` and ``X-Delete-At`` are set then ``X-Delete-After`` + takes precedence. + in: header + required: false + type: integer +X-Delete-At: + description: | + The date and time in `UNIX Epoch time stamp format + `_ when the system removes the + object. For example, ``1440619048`` is equivalent to ``Mon, Wed, 26 Aug + 2015 19:57:28 GMT``. The value should be a positive integer corresponding + to a time in the future. If both ``X-Delete-After`` and ``X-Delete-At`` are + set then ``X-Delete-After`` takes precedence. + in: header + required: false + type: integer +X-Delete-At_resp: + description: | + If present, specifies date and time in `UNIX Epoch time stamp format + `_ when the system removes the + object. For example, ``1440619048`` is equivalent to ``Mon, Wed, 26 Aug + 2015 19:57:28 GMT``. 
+ in: header + required: false + type: integer +X-Detect-Content-Type: + description: | + If set to ``true``, Object Storage guesses the + content type based on the file extension and ignores the value + sent in the ``Content-Type`` header, if present. + in: header + required: false + type: boolean +X-Fresh-Metadata: + description: | + Enables object creation that omits existing user + metadata. If set to ``true``, the COPY request creates an object + without existing user metadata. Default value is ``false``. + in: header + required: false + type: boolean +X-History-Location: + description: | + The URL-encoded UTF-8 representation of the container that stores + previous versions of objects. If neither this nor ``X-Versions-Location`` + is set, versioning is disabled for this container. ``X-History-Location`` + and ``X-Versions-Location`` cannot both be set at the same time. For more + information about object versioning, see `Object versioning + `_. + in: header + required: false + type: string +X-History-Location_resp: + description: | + If present, this container has versioning enabled and the value + is the UTF-8 encoded name of another container. For more information + about object versioning, see `Object versioning + `_. + in: header + required: false + type: string +X-Newest: + description: | + If set to true , Object Storage queries all + replicas to return the most recent one. If you omit this header, + Object Storage responds faster after it finds one valid replica. + Because setting this header to true is more expensive for the back + end, use it only when it is absolutely needed. + in: header + required: false + type: boolean +X-Object-Manifest: + description: | + Set to specify that this is a dynamic large + object manifest object. The value is the container and object name + prefix of the segment objects in the form ``container/prefix``. + You must UTF-8-encode and then URL-encode the names of the + container and prefix before you include them in this header. + in: header + required: false + type: string +X-Object-Manifest_resp: + description: | + If present, this is a dynamic large object + manifest object. The value is the container and object name prefix + of the segment objects in the form ``container/prefix``. + in: header + required: false + type: string +X-Object-Meta-name: + description: | + The object metadata, where ``name`` is the name + of the metadata item. You must specify an + ``X-Object-Meta-name`` header for each metadata ``name`` item that + you want to add or update. + in: header + required: false + type: string +X-Object-Meta-name_resp: + description: | + If present, the custom object metadata item, where ``name`` + is the name of the metadata item. One``X-Object-Meta-name`` + response header appears for each metadata ``name`` item. + in: header + required: false + type: string +X-Openstack-Request-Id: + description: | + A unique transaction ID for this request. Your + service provider might need this value if you report a problem. + (same as ``X-Trans-Id``) + in: header + required: true + type: string +X-Remove-Account-name: + description: | + Removes the metadata item named ``name``. + For example, ``X-Remove-Account-Meta-Blue`` removes + custom metadata. + in: header + required: false + type: string +X-Remove-Container-name: + description: | + Removes the metadata item named ``name``. For + example, ``X-Remove-Container-Read`` removes the + ``X-Container-Read`` metadata item and ``X-Remove-Container-Meta-Blue`` + removes custom metadata. 
+ in: header + required: false + type: string +X-Remove-History-Location: + description: | + Set to any value to disable versioning. Note that this disables version + that was set via ``X-Versions-Location`` as well. + in: header + required: false + type: string +X-Remove-Versions-Location: + description: | + Set to any value to disable versioning. Note that this disables version + that was set via ``X-History-Location`` as well. + in: header + required: false + type: string +X-Service-Token: + description: | + A service token. See `OpenStack Service Using Composite Tokens + `_ for more information. + in: header + required: false + type: string +X-Static-Large-Object: + description: | + Set to ``true`` if this object is a static large + object manifest object. + in: header + required: true + type: boolean +X-Storage-Policy: + description: | + In requests, specifies the name of the storage policy to use for + the container. In responses, is the storage policy name. + The storage policy of the container cannot be changed. + in: header + required: false + type: string +X-Symlink-Target: + description: | + Set to specify that this is a symlink object. + The value is the relative path of the target object in the + format /. The target object does not need to + exist at the time of symlink creation. + You must UTF-8-encode and then URL-encode the names of the + container and object before you include them in this header. + in: header + required: false + type: string +X-Symlink-Target-Account: + description: | + Set to specify that this is a cross-account symlink to + an object in the account specified in the value. + The ``X-Symlink-Target`` must also be set for this to + be effective. + You must UTF-8-encode and then URL-encode the account name + before you include it in this header. + in: header + required: false + type: string +X-Symlink-Target-Account_resp: + description: | + If present, and ``X-Symlink-Target`` is present, then + this is a cross-account symlink to + an object in the account specified in the value. + in: header + required: false + type: string +X-Symlink-Target_resp: + description: | + If present, this is a symlink object. + The value is the relative path of the target object in the + format /. + in: header + required: false + type: string +X-Timestamp: + description: | + The date and time in `UNIX Epoch time stamp + format `_ when the + account, container, or object was initially created as a current + version. For example, ``1440619048`` is equivalent to ``Mon, Wed, + 26 Aug 2015 19:57:28 GMT``. + in: header + required: true + type: integer +X-Trans-Id: + description: | + A unique transaction ID for this request. Your + service provider might need this value if you report a problem. + in: header + required: true + type: string +X-Trans-Id-Extra: + description: | + Extra transaction information. Use the ``X-Trans-Id-Extra`` + request header to include extra information to help you + debug any errors that might occur with large object upload and + other Object Storage transactions. The server appends the + first 32 characters of the ``X-Trans-Id-Extra`` request header + value to the transaction ID value in the generated ``X-Trans-Id`` + response header. You must UTF-8-encode and then URL-encode the + extra transaction information before you include it in the + ``X-Trans-Id-Extra`` request header. For example, you can include + extra transaction information when you upload `large objects + `_ + such as images. 
When + you upload each segment and the manifest, include the same value + in the ``X-Trans-Id-Extra`` request header. If an error occurs, + you can find all requests that are related to the large object + upload in the Object Storage logs. You can also use ``X-Trans-Id-Extra`` + strings to help operators debug requests that fail to + receive responses. The operator can search for the extra + information in the logs. + in: header + required: false + type: string +X-Versions-Location: + description: | + The URL-encoded UTF-8 representation of the container that stores + previous versions of objects. If neither this nor ``X-History-Location`` + is set, versioning is disabled for this container. ``X-Versions-Location`` + and ``X-History-Location`` cannot both be set at the same time. For more + information about object versioning, see `Object versioning + `_. + in: header + required: false + type: string +X-Versions-Location_resp: + description: | + If present, this container has versioning enabled and the value + is the UTF-8 encoded name of another container. For more information + about object versioning, see `Object versioning + `_. + in: header + required: false + type: string + +# variables in path +account: + description: | + The unique name for the account. An account is + also known as the project or tenant. + in: path + required: false + type: string +container: + description: | + The unique (within an account) name for the container. The container + name must be from 1 to 256 characters long and can start with any + character and contain any pattern. Character set must be UTF-8. + The container name cannot contain a slash (``/``) character + because this character delimits the container and object name. For + example, the path ``/v1/account/www/pages`` specifies the ``www`` + container, not the ``www/pages`` container. + in: path + required: false + type: string +object: + description: | + The unique name for the object. + in: path + required: false + type: string + +# variables in query +bulk-delete: + description: | + When the ``bulk-delete`` query parameter is present in the POST + request, multiple objects or containers can be deleted + with a single request. See `Bulk Delete + `_ + for how this feature is used. + in: query + required: false + type: string +delimiter: + description: | + The delimiter is a single character used to split object + names to present a pseudo-directory hierarchy of objects. When combined + with a ``prefix`` query, this enables API users to simulate and + traverse the objects in a container as if they were in a directory tree. + in: query + required: false + type: string +end_marker: + description: | + For a string value, `x` , constrains the list to items whose names + are less than `x`. + in: query + required: false + type: string +extract-archive: + description: | + When the ``extract-archive`` query parameter is present in the POST + request, an archive (tar file) is uploaded and extracted to + create multiple objects. See `Extract Archive + `_ + for how this feature is used. + in: query + required: false + type: string +filename: + description: | + Overrides the default file name. Object Storage + generates a default file name for GET temporary URLs that is based + on the object name. Object Storage returns this value in the + ``Content-Disposition`` response header. Browsers can interpret + this file name value as a file attachment to save. For more + information about temporary URLs, see `Temporary URL middleware + `_. 
+ in: query + required: false + type: string +format: + description: | + The response format. Valid values are ``json``, + ``xml``, or ``plain``. The default is ``plain``. If you append + the ``format=xml`` or ``format=json`` query parameter to the + storage account URL, the response shows extended container + information serialized in that format. If you append the + ``format=plain`` query parameter, the response lists the container + names separated by newlines. + in: query + required: false + type: string +limit: + description: | + For an integer value n , limits the number of + results to n . + in: query + required: false + type: integer +marker: + description: | + For a string value, `x` , constrains the list to items whose names + are greater than `x`. + in: query + required: false + type: string +multipart-manifest_copy: + description: | + If you include the ``multipart-manifest=get`` + query parameter and the object is a large object, the object + contents are not copied. Instead, the manifest is copied to + the new object. + in: query + required: false + type: string +multipart-manifest_delete: + description: | + If you include the ``multipart-manifest=delete`` + query parameter and the object is a static large object, the + segment objects and manifest object are deleted. If you omit the + ``multipart-manifest=delete`` query parameter and the object is a + static large object, the manifest object is deleted but the + segment objects are not deleted. The response body will contain + the status of the deletion of every processed segment object. + in: query + required: false + type: string +multipart-manifest_get: + description: | + If you include the ``multipart-manifest=get`` + query parameter and the object is a large object, the object + contents are not returned. Instead, the manifest is returned in + the ``X-Object-Manifest`` response header for dynamic large + objects or in the response body for static large objects. + in: query + required: false + type: string +multipart-manifest_head: + description: | + If you include the ``multipart-manifest=get`` query parameter and the + object is a large object, the object metadata is not returned. Instead, the + response headers will include the manifest metadata and for dynamic large + objects the ``X-Object-Manifest`` response header. + in: query + required: false + type: string +multipart-manifest_put: + description: | + If you include the ``multipart-manifest=put`` query parameter, the object + is a static large object manifest and the body contains the manifest. + See `Static large objects `_ for more information. + in: query + required: false + type: string +path: + description: | + For a string value, returns the object names that + are nested in the pseudo path. Please use ``prefix``/``delimiter`` + queries instead of using this ``path`` query. + in: query + required: false + type: string +prefix: + description: | + Only objects with this prefix will be returned. When combined with a + ``delimiter`` query, this enables API users to simulate and + traverse the objects in a container as if they were in a directory tree. + in: query + required: false + type: string +reverse: + description: | + By default, listings are returned sorted by name, ascending. If you include + the ``reverse=true`` query parameter, the listing will be returned sorted + by name, descending. + in: query + required: false + type: boolean +swiftinfo_expires: + description: | + The time at which ``swiftinfo_sig`` expires. 
The time is in + `UNIX Epoch time stamp format + `_. + in: query + required: false + type: integer +swiftinfo_sig: + description: | + A hash-based message authentication code (HMAC) + that enables access to administrator-only information. To use this + parameter, the ``swiftinfo_expires`` parameter is also required. + in: query + required: false + type: string +symlink: + description: | + If you include the ``symlink=get`` query parameter + and the object is a symlink, then the response will include + data and metadata from the symlink itself rather than from the target. + in: query + required: false + type: string +symlink_copy: + description: | + If you include the ``symlink=get`` query parameter + and the object is a symlink, the target object + contents are not copied. Instead, the symlink is copied to + create a new symlink to the same target. + in: query + required: false + type: string +temp_url_expires: + description: | + The date and time in `UNIX Epoch time stamp + format `_ or + `ISO 8601 UTC timestamp `_ + when the signature for temporary URLs expires. + For example, ``1440619048`` or ``2015-08-26T19:57:28Z`` + is equivalent to ``Mon, Wed, 26 Aug 2015 19:57:28 GMT``. For more + information about temporary URLs, see `Temporary URL middleware + `_. + in: query + required: true + type: integer +temp_url_sig: + description: | + Used with temporary URLs to sign the request with + an HMAC-SHA1 cryptographic signature that defines the allowed HTTP + method, expiration date, full path to the object, and the secret + key for the temporary URL. For more information about temporary + URLs, see `Temporary URL middleware + `_. + in: query + required: true + type: string + +# variables in body +bytes_in_account_get: + description: | + The total number of bytes that are stored in + Object Storage for the account. + in: body + required: true + type: integer +bytes_in_container_get: + description: | + The total number of bytes that are stored in + Object Storage for the container. + in: body + required: true + type: integer +content_type: + description: | + The content type of the object. + in: body + required: true + type: string +count: + description: | + The number of objects in the container. + in: body + required: true + type: integer +hash: + description: | + The MD5 checksum value of the object content. + in: body + required: true + type: string +last_modified: + description: | + The date and time when the object was last modified. + + The date and time stamp format is `ISO 8601 + `_: + + :: + + CCYY-MM-DDThh:mm:ss±hh:mm + + For example, ``2015-08-27T09:49:58-05:00``. + + The ``±hh:mm`` value, if included, is the time zone as an offset + from UTC. In the previous example, the offset value is ``-05:00``. + in: body + required: true + type: string +name_in_account_get: + description: | + The name of the container. + in: body + required: true + type: string +name_in_container_get: + description: | + The name of the object. + in: body + required: true + type: string +symlink_path: + description: | + This field exists only when the object is symlink. + This is the target path of the symlink object. 
+ in: body + required: true + type: string + + diff --git a/api-ref/source/samples/account-containers-list-http-request-json.txt b/api-ref/source/samples/account-containers-list-http-request-json.txt new file mode 100644 index 0000000000..137ee93fbd --- /dev/null +++ b/api-ref/source/samples/account-containers-list-http-request-json.txt @@ -0,0 +1 @@ +curl -i $publicURL?format=json -X GET -H "X-Auth-Token: $token" diff --git a/api-ref/source/samples/account-containers-list-http-request-xml.txt b/api-ref/source/samples/account-containers-list-http-request-xml.txt new file mode 100644 index 0000000000..6f9293fdca --- /dev/null +++ b/api-ref/source/samples/account-containers-list-http-request-xml.txt @@ -0,0 +1 @@ +curl -i $publicURL?format=xml -X GET -H "X-Auth-Token: $token" diff --git a/api-ref/source/samples/account-containers-list-http-response-json.txt b/api-ref/source/samples/account-containers-list-http-response-json.txt new file mode 100644 index 0000000000..6c86e00ca5 --- /dev/null +++ b/api-ref/source/samples/account-containers-list-http-response-json.txt @@ -0,0 +1,12 @@ +HTTP/1.1 200 OK +Content-Length: 96 +X-Account-Object-Count: 1 +X-Timestamp: 1389453423.35964 +X-Account-Meta-Subject: Literature +X-Account-Bytes-Used: 14 +X-Account-Container-Count: 2 +Content-Type: application/json; charset=utf-8 +Accept-Ranges: bytes +X-Trans-Id: tx274a77a8975c4a66aeb24-0052d95365 +X-Openstack-Request-Id: tx274a77a8975c4a66aeb24-0052d95365 +Date: Fri, 17 Jan 2014 15:59:33 GMT diff --git a/api-ref/source/samples/account-containers-list-http-response-xml.txt b/api-ref/source/samples/account-containers-list-http-response-xml.txt new file mode 100644 index 0000000000..c477638567 --- /dev/null +++ b/api-ref/source/samples/account-containers-list-http-response-xml.txt @@ -0,0 +1,12 @@ +HTTP/1.1 200 OK +Content-Length: 262 +X-Account-Object-Count: 1 +X-Timestamp: 1389453423.35964 +X-Account-Meta-Subject: Literature +X-Account-Bytes-Used: 14 +X-Account-Container-Count: 2 +Content-Type: application/xml; charset=utf-8 +Accept-Ranges: bytes +X-Trans-Id: tx69f60bc9f7634a01988e6-0052d9544b +X-Openstack-Request-Id: tx69f60bc9f7634a01988e6-0052d9544b +Date: Fri, 17 Jan 2014 16:03:23 GMT diff --git a/api-ref/source/samples/account-containers-list-response.json b/api-ref/source/samples/account-containers-list-response.json new file mode 100644 index 0000000000..d9864aa246 --- /dev/null +++ b/api-ref/source/samples/account-containers-list-response.json @@ -0,0 +1,14 @@ +[ + { + "count": 0, + "bytes": 0, + "name": "janeausten", + "last_modified": "2013-11-19T20:08:13.283452" + }, + { + "count": 1, + "bytes": 14, + "name": "marktwain", + "last_modified": "2016-04-29T16:23:50.460230" + } +] diff --git a/api-ref/source/samples/account-containers-list-response.xml b/api-ref/source/samples/account-containers-list-response.xml new file mode 100644 index 0000000000..6e194aebce --- /dev/null +++ b/api-ref/source/samples/account-containers-list-response.xml @@ -0,0 +1,15 @@ + + + + janeausten + 0 + 0 + 2013-11-19T20:08:13.283452 + + + marktwain + 1 + 14 + 2016-04-29T16:23:50.460230 + + diff --git a/api-ref/source/samples/capabilities-list-response.json b/api-ref/source/samples/capabilities-list-response.json new file mode 100644 index 0000000000..f082dc7b9e --- /dev/null +++ b/api-ref/source/samples/capabilities-list-response.json @@ -0,0 +1,12 @@ +{ + "swift": { + "version": "1.11.0" + }, + "slo": { + "max_manifest_segments": 1000, + "max_manifest_size": 2097152, + "min_segment_size": 1 + }, + "staticweb": {}, + 
"tempurl": {} +} diff --git a/api-ref/source/samples/containers-list-http-request.txt b/api-ref/source/samples/containers-list-http-request.txt new file mode 100644 index 0000000000..4101ce80e5 --- /dev/null +++ b/api-ref/source/samples/containers-list-http-request.txt @@ -0,0 +1,3 @@ +GET /{api_version}/{account} HTTP/1.1 +Host: storage.swiftdrive.com +X-Auth-Token: eaaafd18-0fed-4b3a-81b4-663c99ec1cbb \ No newline at end of file diff --git a/api-ref/source/samples/containers-list-http-response.txt b/api-ref/source/samples/containers-list-http-response.txt new file mode 100644 index 0000000000..43070e5235 --- /dev/null +++ b/api-ref/source/samples/containers-list-http-response.txt @@ -0,0 +1,9 @@ +HTTP/1.1 200 Ok +Date: Thu, 07 Jun 2010 18:57:07 GMT +Content-Type: text/plain; charset=UTF-8 +Content-Length: 32 + +images +movies +documents +backups \ No newline at end of file diff --git a/api-ref/source/samples/endpoints-list-response-headers.json b/api-ref/source/samples/endpoints-list-response-headers.json new file mode 100644 index 0000000000..3a0d930603 --- /dev/null +++ b/api-ref/source/samples/endpoints-list-response-headers.json @@ -0,0 +1,14 @@ +{ + "endpoints": [ + "http://storage01.swiftdrive.com:6208/d8/583/AUTH_dev/EC_cont1/obj", + "http://storage02.swiftdrive.com:6208/d2/583/AUTH_dev/EC_cont1/obj", + "http://storage02.swiftdrive.com:6206/d3/583/AUTH_dev/EC_cont1/obj", + "http://storage02.swiftdrive.com:6208/d5/583/AUTH_dev/EC_cont1/obj", + "http://storage01.swiftdrive.com:6207/d7/583/AUTH_dev/EC_cont1/obj", + "http://storage02.swiftdrive.com:6207/d4/583/AUTH_dev/EC_cont1/obj", + "http://storage01.swiftdrive.com:6206/d6/583/AUTH_dev/EC_cont1/obj" + ], + "headers": { + "X-Backend-Storage-Policy-Index": "2" + } +} diff --git a/api-ref/source/samples/endpoints-list-response.json b/api-ref/source/samples/endpoints-list-response.json new file mode 100644 index 0000000000..1f0f9cf93f --- /dev/null +++ b/api-ref/source/samples/endpoints-list-response.json @@ -0,0 +1,8 @@ +{ + "endpoints": [ + "http://storage02.swiftdrive:6202/d2/617/AUTH_dev", + "http://storage01.swiftdrive:6202/d8/617/AUTH_dev", + "http://storage01.swiftdrive:6202/d11/617/AUTH_dev" + ], + "headers": {} +} diff --git a/api-ref/source/samples/goodbyeworld.txt b/api-ref/source/samples/goodbyeworld.txt new file mode 100644 index 0000000000..aebc9c0c05 --- /dev/null +++ b/api-ref/source/samples/goodbyeworld.txt @@ -0,0 +1 @@ +Goodbye World! \ No newline at end of file diff --git a/api-ref/source/samples/helloworld.txt b/api-ref/source/samples/helloworld.txt new file mode 100644 index 0000000000..6900abf34d --- /dev/null +++ b/api-ref/source/samples/helloworld.txt @@ -0,0 +1 @@ +Hello World Again! 
\ No newline at end of file diff --git a/api-ref/source/samples/objects-list-http-response-json.txt b/api-ref/source/samples/objects-list-http-response-json.txt new file mode 100644 index 0000000000..aa0f6b4297 --- /dev/null +++ b/api-ref/source/samples/objects-list-http-response-json.txt @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Content-Length: 341 +X-Container-Object-Count: 2 +Accept-Ranges: bytes +X-Container-Meta-Book: TomSawyer +X-Timestamp: 1389727543.65372 +X-Container-Bytes-Used: 26 +Content-Type: application/json; charset=utf-8 +X-Trans-Id: tx26377fe5fab74869825d1-0052d6bdff +X-Openstack-Request-Id: tx26377fe5fab74869825d1-0052d6bdff +Date: Wed, 15 Jan 2014 16:57:35 GMT diff --git a/api-ref/source/samples/objects-list-http-response-xml.txt b/api-ref/source/samples/objects-list-http-response-xml.txt new file mode 100644 index 0000000000..b9804cb583 --- /dev/null +++ b/api-ref/source/samples/objects-list-http-response-xml.txt @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Content-Length: 500 +X-Container-Object-Count: 2 +Accept-Ranges: bytes +X-Container-Meta-Book: TomSawyer +X-Timestamp: 1389727543.65372 +X-Container-Bytes-Used: 26 +Content-Type: application/xml; charset=utf-8 +X-Trans-Id: txc75ea9a6e66f47d79e0c5-0052d6be76 +X-Openstack-Request-Id: txc75ea9a6e66f47d79e0c5-0052d6be76 +Date: Wed, 15 Jan 2014 16:59:35 GMT diff --git a/api-ref/source/samples/objects-list-response.json b/api-ref/source/samples/objects-list-response.json new file mode 100644 index 0000000000..b104d3a9a6 --- /dev/null +++ b/api-ref/source/samples/objects-list-response.json @@ -0,0 +1,16 @@ +[ + { + "hash": "451e372e48e0f6b1114fa0724aa79fa1", + "last_modified": "2014-01-15T16:41:49.390270", + "bytes": 14, + "name": "goodbye", + "content_type": "application/octet-stream" + }, + { + "hash": "ed076287532e86365e841e92bfc50d8c", + "last_modified": "2014-01-15T16:37:43.427570", + "bytes": 12, + "name": "helloworld", + "content_type": "application/octet-stream" + } +] diff --git a/api-ref/source/samples/objects-list-response.xml b/api-ref/source/samples/objects-list-response.xml new file mode 100644 index 0000000000..07fda614c1 --- /dev/null +++ b/api-ref/source/samples/objects-list-response.xml @@ -0,0 +1,17 @@ + + + + goodbye + 451e372e48e0f6b1114fa0724aa79fa1 + 14 + application/octet-stream + 2014-01-15T16:41:49.390270 + + + helloworld + ed076287532e86365e841e92bfc50d8c + 12 + application/octet-stream + 2014-01-15T16:37:43.427570 + + diff --git a/api-ref/source/storage-account-services.inc b/api-ref/source/storage-account-services.inc new file mode 100644 index 0000000000..2bcab9ab94 --- /dev/null +++ b/api-ref/source/storage-account-services.inc @@ -0,0 +1,430 @@ +.. -*- rst -*- + +======== +Accounts +======== + +Lists containers for an account. Creates, updates, shows, and +deletes account metadata. For more information and concepts about +accounts see `Object Storage API overview +`_. + + + +Show account details and list containers +======================================== + +.. rest_method:: GET /v1/{account} + +Shows details for an account and lists containers, sorted by name, in the account. + +The sort order for the name is based on a binary comparison, a +single built-in collating sequence that compares string data by +using the SQLite memcmp() function, regardless of text encoding. +See `Collating Sequences +`_. + +The response body returns a list of containers. The default +response (``text/plain``) returns one container per line. 
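As an illustration only (it is not part of the patched API reference files), the following minimal sketch shows how a client might fetch the listing in JSON and page through it with the ``marker`` and ``limit`` query parameters, applying the end-of-list rule described in the next paragraph. It assumes the Python ``requests`` library; ``STORAGE_URL`` and ``TOKEN`` are hypothetical placeholders for the account's storage URL and a valid auth token, not values defined by the API.

::

    import requests

    STORAGE_URL = "http://127.0.0.1:8080/v1/AUTH_test"  # hypothetical account storage URL
    TOKEN = "AUTH_tk_example"                           # hypothetical auth token

    def list_containers(limit=100):
        """Yield every container dict in the account, paging with marker/limit."""
        marker = None
        while True:
            params = {"format": "json", "limit": limit}
            if marker is not None:
                params["marker"] = marker
            resp = requests.get(STORAGE_URL, params=params,
                                headers={"X-Auth-Token": TOKEN})
            resp.raise_for_status()
            containers = resp.json()  # list of {"name", "count", "bytes", "last_modified"}
            for container in containers:
                yield container
            if len(containers) < limit:       # fewer items than ``limit``: end of the list
                return
            marker = containers[-1]["name"]   # continue after the last name returned

    for container in list_containers():
        print(container["name"], container["bytes"])

With ``format=json`` an empty account returns ``200`` with ``[]``, so the same loop also covers the no-containers case described below.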
+ +If you use query parameters to page through a long list of +containers, you have reached the end of the list if the number of +items in the returned list is less than the request ``limit`` +value. The list contains more items if the number of items in the +returned list equals the ``limit`` value. + +When asking for a list of containers and there are none, the +response behavior changes depending on whether the request format +is text, JSON, or XML. For a text response, you get a 204 , because +there is no content. However, for a JSON or XML response, you get a +200 with content indicating an empty array. + +Example requests and responses: + +- Show account details and list containers and ask for a JSON + response: + + .. literalinclude:: samples/account-containers-list-http-request-json.txt + .. literalinclude:: samples/account-containers-list-http-response-json.txt + .. literalinclude:: samples/account-containers-list-response.json + +- Show account details and list containers and ask for an XML response: + + .. literalinclude:: samples/account-containers-list-http-request-xml.txt + .. literalinclude:: samples/account-containers-list-http-response-xml.txt + .. literalinclude:: samples/account-containers-list-response.xml + +If the request succeeds, the operation returns one of these status +codes: + +- ``OK (200)``. Success. The response body lists the containers. + +- ``No Content (204)``. Success. The response body shows no + containers. Either the account has no containers or you are + paging through a long list of names by using the ``marker``, + ``limit``, or ``end_marker`` query parameter and you have reached + the end of the list. + + +Normal response codes: 200 +Error response codes:204, + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - limit: limit + - marker: marker + - end_marker: end_marker + - format: format + - prefix: prefix + - delimiter: delimiter + - reverse: reverse + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Newest: X-Newest + - Accept: Accept + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Content-Length: Content-Length_listing_resp + - X-Account-Meta-name: X-Account-Meta-name + - X-Account-Meta-Temp-URL-Key: X-Account-Meta-Temp-URL-Key_resp + - X-Account-Meta-Temp-URL-Key-2: X-Account-Meta-Temp-URL-Key-2_resp + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Date: Date + - X-Account-Bytes-Used: X-Account-Bytes-Used + - X-Account-Container-Count: X-Account-Container-Count + - X-Account-Object-Count: X-Account-Object-Count + - X-Account-Storage-Policy-name-Bytes-Used: X-Account-Storage-Policy-name-Bytes-Used + - X-Account-Storage-Policy-name-Container-Count: X-Account-Storage-Policy-name-Container-Count + - X-Account-Storage-Policy-name-Object-Count: X-Account-Storage-Policy-name-Object-Count + - X-Account-Meta-Quota-Bytes: X-Account-Meta-Quota-Bytes_resp + - X-Account-Access-Control: X-Account-Access-Control_resp + - Content-Type: Content-Type_listing_resp + - count: count + - bytes: bytes_in_account_get + - name: name_in_account_get + + +Create, update, or delete account metadata +========================================== + +.. rest_method:: POST /v1/{account} + +Creates, updates, or deletes account metadata. 
+ +To create, update, or delete custom metadata, use the +``X-Account-Meta-{name}`` request header, where ``{name}`` is the name of the +metadata item. + +Account metadata operations work differently than how +object metadata operations work. Depending on the contents of your +POST account metadata request, the Object Storage API updates the +metadata as shown in the following table: + +**Account metadata operations** + ++----------------------------------------------------------+---------------------------------------------------------------+ +| POST request header contains | Result | ++----------------------------------------------------------+---------------------------------------------------------------+ +| A metadata key without a value. | The API removes the metadata item from the account. | +| | | +| The metadata key already exists for the account. | | ++----------------------------------------------------------+---------------------------------------------------------------+ +| A metadata key without a value. | The API ignores the metadata key. | +| | | +| The metadata key does not already exist for the account. | | ++----------------------------------------------------------+---------------------------------------------------------------+ +| A metadata key value. | The API updates the metadata key value for the account. | +| | | +| The metadata key already exists for the account. | | ++----------------------------------------------------------+---------------------------------------------------------------+ +| A metadata key value. | The API adds the metadata key and value pair, or item, to the | +| | account. | +| The metadata key does not already exist for the account. | | ++----------------------------------------------------------+---------------------------------------------------------------+ +| One or more account metadata items are omitted. | The API does not change the existing metadata items. | +| | | +| The metadata items already exist for the account. | | ++----------------------------------------------------------+---------------------------------------------------------------+ + + + +To delete a metadata header, send an empty value for that header, +such as for the ``X-Account-Meta-Book`` header. If the tool you use +to communicate with Object Storage, such as an older version of +cURL, does not support empty headers, send the ``X-Remove-Account- +Meta-{name}`` header with an arbitrary value. For example, +``X-Remove-Account-Meta-Book: x``. The operation ignores the arbitrary +value. + +.. include:: metadata_header_syntax.inc +.. include:: metadata_header_encoding.inc + +Subsequent requests for the same key and value pair overwrite the +existing value. + +If the container already has other custom metadata items, a request +to create, update, or delete metadata does not affect those items. + +This operation does not accept a request body. 
+ +Example requests and responses: + +- Create account metadata: + + :: + + curl -i $publicURL -X POST -H "X-Auth-Token: $token" -H "X-Account-Meta-Book: MobyDick" -H "X-Account-Meta-Subject: Literature" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx8c2dd6aee35442a4a5646-0052d954fb + X-Openstack-Request-Id: tx8c2dd6aee35442a4a5646-0052d954fb + Date: Fri, 17 Jan 2014 16:06:19 GMT + + +- Update account metadata: + + :: + + curl -i $publicURL -X POST -H "X-Auth-Token: $token" -H "X-Account-Meta-Subject: AmericanLiterature" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx1439b96137364ab581156-0052d95532 + X-Openstack-Request-Id: tx1439b96137364ab581156-0052d95532 + Date: Fri, 17 Jan 2014 16:07:14 GMT + + +- Delete account metadata: + + :: + + curl -i $publicURL -X POST -H "X-Auth-Token: $token" -H "X-Remove-Account-Meta-Subject: x" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx411cf57701424da99948a-0052d9556f + X-Openstack-Request-Id: tx411cf57701424da99948a-0052d9556f + Date: Fri, 17 Jan 2014 16:08:15 GMT + + +If the request succeeds, the operation returns the ``No Content +(204)`` response code. + +To confirm your changes, issue a show account metadata request. + +Error response codes:204, + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Account-Meta-Temp-URL-Key: X-Account-Meta-Temp-URL-Key_req + - X-Account-Meta-Temp-URL-Key-2: X-Account-Meta-Temp-URL-Key-2_req + - X-Account-Meta-name: X-Account-Meta-name_req + - X-Remove-Account-name: X-Remove-Account-name + - X-Account-Access-Control: X-Account-Access-Control_req + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + +Show account metadata +===================== + +.. rest_method:: HEAD /v1/{account} + +Shows metadata for an account. + +Metadata for the account includes: + +- Number of containers + +- Number of objects + +- Total number of bytes that are stored in Object Storage for the + account + +Because the storage system can store large amounts of data, take +care when you represent the total bytes response as an integer; +when possible, convert it to a 64-bit unsigned integer if your +platform supports that primitive type. + +Do not include metadata headers in this request. + +Show account metadata request: + +:: + + curl -i $publicURL -X HEAD -H "X-Auth-Token: $token" + + + + +:: + + HTTP/1.1 204 No Content + Content-Length: 0 + X-Account-Object-Count: 1 + X-Account-Meta-Book: MobyDick + X-Timestamp: 1389453423.35964 + X-Account-Bytes-Used: 14 + X-Account-Container-Count: 2 + Content-Type: text/plain; charset=utf-8 + Accept-Ranges: bytes + X-Trans-Id: txafb3504870144b8ca40f7-0052d955d4 + X-Openstack-Request-Id: txafb3504870144b8ca40f7-0052d955d4 + Date: Fri, 17 Jan 2014 16:09:56 GMT + + +If the account or authentication token is not valid, the operation +returns the ``Unauthorized (401)`` response code. + +Error response codes:204,401, + + +Request +------- + +.. 
rest_parameters:: parameters.yaml
+
+   - account: account
+   - X-Auth-Token: X-Auth-Token
+   - X-Service-Token: X-Service-Token
+   - X-Newest: X-Newest
+   - X-Trans-Id-Extra: X-Trans-Id-Extra
+
+
+Response Parameters
+-------------------
+
+.. rest_parameters:: parameters.yaml
+
+   - Content-Length: Content-Length_cud_resp
+   - X-Account-Meta-name: X-Account-Meta-name
+   - X-Account-Meta-Temp-URL-Key: X-Account-Meta-Temp-URL-Key_resp
+   - X-Account-Meta-Temp-URL-Key-2: X-Account-Meta-Temp-URL-Key-2_resp
+   - X-Timestamp: X-Timestamp
+   - X-Trans-Id: X-Trans-Id
+   - X-Openstack-Request-Id: X-Openstack-Request-Id
+   - Date: Date
+   - X-Account-Bytes-Used: X-Account-Bytes-Used
+   - X-Account-Object-Count: X-Account-Object-Count
+   - X-Account-Container-Count: X-Account-Container-Count
+   - X-Account-Storage-Policy-name-Bytes-Used: X-Account-Storage-Policy-name-Bytes-Used
+   - X-Account-Storage-Policy-name-Container-Count: X-Account-Storage-Policy-name-Container-Count
+   - X-Account-Storage-Policy-name-Object-Count: X-Account-Storage-Policy-name-Object-Count
+   - X-Account-Meta-Quota-Bytes: X-Account-Meta-Quota-Bytes_resp
+   - X-Account-Access-Control: X-Account-Access-Control_resp
+   - Content-Type: Content-Type_cud_resp
+
+
+Delete the specified account
+============================
+
+.. rest_method:: DELETE /v1/{account}
+
+Deletes the specified account when a reseller admin issues this request.
+An account is deleted only when (1) the request carries a reseller admin
+level auth token, (2) the DELETE is sent to a proxy server for the account
+to be deleted, and (3) that proxy server has the ``allow_account_management``
+config option set to true.
+
+Note that issuing a DELETE request simply marks the account for deletion
+later, as described in
+https://docs.openstack.org/swift/latest/overview_reaper.html.
+
+Take care when performing this operation because deleting an account is a
+one-way operation that is not trivially recoverable. In an OpenStack
+context, delete an account only after the corresponding project/tenant has
+been deleted from Keystone.
+
+
+::
+
+   curl -i $publicURL -X DELETE -H 'X-Auth-Token: $'
+
+
+
+::
+
+   HTTP/1.1 204 No Content
+   Content-Length: 0
+   Content-Type: text/html; charset=UTF-8
+   X-Account-Status: Deleted
+   X-Trans-Id: tx91ce60a640cc42eca198a-006128c180
+   X-Openstack-Request-Id: tx91ce60a640cc42eca198a-006128c180
+   Date: Fri, 27 Aug 2021 11:42:08 GMT
+
+If the account or authentication token is not valid, the operation
+returns the ``Unauthorized (401)`` response code. If you try to delete an
+account with a non-admin token, a ``403 Forbidden`` response code is
+returned. If you specify a non-existent account or an invalid URL, a
+``404 Not Found`` response code is returned.
+
+Normal response codes: 204
+
+Error response codes: 401, 403, 404
+
+
+Request
+-------
+
+.. rest_parameters:: parameters.yaml
+
+   - account: account
+   - X-Auth-Token: X-Auth-Token
+
+Response Parameters
+-------------------
+
+.. rest_parameters:: parameters.yaml
+
+   - Date: Date
+   - X-Timestamp: X-Timestamp
+   - Content-Length: Content-Length_cud_resp
+   - Content-Type: Content-Type_cud_resp
+   - X-Trans-Id: X-Trans-Id
+   - X-Openstack-Request-Id: X-Openstack-Request-Id
+
diff --git a/api-ref/source/storage-container-services.inc b/api-ref/source/storage-container-services.inc
new file mode 100644
index 0000000000..7d7be9fc52
--- /dev/null
+++ b/api-ref/source/storage-container-services.inc
@@ -0,0 +1,559 @@
+.. -*- rst -*-
+
+==========
+Containers
+==========
+
+Lists objects in a container.
Creates, shows details for, and +deletes containers. Creates, updates, shows, and deletes container +metadata. For more information and concepts about +containers see `Object Storage API overview +`_. + + +Show container details and list objects +======================================= + +.. rest_method:: GET /v1/{account}/{container} + +Shows details for a container and lists objects, sorted by name, in the container. + +Specify query parameters in the request to filter the list and +return a subset of objects. Omit query parameters to return +a list of objects that are stored in the container, +up to 10,000 names. The 10,000 maximum value is configurable. To +view the value for the cluster, issue a GET ``/info`` request. + +Example requests and responses: + +- ``OK (200)``. Success. The response body lists the objects. + +- ``No Content (204)``. Success. The response body shows no objects. + Either the container has no objects or you are paging through a + long list of objects by using the ``marker``, ``limit``, or + ``end_marker`` query parameter and you have reached the end of + the list. + +If the container does not exist, the call returns the ``Not Found +(404)`` response code. + +Normal response codes: 200, 204 + +Error response codes: 404 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - limit: limit + - marker: marker + - end_marker: end_marker + - prefix: prefix + - format: format + - delimiter: delimiter + - path: path + - reverse: reverse + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Newest: X-Newest + - Accept: Accept + - X-Container-Meta-Temp-URL-Key: X-Container-Meta-Temp-URL-Key_req + - X-Container-Meta-Temp-URL-Key-2: X-Container-Meta-Temp-URL-Key-2_req + - X-Trans-Id-Extra: X-Trans-Id-Extra + - X-Storage-Policy: X-Storage-Policy + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - X-Container-Meta-name: X-Container-Meta-name + - Content-Length: Content-Length_listing_resp + - X-Container-Object-Count: X-Container-Object-Count + - X-Container-Bytes-Used: X-Container-Bytes-Used + - Accept-Ranges: Accept-Ranges + - X-Container-Meta-Temp-URL-Key: X-Container-Meta-Temp-URL-Key_resp + - X-Container-Meta-Temp-URL-Key-2: X-Container-Meta-Temp-URL-Key-2_resp + - X-Container-Meta-Quota-Count: X-Container-Meta-Quota-Count_resp + - X-Container-Meta-Quota-Bytes: X-Container-Meta-Quota-Bytes_resp + - X-Storage-Policy: X-Storage-Policy + - X-Container-Read: X-Container-Read_resp + - X-Container-Write: X-Container-Write_resp + - X-Container-Sync-Key: X-Container-Sync-Key_resp + - X-Container-Sync-To: X-Container-Sync-To_resp + - X-Versions-Location: X-Versions-Location_resp + - X-History-Location: X-History-Location_resp + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Content-Type: Content-Type_listing_resp + - Date: Date + - hash: hash + - last_modified: last_modified + - content_type: content_type + - bytes: bytes_in_container_get + - name: name_in_container_get + - symlink_path: symlink_path + + +Response Example format=json +---------------------------- + +.. literalinclude:: samples/objects-list-http-response-json.txt +.. literalinclude:: samples/objects-list-response.json + + +Response Example format=xml +--------------------------- + +.. literalinclude:: samples/objects-list-http-response-xml.txt +.. 
literalinclude:: samples/objects-list-response.xml + +Create container +================ + +.. rest_method:: PUT /v1/{account}/{container} + +Creates a container. + +You do not need to check whether a container already exists before +issuing a PUT operation because the operation is idempotent: It +creates a container or updates an existing container, as +appropriate. + +To create, update, or delete a custom metadata item, use the ``X +-Container-Meta-{name}`` header, where ``{name}`` is the name of +the metadata item. + +.. include:: metadata_header_syntax.inc +.. include:: metadata_header_encoding.inc + +Example requests and responses: + +- Create a container with no metadata: + + :: + + curl -i $publicURL/steven -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 201 Created + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx7f6b7fa09bc2443a94df0-0052d58b56 + X-Openstack-Request-Id: tx7f6b7fa09bc2443a94df0-0052d58b56 + Date: Tue, 14 Jan 2014 19:09:10 GMT + + +- Create a container with metadata: + + :: + + curl -i $publicURL/marktwain -X PUT -H "X-Auth-Token: $token" -H "X-Container-Meta-Book: TomSawyer" + + + + + :: + + HTTP/1.1 201 Created + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx06021f10fc8642b2901e7-0052d58f37 + X-Openstack-Request-Id: tx06021f10fc8642b2901e7-0052d58f37 + Date: Tue, 14 Jan 2014 19:25:43 GMT + +- Create a container with an ACL to allow anybody to get an object in the + marktwain container: + :: + + curl -i $publicURL/marktwain -X PUT -H "X-Auth-Token: $token" -H "X-Container-Read: .r:*" + + + + :: + + HTTP/1.1 201 Created + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx06021f10fc8642b2901e7-0052d58f37 + X-Openstack-Request-Id: tx06021f10fc8642b2901e7-0052d58f37 + Date: Tue, 14 Jan 2014 19:25:43 GMT + +Normal response codes: 201, 202 + +Error response codes: 400, 404, 507 + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Container-Read: X-Container-Read + - X-Container-Write: X-Container-Write + - X-Container-Sync-To: X-Container-Sync-To + - X-Container-Sync-Key: X-Container-Sync-Key + - X-Versions-Location: X-Versions-Location + - X-History-Location: X-History-Location + - X-Container-Meta-name: X-Container-Meta-name_req + - X-Container-Meta-Access-Control-Allow-Origin: X-Container-Meta-Access-Control-Allow-Origin + - X-Container-Meta-Access-Control-Max-Age: X-Container-Meta-Access-Control-Max-Age + - X-Container-Meta-Access-Control-Expose-Headers: X-Container-Meta-Access-Control-Expose-Headers + - X-Container-Meta-Quota-Bytes: X-Container-Meta-Quota-Bytes + - X-Container-Meta-Quota-Count: X-Container-Meta-Quota-Count + - X-Container-Meta-Temp-URL-Key: X-Container-Meta-Temp-URL-Key_req + - X-Container-Meta-Temp-URL-Key-2: X-Container-Meta-Temp-URL-Key-2_req + - X-Trans-Id-Extra: X-Trans-Id-Extra + - X-Storage-Policy: X-Storage-Policy + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + + + + + +Create, update, or delete container metadata +============================================ + +.. 
rest_method:: POST /v1/{account}/{container} + +Creates, updates, or deletes custom metadata for a container. + +To create, update, or delete a custom metadata item, use the ``X +-Container-Meta-{name}`` header, where ``{name}`` is the name of +the metadata item. + +.. include:: metadata_header_syntax.inc +.. include:: metadata_header_encoding.inc + +Subsequent requests for the same key and value pair overwrite the +previous value. + +To delete container metadata, send an empty value for that header, +such as for the ``X-Container-Meta-Book`` header. If the tool you +use to communicate with Object Storage, such as an older version of +cURL, does not support empty headers, send the ``X-Remove- +Container-Meta-{name}`` header with an arbitrary value. For +example, ``X-Remove-Container-Meta-Book: x``. The operation ignores +the arbitrary value. + +If the container already has other custom metadata items, a request +to create, update, or delete metadata does not affect those items. + +Example requests and responses: + +- Create container metadata: + + :: + + curl -i $publicURL/marktwain -X POST -H "X-Auth-Token: $token" -H "X-Container-Meta-Author: MarkTwain" -H "X-Container-Meta-Web-Directory-Type: text/directory" -H "X-Container-Meta-Century: Nineteenth" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx05dbd434c651429193139-0052d82635 + X-Openstack-Request-Id: tx05dbd434c651429193139-0052d82635 + Date: Thu, 16 Jan 2014 18:34:29 GMT + + +- Update container metadata: + + :: + + curl -i $publicURL/marktwain -X POST -H "X-Auth-Token: $token" -H "X-Container-Meta-Author: SamuelClemens" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txe60c7314bf614bb39dfe4-0052d82653 + X-Openstack-Request-Id: txe60c7314bf614bb39dfe4-0052d82653 + Date: Thu, 16 Jan 2014 18:34:59 GMT + + +- Delete container metadata: + + :: + + curl -i $publicURL/marktwain -X POST -H "X-Auth-Token: $token" -H "X-Remove-Container-Meta-Century: x" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx7997e18da2a34a9e84ceb-0052d826d0 + X-Openstack-Request-Id: tx7997e18da2a34a9e84ceb-0052d826d0 + Date: Thu, 16 Jan 2014 18:37:04 GMT + + +If the request succeeds, the operation returns the ``No Content +(204)`` response code. + +To confirm your changes, issue a show container metadata request. + +Normal response codes: 204 + +Error response codes: 404 + + +Request +------- + +.. 
rest_parameters:: parameters.yaml + + - account: account + - container: container + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Container-Read: X-Container-Read + - X-Remove-Container-name: X-Remove-Container-name + - X-Container-Write: X-Container-Write + - X-Container-Sync-To: X-Container-Sync-To + - X-Container-Sync-Key: X-Container-Sync-Key + - X-Versions-Location: X-Versions-Location + - X-History-Location: X-History-Location + - X-Remove-Versions-Location: X-Remove-Versions-Location + - X-Remove-History-Location: X-Remove-History-Location + - X-Container-Meta-name: X-Container-Meta-name_req + - X-Container-Meta-Access-Control-Allow-Origin: X-Container-Meta-Access-Control-Allow-Origin + - X-Container-Meta-Access-Control-Max-Age: X-Container-Meta-Access-Control-Max-Age + - X-Container-Meta-Access-Control-Expose-Headers: X-Container-Meta-Access-Control-Expose-Headers + - X-Container-Meta-Quota-Bytes: X-Container-Meta-Quota-Bytes + - X-Container-Meta-Quota-Count: X-Container-Meta-Quota-Count + - X-Container-Meta-Web-Directory-Type: X-Container-Meta-Web-Directory-Type + - X-Container-Meta-Temp-URL-Key: X-Container-Meta-Temp-URL-Key_req + - X-Container-Meta-Temp-URL-Key-2: X-Container-Meta-Temp-URL-Key-2_req + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + + + + +Show container metadata +======================= + +.. rest_method:: HEAD /v1/{account}/{container} + +Shows container metadata, including the number of objects and the total bytes of all objects stored in the container. + +Show container metadata request: + +:: + + curl -i $publicURL/marktwain -X HEAD -H "X-Auth-Token: $token" + + + + +:: + + HTTP/1.1 204 No Content + Content-Length: 0 + X-Container-Object-Count: 1 + Accept-Ranges: bytes + X-Container-Meta-Book: TomSawyer + X-Timestamp: 1389727543.65372 + X-Container-Meta-Author: SamuelClemens + X-Container-Bytes-Used: 14 + Content-Type: text/plain; charset=utf-8 + X-Trans-Id: tx0287b982a268461b9ec14-0052d826e2 + X-Openstack-Request-Id: tx0287b982a268461b9ec14-0052d826e2 + Date: Thu, 16 Jan 2014 18:37:22 GMT + + +If the request succeeds, the operation returns the ``No Content +(204)`` response code. + +Normal response codes: 204 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Newest: X-Newest + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. 
rest_parameters:: parameters.yaml + + - X-Container-Meta-name: X-Container-Meta-name + - Content-Length: Content-Length_cud_resp + - X-Container-Object-Count: X-Container-Object-Count + - X-Container-Bytes-Used: X-Container-Bytes-Used + - X-Container-Write: X-Container-Write_resp + - X-Container-Meta-Quota-Bytes: X-Container-Meta-Quota-Bytes_resp + - X-Container-Meta-Quota-Count: X-Container-Meta-Quota-Count_resp + - Accept-Ranges: Accept-Ranges + - X-Container-Read: X-Container-Read_resp + - X-Container-Meta-Access-Control-Expose-Headers: X-Container-Meta-Access-Control-Expose-Headers + - X-Container-Meta-Temp-URL-Key: X-Container-Meta-Temp-URL-Key_resp + - X-Container-Meta-Temp-URL-Key-2: X-Container-Meta-Temp-URL-Key-2_resp + - X-Timestamp: X-Timestamp + - X-Container-Meta-Access-Control-Allow-Origin: X-Container-Meta-Access-Control-Allow-Origin + - X-Container-Meta-Access-Control-Max-Age: X-Container-Meta-Access-Control-Max-Age + - X-Container-Sync-Key: X-Container-Sync-Key_resp + - X-Container-Sync-To: X-Container-Sync-To_resp + - Date: Date + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Content-Type: Content-Type_cud_resp + - X-Versions-Location: X-Versions-Location_resp + - X-History-Location: X-History-Location_resp + - X-Storage-Policy: X-Storage-Policy + + + + + +Delete container +================ + +.. rest_method:: DELETE /v1/{account}/{container} + +Deletes an empty container. + +This operation fails unless the container is empty. An empty +container has no objects. + +Delete the ``steven`` container: + +:: + + curl -i $publicURL/steven -X DELETE -H "X-Auth-Token: $token" + + +If the container does not exist, the response is: + +:: + + HTTP/1.1 404 Not Found + Content-Length: 70 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx4d728126b17b43b598bf7-0052d81e34 + X-Openstack-Request-Id: tx4d728126b17b43b598bf7-0052d81e34 + Date: Thu, 16 Jan 2014 18:00:20 GMT + + +If the container exists and the deletion succeeds, the response is: + +:: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txf76c375ebece4df19c84c-0052d81f14 + X-Openstack-Request-Id: txf76c375ebece4df19c84c-0052d81f14 + Date: Thu, 16 Jan 2014 18:04:04 GMT + + +If the container exists but is not empty, the response is: + +:: + + HTTP/1.1 409 Conflict + Content-Length: 95 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx7782dc6a97b94a46956b5-0052d81f6b + X-Openstack-Request-Id: tx7782dc6a97b94a46956b5-0052d81f6b + Date: Thu, 16 Jan 2014 18:05:31 GMT + +

+   <html>
+   <h1>Conflict</h1>
+   <p>There was a conflict when trying to complete your request.</p>
+   </html>

+ + +Normal response codes: 204 + +Error response codes: 404, 409 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + + + + + diff --git a/api-ref/source/storage-object-services.inc b/api-ref/source/storage-object-services.inc new file mode 100644 index 0000000000..2519b5973d --- /dev/null +++ b/api-ref/source/storage-object-services.inc @@ -0,0 +1,787 @@ +.. -*- rst -*- + +======= +Objects +======= + +Creates, replaces, shows details for, and deletes objects. Copies +objects from another object with a new or different name. Updates +object metadata. For more information and concepts about +objects see `Object Storage API overview +`_ +and `Large Objects +`_. + + +Get object content and metadata +=============================== + +.. rest_method:: GET /v1/{account}/{container}/{object} + +Downloads the object content and gets the object metadata. + +This operation returns the object metadata in the response headers +and the object content in the response body. + +If this is a large object, the response body contains the +concatenated content of the segment objects. To get the manifest +instead of concatenated segment objects for a static large object, +use the ``multipart-manifest`` query parameter. + +Example requests and responses: + +- Show object details for the ``goodbye`` object in the + ``marktwain`` container: + + :: + + curl -i $publicURL/marktwain/goodbye -X GET -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 200 OK + Content-Length: 14 + Accept-Ranges: bytes + Last-Modified: Wed, 15 Jan 2014 16:41:49 GMT + Etag: 451e372e48e0f6b1114fa0724aa79fa1 + X-Timestamp: 1389804109.39027 + X-Object-Meta-Orig-Filename: goodbyeworld.txt + Content-Type: application/octet-stream + X-Trans-Id: tx8145a190241f4cf6b05f5-0052d82a34 + X-Openstack-Request-Id: tx8145a190241f4cf6b05f5-0052d82a34 + Date: Thu, 16 Jan 2014 18:51:32 GMT + Goodbye World! + + +- Show object details for the ``goodbye`` object, which does not + exist, in the ``janeausten`` container: + + :: + + curl -i $publicURL/janeausten/goodbye -X GET -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 404 Not Found + Content-Length: 70 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx073f7cbb850c4c99934b9-0052d82b04 + X-Openstack-Request-Id: tx073f7cbb850c4c99934b9-0052d82b04 + Date: Thu, 16 Jan 2014 18:55:00 GMT + +

+      <html>
+      <h1>Not Found</h1>
+      <p>The resource could not be found.</p>
+      </html>
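+
+
+- Request only part of the ``goodbye`` object with the standard ``Range``
+  header; the byte range shown is illustrative:
+
+  ::
+
+     # A satisfiable byte range returns ``206 Partial Content`` along with
+     # a Content-Range header describing the bytes returned.
+     curl -i $publicURL/marktwain/goodbye -X GET -H "X-Auth-Token: $token" -H "Range: bytes=0-4"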

+ + + +The operation returns the ``Range Not Satisfiable (416)`` response +code for any ranged GET requests that specify more than: + +- Fifty ranges. + +- Three overlapping ranges. + +- Eight non-increasing ranges. + + +Normal response codes: 200 + +Error response codes: 416, 404 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Newest: X-Newest + - temp_url_sig: temp_url_sig + - temp_url_expires: temp_url_expires + - filename: filename + - multipart-manifest: multipart-manifest_get + - symlink: symlink + - Range: Range + - If-Match: If-Match + - If-None-Match: If-None-Match-get-request + - If-Modified-Since: If-Modified-Since + - If-Unmodified-Since: If-Unmodified-Since + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Content-Length: Content-Length_get_resp + - Content-Type: Content-Type_obj_resp + - X-Object-Meta-name: X-Object-Meta-name_resp + - Content-Disposition: Content-Disposition_resp + - Content-Encoding: Content-Encoding_resp + - X-Delete-At: X-Delete-At_resp + - Accept-Ranges: Accept-Ranges + - X-Object-Manifest: X-Object-Manifest_resp + - Last-Modified: Last-Modified + - ETag: ETag_obj_resp + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Date: Date + - X-Static-Large-Object: X-Static-Large-Object + - X-Symlink-Target: X-Symlink-Target_resp + - X-Symlink-Target-Account: X-Symlink-Target-Account_resp + + +Response Example +---------------- + +See examples above. + + +Create or replace object +======================== + +.. rest_method:: PUT /v1/{account}/{container}/{object} + +Creates an object with data content and metadata, or replaces an existing object with data content and metadata. + +The PUT operation always creates an object. If you use this +operation on an existing object, you replace the existing object +and metadata rather than modifying the object. Consequently, this +operation returns the ``Created (201)`` response code. + +If you use this operation to copy a manifest object, the new object +is a normal object and not a copy of the manifest. Instead it is a +concatenation of all the segment objects. This means that you +cannot copy objects larger than 5 GB. + +Note that the provider may have limited the characters which are allowed +in an object name. Any name limits are exposed under the ``name_check`` key +in the ``/info`` discoverability response. Regardless of ``name_check`` +limitations, names must be URL quoted UTF-8. + +To create custom metadata, use the +``X-Object-Meta-name`` header, where ``name`` is the name of the metadata +item. + +.. 
include:: metadata_header_syntax.inc + +Example requests and responses: + +- Create object: + + :: + + curl -i $publicURL/janeausten/helloworld.txt -X PUT -d "Hello" -H "Content-Type: text/html; charset=UTF-8" -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 201 Created + Last-Modified: Fri, 17 Jan 2014 17:28:35 GMT + Content-Length: 0 + Etag: 8b1a9953c4611296a827abf8c47804d7 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx4d5e4f06d357462bb732f-0052d96843 + X-Openstack-Request-Id: tx4d5e4f06d357462bb732f-0052d96843 + Date: Fri, 17 Jan 2014 17:28:35 GMT + + +- Replace object: + + :: + + curl -i $publicURL/janeausten/helloworld.txt -X PUT -d "Hola" -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 201 Created + Last-Modified: Fri, 17 Jan 2014 17:28:35 GMT + Content-Length: 0 + Etag: f688ae26e9cfa3ba6235477831d5122e + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx4d5e4f06d357462bb732f-0052d96843 + X-Openstack-Request-Id: tx4d5e4f06d357462bb732f-0052d96843 + Date: Fri, 17 Jan 2014 17:28:35 GMT + + +The ``Created (201)`` response code indicates a successful write. + +If the container for the object does not already exist, the operation +returns the ``404 Not Found`` response code. + +If the request times out, the operation returns the ``Request +Timeout (408)`` response code. + +The ``Length Required (411)`` response code indicates a missing +``Transfer-Encoding`` or ``Content-Length`` request header. + +If the MD5 checksum of the data that is written to the object store +does not match the optional ``ETag`` value, the operation returns +the ``Unprocessable Entity (422)`` response code. + +Normal response codes: 201 + +Error response codes: 404, 408, 411, 422 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - multipart-manifest: multipart-manifest_put + - temp_url_sig: temp_url_sig + - temp_url_expires: temp_url_expires + - X-Object-Manifest: X-Object-Manifest + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - Content-Length: Content-Length_put_req + - Transfer-Encoding: Transfer-Encoding + - Content-Type: Content-Type_obj_cu_req + - X-Detect-Content-Type: X-Detect-Content-Type + - X-Copy-From: X-Copy-From + - X-Copy-From-Account: X-Copy-From-Account + - ETag: ETag_obj_req + - Content-Disposition: Content-Disposition + - Content-Encoding: Content-Encoding + - X-Delete-At: X-Delete-At + - X-Delete-After: X-Delete-After + - X-Object-Meta-name: X-Object-Meta-name + - If-None-Match: If-None-Match-put-request + - X-Trans-Id-Extra: X-Trans-Id-Extra + - X-Symlink-Target: X-Symlink-Target + - X-Symlink-Target-Account: X-Symlink-Target-Account + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Content-Length: Content-Length_cud_resp + - ETag: ETag_obj_received + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Date: Date + - Content-Type: Content-Type_obj_resp + - last_modified: last_modified + + + + + + + + +Copy object +=========== + +.. rest_method:: COPY /v1/{account}/{container}/{object} + +Copies an object to another object in the object store. + +You can copy an object to a new object with the same name. Copying +to the same name is an alternative to using POST to add metadata to +an object. With POST, you must specify all the metadata. With COPY, +you can add additional metadata to the object. 
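+
+For example, copying an object onto itself to add a single metadata item,
+while leaving its existing metadata in place, might look like the following
+(the container, object, and metadata names are illustrative):
+
+::
+
+   # Copy ``goodbye`` onto itself, adding one metadata item to it
+   curl -i $publicURL/marktwain/goodbye -X COPY -H "X-Auth-Token: $token" -H "Destination: marktwain/goodbye" -H "X-Object-Meta-Icon: Flag"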
+ +With COPY, you can set the ``X-Fresh-Metadata`` header to ``true`` +to copy the object without any existing metadata. + +Alternatively, you can use PUT with the ``X-Copy-From`` request +header to accomplish the same operation as the COPY object +operation. + +The COPY operation always creates an object. If you use this +operation on an existing object, you replace the existing object +and metadata rather than modifying the object. Consequently, this +operation returns the ``Created (201)`` response code. + +Normally, if you use this operation to copy a manifest object, the new object +is a normal object and not a copy of the manifest. Instead it is a +concatenation of all the segment objects. This means that you +cannot copy objects larger than 5 GB in size. + +To copy the manifest object, you include the +``multipart-manifest=get`` query string in the COPY request. +The new object contains the same manifest as the original. +The segment objects are not copied. Instead, both the original +and new manifest objects share the same set of segment objects. + +To copy a symlink either with a COPY or a PUT with the +``X-Copy-From`` request, include the ``symlink=get`` query string. +The new symlink will have the same target as the original. +The target object is not copied. Instead, both the original +and new symlinks point to the same target object. + +All metadata is +preserved during the object copy. If you specify metadata on the +request to copy the object, either PUT or COPY , the metadata +overwrites any conflicting keys on the target (new) object. + +Example requests and responses: + +- Copy the ``goodbye`` object from the ``marktwain`` container to + the ``janeausten`` container: + + :: + + curl -i $publicURL/marktwain/goodbye -X COPY -H "X-Auth-Token: $token" -H "Destination: janeausten/goodbye" + + + + + :: + + HTTP/1.1 201 Created + Content-Length: 0 + X-Copied-From-Last-Modified: Thu, 16 Jan 2014 21:19:45 GMT + X-Copied-From: marktwain/goodbye + Last-Modified: Fri, 17 Jan 2014 18:22:57 GMT + Etag: 451e372e48e0f6b1114fa0724aa79fa1 + Content-Type: text/html; charset=UTF-8 + X-Object-Meta-Movie: AmericanPie + X-Trans-Id: txdcb481ad49d24e9a81107-0052d97501 + X-Openstack-Request-Id: txdcb481ad49d24e9a81107-0052d97501 + Date: Fri, 17 Jan 2014 18:22:57 GMT + + +- Alternatively, you can use PUT to copy the ``goodbye`` object from + the ``marktwain`` container to the ``janeausten`` container. This + request requires a ``Content-Length`` header, even if it is set + to zero (0). + + :: + + curl -i $publicURL/janeausten/goodbye -X PUT -H "X-Auth-Token: $token" -H "X-Copy-From: /marktwain/goodbye" -H "Content-Length: 0" + + + + + :: + + HTTP/1.1 201 Created + Content-Length: 0 + X-Copied-From-Last-Modified: Thu, 16 Jan 2014 21:19:45 GMT + X-Copied-From: marktwain/goodbye + Last-Modified: Fri, 17 Jan 2014 18:22:57 GMT + Etag: 451e372e48e0f6b1114fa0724aa79fa1 + Content-Type: text/html; charset=UTF-8 + X-Object-Meta-Movie: AmericanPie + X-Trans-Id: txdcb481ad49d24e9a81107-0052d97501 + X-Openstack-Request-Id: txdcb481ad49d24e9a81107-0052d97501 + Date: Fri, 17 Jan 2014 18:22:57 GMT + + +When several replicas exist, the system copies from the most recent +replica. That is, the COPY operation behaves as though the +``X-Newest`` header is in the request. + +Normal response codes: 201 + + +Request +------- + +.. 
rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - multipart-manifest: multipart-manifest_copy + - symlink: symlink_copy + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - Destination: Destination + - Destination-Account: Destination-Account + - Content-Type: Content-Type_obj_cu_req + - Content-Encoding: Content-Encoding + - Content-Disposition: Content-Disposition + - X-Object-Meta-name: X-Object-Meta-name + - X-Fresh-Metadata: X-Fresh-Metadata + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Content-Length: Content-Length_cud_resp + - X-Copied-From-Last-Modified: X-Copied-From-Last-Modified + - X-Copied-From: X-Copied-From + - X-Copied-From-Account: X-Copied-From-Account + - Last-Modified: Last-Modified + - ETag: ETag_obj_copied + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Date: Date + - Content-Type: Content-Type_obj_resp + + + + + +Delete object +============= + +.. rest_method:: DELETE /v1/{account}/{container}/{object} + +Permanently deletes an object from the object store. + +Object deletion occurs as soon as possible. Subsequent GET, HEAD, POST, +or DELETE operations should return a ``404 Not Found`` error code, but +may return stale data due to eventual consistency. + +For static large object manifests, you can add the +``?multipart-manifest=delete`` query parameter. This operation deletes +the segment objects and, if all deletions succeed, this operation +deletes the manifest object. + +A DELETE request made to a symlink path will delete the symlink +rather than the target object. + +An alternative to using the DELETE operation is to use +the POST operation with the ``bulk-delete`` query parameter. + +Example request and response: + +- Delete the ``helloworld`` object from the ``marktwain`` container: + + :: + + curl -i $publicURL/marktwain/helloworld -X DELETE -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx36c7606fcd1843f59167c-0052d6fdac + X-Openstack-Request-Id: tx36c7606fcd1843f59167c-0052d6fdac + Date: Wed, 15 Jan 2014 21:29:16 GMT + + +Typically, the DELETE operation does not return a response body. +However, with the ``multipart-manifest=delete`` query parameter, +the response body contains a list of manifest and segment objects +and the status of their DELETE operations. + +Normal response codes: 204 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - multipart-manifest: multipart-manifest_delete + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + + + + +Show object metadata +==================== + +.. rest_method:: HEAD /v1/{account}/{container}/{object} + +Shows object metadata. 
+ + + +Example requests and responses: + +- Show object metadata: + + :: + + curl $publicURL/marktwain/goodbye --head -H "X-Auth-Token: $token" + + + + + :: + + HTTP/1.1 200 OK + Content-Length: 14 + Accept-Ranges: bytes + Last-Modified: Thu, 16 Jan 2014 21:12:31 GMT + Etag: 451e372e48e0f6b1114fa0724aa79fa1 + X-Timestamp: 1389906751.73463 + X-Object-Meta-Book: GoodbyeColumbus + Content-Type: application/octet-stream + X-Trans-Id: tx37ea34dcd1ed48ca9bc7d-0052d84b6f + X-Openstack-Request-Id: tx37ea34dcd1ed48ca9bc7d-0052d84b6f + Date: Thu, 16 Jan 2014 21:13:19 GMT + + Note: The ``--head`` option was used in the above example. If we had + used ``-i -X HEAD`` and the ``Content-Length`` response header is non-zero, + the cURL command stalls after it prints the response headers because it + is waiting for a response body. However, the Object Storage system + does not return a response body for the HEAD operation. + + +If the request succeeds, the operation returns the ``200`` response +code. + + +Normal response codes: 200 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - temp_url_sig: temp_url_sig + - temp_url_expires: temp_url_expires + - filename: filename + - multipart-manifest: multipart-manifest_head + - symlink: symlink + - X-Newest: X-Newest + - If-Match: If-Match + - If-None-Match: If-None-Match-get-request + - If-Modified-Since: If-Modified-Since + - If-Unmodified-Since: If-Unmodified-Since + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Content-Length: Content-Length_obj_head_resp + - X-Object-Meta-name: X-Object-Meta-name + - Content-Disposition: Content-Disposition_resp + - Content-Encoding: Content-Encoding_resp + - X-Delete-At: X-Delete-At_resp + - X-Object-Manifest: X-Object-Manifest_resp + - Last-Modified: Last-Modified + - ETag: ETag_obj_resp + - X-Timestamp: X-Timestamp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + - Date: Date + - X-Static-Large-Object: X-Static-Large-Object + - Content-Type: Content-Type_obj_resp + - X-Symlink-Target: X-Symlink-Target_resp + - X-Symlink-Target-Account: X-Symlink-Target-Account_resp + + +Response Example +---------------- + +See examples above. + + + +Create or update object metadata +================================ + +.. rest_method:: POST /v1/{account}/{container}/{object} + +Creates or updates object metadata. + +To create or update custom metadata, use the +``X-Object-Meta-name`` header, where ``name`` is the name of the metadata +item. + +.. include:: metadata_header_syntax.inc + +In addition to the custom metadata, you can update the +``Content-Type``, ``Content-Encoding``, ``Content-Disposition``, and +``X-Delete-At`` system metadata items. However you cannot update other +system metadata, such as ``Content-Length`` or ``Last-Modified``. + +You can use COPY as an alternate to the POST operation by copying +to the same object. With the POST operation you must specify all +metadata items, whereas with the COPY operation, you need to +specify only changed or additional items. +All metadata is preserved during the object copy. If you specify +metadata on the request to copy the object, either PUT or COPY , +the metadata overwrites any conflicting keys on the target (new) +object. + +.. 
note:: + + While using COPY instead of POST allows sending only a subset of + the metadata, it carries the cost of reading and rewriting the entire + contents of the object. + +A POST request deletes any existing custom metadata that you added +with a previous PUT or POST request. Consequently, you must specify +all custom metadata in the request. However, system metadata is +unchanged by the POST request unless you explicitly supply it in a +request header. + +You can also set the ``X-Delete-At`` or ``X-Delete-After`` header +to define when to expire the object. + +When used as described in this section, the POST operation creates +or replaces metadata. This form of the operation has no request +body. There are alternate uses of the POST operation as follows: + +- You can also use the `form POST feature + `_ to upload objects. + +- The POST operation when used with the ``bulk-delete`` query parameter + can be used to delete multiple objects and containers in a single + operation. + +- The POST operation when used with the ``extract-archive`` query parameter + can be used to upload an archive (tar file). The archive is then extracted + to create objects. + +A POST request must not include X-Symlink-Target header. If it does then a +400 status code is returned and the object metadata is not modified. + +When a POST request is sent to a symlink, the metadata will be applied to the +symlink, but the request will result in a ``307 Temporary Redirect`` response +to the client. The POST is never redirected to the target object, thus a +GET/HEAD request to the symlink without ``symlink=get`` will not return the +metadata that was sent as part of the POST request. + +Example requests and responses: + +- Create object metadata: + + :: + + curl -i $publicURL/marktwain/goodbye -X POST -H "X-Auth-Token: $token" -H "X-Object-Meta-Book: GoodbyeColumbus" + + + + + :: + + HTTP/1.1 202 Accepted + Content-Length: 76 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txb5fb5c91ba1f4f37bb648-0052d84b3f + X-Openstack-Request-Id: txb5fb5c91ba1f4f37bb648-0052d84b3f + Date: Thu, 16 Jan 2014 21:12:31 GMT + +

+      <html>
+      <h1>Accepted</h1>
+      <p>The request is accepted for processing.</p>
+      </html>

+ + + +- Update object metadata: + + :: + + curl -i $publicURL/marktwain/goodbye -X POST -H "X-Auth-Token: $token" -H "X-Object-Meta-Book: GoodbyeOldFriend" + + + + + :: + + HTTP/1.1 202 Accepted + Content-Length: 76 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx5ec7ab81cdb34ced887c8-0052d84ca4 + X-Openstack-Request-Id: tx5ec7ab81cdb34ced887c8-0052d84ca4 + Date: Thu, 16 Jan 2014 21:18:28 GMT + +

+      <html>
+      <h1>Accepted</h1>
+      <p>The request is accepted for processing.</p>
+      </html>
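+
+
+- Update object metadata and set an expiry at the same time; the
+  ``X-Delete-After`` value (one day, in seconds) is illustrative. Because a
+  POST replaces all custom metadata, the existing ``X-Object-Meta-Book``
+  item is sent again as well:
+
+  ::
+
+     curl -i $publicURL/marktwain/goodbye -X POST -H "X-Auth-Token: $token" -H "X-Object-Meta-Book: GoodbyeOldFriend" -H "X-Delete-After: 86400"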

+ + +Normal response codes: 202 + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - account: account + - container: container + - object: object + - bulk-delete: bulk-delete + - extract-archive: extract-archive + - X-Auth-Token: X-Auth-Token + - X-Service-Token: X-Service-Token + - X-Object-Meta-name: X-Object-Meta-name + - X-Delete-At: X-Delete-At + - X-Delete-After: X-Delete-After + - Content-Disposition: Content-Disposition + - Content-Encoding: Content-Encoding + - Content-Type: Content-Type_obj_cu_req + - X-Trans-Id-Extra: X-Trans-Id-Extra + + +Response Parameters +------------------- + +.. rest_parameters:: parameters.yaml + + - Date: Date + - X-Timestamp: X-Timestamp + - Content-Length: Content-Length_cud_resp + - Content-Type: Content-Type_cud_resp + - X-Trans-Id: X-Trans-Id + - X-Openstack-Request-Id: X-Openstack-Request-Id + + + + diff --git a/api-ref/source/storage_endpoints.inc b/api-ref/source/storage_endpoints.inc new file mode 100644 index 0000000000..41845425d4 --- /dev/null +++ b/api-ref/source/storage_endpoints.inc @@ -0,0 +1,37 @@ +.. -*- rst -*- + +========= +Endpoints +========= + +If configured, lists endpoints for an account. + + +List endpoints +============== + +.. rest_method:: GET /v1/endpoints + +Lists endpoints for an object, account, or container. + +When the cloud provider enables middleware to list the +``/endpoints/`` path, software that needs data location information +can use this call to avoid network overhead. The cloud provider can +map the ``/endpoints/`` path to another resource, so this exact +resource might vary from provider to provider. Because it goes +straight to the middleware, the call is not authenticated, so be +sure you have tightly secured the environment and network when +using this call. + +Error response codes:201, + + +Request +------- + +This operation does not accept a request body. + + + + + diff --git a/api-ref/source/storage_info.inc b/api-ref/source/storage_info.inc new file mode 100644 index 0000000000..0487210b3d --- /dev/null +++ b/api-ref/source/storage_info.inc @@ -0,0 +1,46 @@ +.. -*- rst -*- + +=============== +Discoverability +=============== + +If configured, lists the activated capabilities for this version of +the OpenStack Object Storage API. + + +List activated capabilities +=========================== + +.. rest_method:: GET /info + +Lists the activated capabilities for this version of the OpenStack Object Storage API. + +Most of the information is "public" i.e. visible to all callers. However, some +configuration and capability items are reserved for the administrators of the +system. To access this data, the ``swiftinfo_sig`` and ``swiftinfo_expires`` +query parameters must be added to the request. + + +Normal response codes: 200 +Error response codes: + + +Request +------- + +.. rest_parameters:: parameters.yaml + + - swiftinfo_sig: swiftinfo_sig + - swiftinfo_expires: swiftinfo_expires + + + + +Response Example +---------------- + +.. literalinclude:: samples/capabilities-list-response.json + :language: javascript + + + diff --git a/babel.cfg b/babel.cfg deleted file mode 100644 index 15cd6cb76b..0000000000 --- a/babel.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[python: **.py] - diff --git a/bandit.yaml b/bandit.yaml new file mode 100644 index 0000000000..62b37689b4 --- /dev/null +++ b/bandit.yaml @@ -0,0 +1,111 @@ + +### This config may optionally select a subset of tests to run or skip by +### filling out the 'tests' and 'skips' lists given below. 
If no tests are +### specified for inclusion then it is assumed all tests are desired. The skips +### set will remove specific tests from the include set. This can be controlled +### using the -t/-s CLI options. Note that the same test ID should not appear +### in both 'tests' and 'skips', this would be nonsensical and is detected by +### Bandit at runtime. + +# See https://bandit.readthedocs.io/en/latest/blacklists/blacklist_calls.html +# for documentation of the available tests. + +# (optional) list included test IDs here, eg '[B101, B406]': +tests: + +# (optional) list skipped test IDs here, eg '[B101, B406]': +skips: + # We default to binding to all interfaces + - B104 + # Yes, we sometimes catch just to quietly swallow an exception + - B110 + # We use insecure randomness all over the place, because + # it's exceedingly rare that we need secure randomness + - B311 + # We dynamically build SQL all over the place + - B608 + # We often use subprocesses, and require a lot of trust in our use of them + - B404 + - B603 + - B607 + # We parse xml + - B405 + - B603 + +### (optional) plugin settings - some test plugins require configuration data +### that may be given here, per-plugin. All bandit test plugins have a built in +### set of sensible defaults and these will be used if no configuration is +### provided. It is not necessary to provide settings for every (or any) plugin +### if the defaults are acceptable. + +#any_other_function_with_shell_equals_true: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#execute_with_run_as_root_equals_true: +# function_names: [ceilometer.utils.execute, cinder.utils.execute, neutron.agent.linux.utils.execute, +# nova.utils.execute, nova.utils.trycmd] +#hardcoded_tmp_directory: +# tmp_dirs: [/tmp, /var/tmp, /dev/shm] +#linux_commands_wildcard_injection: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#password_config_option_not_marked_secret: +# function_names: [oslo.config.cfg.StrOpt, oslo_config.cfg.StrOpt] +#ssl_with_bad_defaults: +# bad_protocol_versions: [PROTOCOL_SSLv2, SSLv2_METHOD, SSLv23_METHOD, PROTOCOL_SSLv3, +# PROTOCOL_TLSv1, SSLv3_METHOD, TLSv1_METHOD] +#ssl_with_bad_version: +# bad_protocol_versions: [PROTOCOL_SSLv2, SSLv2_METHOD, SSLv23_METHOD, PROTOCOL_SSLv3, +# PROTOCOL_TLSv1, SSLv3_METHOD, TLSv1_METHOD] +#start_process_with_a_shell: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# 
shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#start_process_with_no_shell: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#start_process_with_partial_path: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#subprocess_popen_with_shell_equals_true: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#subprocess_without_shell_equals_true: +# no_shell: [os.execl, os.execle, os.execlp, os.execlpe, os.execv, os.execve, os.execvp, +# os.execvpe, os.spawnl, os.spawnle, os.spawnlp, os.spawnlpe, os.spawnv, os.spawnve, +# os.spawnvp, os.spawnvpe, os.startfile] +# shell: [os.system, os.popen, os.popen2, os.popen3, os.popen4, popen2.popen2, popen2.popen3, +# popen2.popen4, popen2.Popen3, popen2.Popen4, commands.getoutput, commands.getstatusoutput] +# subprocess: [subprocess.Popen, subprocess.call, subprocess.check_call, subprocess.check_output, +# utils.execute, utils.execute_with_timeout] +#try_except_continue: {check_typed_exception: false} +#try_except_pass: {check_typed_exception: false} diff --git a/bin/swift-bench b/bin/swift-bench deleted file mode 100755 index b575b1952c..0000000000 --- a/bin/swift-bench +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import sys -import signal -import uuid -from optparse import OptionParser - -from swift.common.bench import (BenchController, DistributedBenchController, - create_containers, delete_containers) -from swift.common.utils import readconf, LogAdapter, config_true_value - -# The defaults should be sufficient to run swift-bench on a SAIO -CONF_DEFAULTS = { - 'auth': os.environ.get('ST_AUTH', ''), - 'user': os.environ.get('ST_USER', ''), - 'key': os.environ.get('ST_KEY', ''), - 'auth_version': '1.0', - 'use_proxy': 'yes', - 'put_concurrency': '10', - 'get_concurrency': '10', - 'del_concurrency': '10', - 'concurrency': '', # set all 3 in one shot - 'object_sources': '', # set of file contents to read and use for PUTs - 'lower_object_size': '10', # bounded random size used if these differ - 'upper_object_size': '10', - 'object_size': '1', # only if not object_sources and lower == upper - 'num_objects': '1000', - 'num_gets': '10000', - 'delete': 'yes', - 'container_name': uuid.uuid4().hex, # really "container name base" - 'num_containers': '20', - 'url': '', # used when use_proxy = no or overrides auth X-Storage-Url - 'account': '', # used when use_proxy = no - 'devices': 'sdb1', # space-sep list - 'log_level': 'INFO', - 'timeout': '10', - 'auth_version': '1.0', - 'bench_clients': [], -} - -SAIO_DEFAULTS = { - 'auth': 'http://localhost:8080/auth/v1.0', - 'user': 'test:tester', - 'key': 'testing', -} - -if __name__ == '__main__': - usage = "usage: %prog [OPTIONS] [CONF_FILE]" - usage += """\n\nConf file with SAIO defaults: - - [bench] - auth = http://localhost:8080/auth/v1.0 - user = test:tester - key = testing - concurrency = 10 - object_size = 1 - num_objects = 1000 - num_gets = 10000 - delete = yes - auth_version = 1.0 - """ - parser = OptionParser(usage=usage) - parser.add_option('', '--saio', dest='saio', action='store_true', - default=False, help='Run benchmark with SAIO defaults') - parser.add_option('-A', '--auth', dest='auth', - help='URL for obtaining an auth token') - parser.add_option('-U', '--user', dest='user', - help='User name for obtaining an auth token') - parser.add_option('-K', '--key', dest='key', - help='Key for obtaining an auth token') - parser.add_option('-b', '--bench-clients', action='append', - metavar=':', - help=('A string of the form ":" which matches ' - 'the arguments supplied to a swift-bench-client ' - 'process. 
This argument must be specified ' - 'once per swift-bench-client you want to ' - 'utilize.')) - parser.add_option('-u', '--url', dest='url', - help='Storage URL') - parser.add_option('-c', '--concurrency', dest='concurrency', - help='Number of concurrent connections to use') - parser.add_option('-s', '--object-size', dest='object_size', - help='Size of objects to PUT (in bytes)') - parser.add_option('-l', '--lower-object-size', dest='lower_object_size', - help=('Lower size of objects (in bytes); ' - '--object-size will be upper-object-size')) - parser.add_option('-n', '--num-objects', dest='num_objects', - help='Number of objects to PUT') - parser.add_option('-g', '--num-gets', dest='num_gets', - help='Number of GET operations to perform') - parser.add_option('-x', '--no-delete', dest='delete', action='store_false', - help='If set, will not delete the objects created') - parser.add_option('-V', '--auth_version', dest='auth_version', - help='Authentication version') - - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - options, args = parser.parse_args() - if options.saio: - CONF_DEFAULTS.update(SAIO_DEFAULTS) - if getattr(options, 'lower_object_size', None): - if options.object_size <= options.lower_object_size: - raise ValueError('--lower-object-size (%s) must be ' - '< --object-size (%s)' % - (options.lower_object_size, options.object_size)) - CONF_DEFAULTS['upper_object_size'] = options.object_size - if args: - conf = args[0] - if not os.path.exists(conf): - sys.exit("No such conf file: %s" % conf) - conf = readconf(conf, 'bench', log_name='swift-bench', - defaults=CONF_DEFAULTS) - conf['bench_clients'] = [] - else: - conf = CONF_DEFAULTS - parser.set_defaults(**conf) - options, _junk = parser.parse_args() - if options.concurrency is not '': - options.put_concurrency = options.concurrency - options.get_concurrency = options.concurrency - options.del_concurrency = options.concurrency - options.containers = ['%s_%d' % (options.container_name, i) - for i in xrange(int(options.num_containers))] - # check boolean options vs config parameter values - if config_true_value(str(options.delete).lower()): - options.delete = 'yes' - else: - options.delete = 'no' - - def sigterm(signum, frame): - sys.exit('Termination signal received.') - signal.signal(signal.SIGTERM, sigterm) - - logger = logging.getLogger() - logger.setLevel({ - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL}.get( - options.log_level.lower(), logging.INFO)) - loghandler = logging.StreamHandler() - logger.addHandler(loghandler) - logger = LogAdapter(logger, 'swift-bench') - logformat = logging.Formatter('%(server)s %(asctime)s %(levelname)s ' - '%(message)s') - loghandler.setFormatter(logformat) - - if options.use_proxy: - create_containers(logger, options) - - controller_class = DistributedBenchController if options.bench_clients \ - else BenchController - controller = controller_class(logger, options) - controller.run() - - if config_true_value(options.delete.lower()): - delete_containers(logger, options) diff --git a/bin/swift-bench-client b/bin/swift-bench-client deleted file mode 100755 index 9473b4560c..0000000000 --- a/bin/swift-bench-client +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import sys -import signal -from optparse import OptionParser - -from swift.common.bench import BenchServer -from swift.common.utils import LogAdapter - -if __name__ == '__main__': - usage = "usage: %prog " - usage += "\n\nRun a client for distributed swift-bench runs." - parser = OptionParser(usage=usage) - parser.add_option('-o', '--log-level', dest='log_level', - default='info', - help='Logging level (debug, info, etc)') - - if len(sys.argv) != 3: - parser.print_help() - sys.exit(1) - options, args = parser.parse_args() - - logger = logging.getLogger() - logger.setLevel({ - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL}.get( - options.log_level.lower(), logging.INFO)) - loghandler = logging.StreamHandler() - logger.addHandler(loghandler) - logger = LogAdapter(logger, 'swift-bench-client') - logformat = logging.Formatter('%(server)s %(asctime)s %(levelname)s ' - '%(message)s') - loghandler.setFormatter(logformat) - - def sigterm(signum, frame): - sys.exit('Termination signal received.') - signal.signal(signal.SIGTERM, sigterm) - signal.signal(signal.SIGINT, sigterm) - - server = BenchServer(logger, args[0], args[1]) - server.run() diff --git a/bin/swift-container-auditor b/bin/swift-container-auditor deleted file mode 100755 index b49a290f33..0000000000 --- a/bin/swift-container-auditor +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.container.auditor import ContainerAuditor -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon - -if __name__ == '__main__': - conf_file, options = parse_options(once=True) - run_daemon(ContainerAuditor, conf_file, **options) diff --git a/bin/swift-container-replicator b/bin/swift-container-replicator deleted file mode 100755 index 62b803c0c4..0000000000 --- a/bin/swift-container-replicator +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.container.replicator import ContainerReplicator -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon - -if __name__ == '__main__': - conf_file, options = parse_options(once=True) - run_daemon(ContainerReplicator, conf_file, **options) diff --git a/bin/swift-container-server b/bin/swift-container-server deleted file mode 100755 index b47dce017f..0000000000 --- a/bin/swift-container-server +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.common.utils import parse_options -from swift.common.wsgi import run_wsgi - -if __name__ == '__main__': - conf_file, options = parse_options() - run_wsgi(conf_file, 'container-server', default_port=6001, **options) diff --git a/bin/swift-container-sync b/bin/swift-container-sync deleted file mode 100755 index 7b6378c4c7..0000000000 --- a/bin/swift-container-sync +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.container.sync import ContainerSync -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon - -if __name__ == '__main__': - conf_file, options = parse_options(once=True) - run_daemon(ContainerSync, conf_file, **options) diff --git a/bin/swift-container-updater b/bin/swift-container-updater deleted file mode 100755 index b80cbda092..0000000000 --- a/bin/swift-container-updater +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from swift.container.updater import ContainerUpdater -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon - -if __name__ == '__main__': - conf_file, options = parse_options(once=True) - run_daemon(ContainerUpdater, conf_file, **options) diff --git a/bin/swift-dispersion-populate b/bin/swift-dispersion-populate deleted file mode 100755 index 1fe01adc0b..0000000000 --- a/bin/swift-dispersion-populate +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import traceback -from ConfigParser import ConfigParser -from cStringIO import StringIO -from sys import exit, argv, stdout -from time import time -from uuid import uuid4 - -from eventlet import GreenPool, patcher, sleep -from eventlet.pools import Pool - -from swiftclient import Connection, get_auth -from swift.common.ring import Ring -from swift.common.utils import compute_eta, get_time_units - - -def put_container(connpool, container, report): - global retries_done - try: - with connpool.item() as conn: - conn.put_container(container) - retries_done += conn.attempts - 1 - if report: - report(True) - except Exception: - if report: - report(False) - raise - - -def put_object(connpool, container, obj, report): - global retries_done - try: - with connpool.item() as conn: - conn.put_object(container, obj, StringIO(obj), - headers={'x-object-meta-dispersion': obj}) - retries_done += conn.attempts - 1 - if report: - report(True) - except Exception: - if report: - report(False) - raise - - -def report(success): - global begun, created, item_type, next_report, need_to_create, retries_done - if not success: - traceback.print_exc() - exit('Gave up due to error(s).') - created += 1 - if time() < next_report: - return - next_report = time() + 5 - eta, eta_unit = compute_eta(begun, created, need_to_create) - print '\r\x1B[KCreating %s: %d of %d, %d%s left, %d retries' % (item_type, - created, need_to_create, round(eta), eta_unit, retries_done), - stdout.flush() - - -if __name__ == '__main__': - global begun, created, item_type, next_report, need_to_create, retries_done - patcher.monkey_patch() - - conffile = '/etc/swift/dispersion.conf' - if len(argv) == 2: - conffile = argv[1] - elif len(argv) > 2: - exit('Syntax: %s [conffile]' % argv[0]) - c = ConfigParser() - if not c.read(conffile): - exit('Unable to read config file: %s' % conffile) - conf = dict(c.items('dispersion')) - swift_dir = conf.get('swift_dir', '/etc/swift') - dispersion_coverage = int(conf.get('dispersion_coverage', 1)) - retries = int(conf.get('retries', 5)) - concurrency = int(conf.get('concurrency', 25)) - - coropool = GreenPool(size=concurrency) - retries_done = 0 - - url, token = get_auth(conf['auth_url'], conf['auth_user'], - conf['auth_key'], - auth_version=conf.get('auth_version', '1.0')) - account = url.rsplit('/', 1)[1] - connpool = Pool(max_size=concurrency) - connpool.create = lambda: Connection(conf['auth_url'], - 
conf['auth_user'], conf['auth_key'], - retries=retries, - preauthurl=url, preauthtoken=token) - - container_ring = Ring(swift_dir, ring_name='container') - parts_left = dict((x, x) for x in xrange(container_ring.partition_count)) - item_type = 'containers' - created = 0 - retries_done = 0 - need_to_create = need_to_queue = \ - dispersion_coverage / 100.0 * container_ring.partition_count - begun = next_report = time() - next_report += 2 - while need_to_queue >= 1: - container = 'dispersion_%s' % uuid4().hex - part, _junk = container_ring.get_nodes(account, container) - if part in parts_left: - coropool.spawn(put_container, connpool, container, report) - sleep() - del parts_left[part] - need_to_queue -= 1 - coropool.waitall() - elapsed, elapsed_unit = get_time_units(time() - begun) - print '\r\x1B[KCreated %d containers for dispersion reporting, %d%s, %d ' \ - 'retries' % \ - (need_to_create, round(elapsed), elapsed_unit, retries_done) - stdout.flush() - - container = 'dispersion_objects' - put_container(connpool, container, None) - object_ring = Ring(swift_dir, ring_name='object') - parts_left = dict((x, x) for x in xrange(object_ring.partition_count)) - item_type = 'objects' - created = 0 - retries_done = 0 - need_to_create = need_to_queue = \ - dispersion_coverage / 100.0 * object_ring.partition_count - begun = next_report = time() - next_report += 2 - while need_to_queue >= 1: - obj = 'dispersion_%s' % uuid4().hex - part, _junk = object_ring.get_nodes(account, container, obj) - if part in parts_left: - coropool.spawn(put_object, connpool, container, obj, report) - sleep() - del parts_left[part] - need_to_queue -= 1 - coropool.waitall() - elapsed, elapsed_unit = get_time_units(time() - begun) - print '\r\x1B[KCreated %d objects for dispersion reporting, %d%s, %d ' \ - 'retries' % \ - (need_to_create, round(elapsed), elapsed_unit, retries_done) - stdout.flush() diff --git a/bin/swift-drive-audit b/bin/swift-drive-audit deleted file mode 100755 index 6c742a483e..0000000000 --- a/bin/swift-drive-audit +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime -import os -import re -import subprocess -import sys -from ConfigParser import ConfigParser - -from swift.common.utils import get_logger - - -# To search for more types of errors, add the regex to the list below -error_re = [ - re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'), - re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'), -] - - -def get_devices(device_dir, logger): - devices = [] - for line in open('/proc/mounts').readlines(): - data = line.strip().split() - block_device = data[0] - mount_point = data[1] - if mount_point.startswith(device_dir): - device = {} - device['mount_point'] = mount_point - device['block_device'] = block_device - try: - device_num = os.stat(block_device).st_rdev - except OSError, e: - # If we can't stat the device, then something weird is going on - logger.error("Error: Could not stat %s!" 
% - block_device) - continue - device['major'] = str(os.major(device_num)) - device['minor'] = str(os.minor(device_num)) - devices.append(device) - for line in open('/proc/partitions').readlines()[2:]: - major, minor, blocks, kernel_device = line.strip().split() - device = [d for d in devices - if d['major'] == major and d['minor'] == minor] - if device: - device[0]['kernel_device'] = kernel_device - return devices - - -def get_errors(minutes): - errors = {} - start_time = datetime.datetime.now() - datetime.timedelta(minutes=minutes) - try: - for line in open('/var/log/kern.log'): - if '[ 0.000000]' in line: - # Ignore anything before the last boot - errors = {} - continue - log_time_string = '%s %s' % (start_time.year, - ' '.join(line.split()[:3])) - log_time = datetime.datetime.strptime( - log_time_string, '%Y %b %d %H:%M:%S') - if log_time > start_time: - for err in error_re: - for device in err.findall(line): - errors[device] = errors.get(device, 0) + 1 - return errors - except IOError: - logger.error("Error: Unable to open /var/log/kern.log") - print("Unable to open /var/log/kern.log") - sys.exit(1) - - -def comment_fstab(mount_point): - with open('/etc/fstab', 'r') as fstab: - with open('/etc/fstab.new', 'w') as new_fstab: - for line in fstab: - parts = line.split() - if len(parts) > 2 and line.split()[1] == mount_point: - new_fstab.write('#' + line) - else: - new_fstab.write(line) - os.rename('/etc/fstab.new', '/etc/fstab') - - -if __name__ == '__main__': - c = ConfigParser() - try: - conf_path = sys.argv[1] - except Exception: - print "Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1] - sys.exit(1) - if not c.read(conf_path): - print "Unable to read config file %s" % conf_path - sys.exit(1) - conf = dict(c.items('drive-audit')) - device_dir = conf.get('device_dir', '/srv/node') - minutes = int(conf.get('minutes', 60)) - error_limit = int(conf.get('error_limit', 1)) - conf['log_name'] = conf.get('log_name', 'drive-audit') - logger = get_logger(conf, log_route='drive-audit') - devices = get_devices(device_dir, logger) - logger.debug("Devices found: %s" % str(devices)) - if not devices: - logger.error("Error: No devices found!") - errors = get_errors(minutes) - logger.debug("Errors found: %s" % str(errors)) - unmounts = 0 - for kernel_device, count in errors.items(): - if count >= error_limit: - device = \ - [d for d in devices if d['kernel_device'] == kernel_device] - if device: - mount_point = device[0]['mount_point'] - if mount_point.startswith(device_dir): - logger.info("Unmounting %s with %d errors" % - (mount_point, count)) - subprocess.call(['umount', '-fl', mount_point]) - logger.info("Commenting out %s from /etc/fstab" % - (mount_point)) - comment_fstab(mount_point) - unmounts += 1 - if unmounts == 0: - logger.info("No drives were unmounted") diff --git a/bin/swift-form-signature b/bin/swift-form-signature deleted file mode 100755 index 08a5adce53..0000000000 --- a/bin/swift-form-signature +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python - -import hmac -from hashlib import sha1 -from os.path import basename -from sys import argv, exit -from time import time - - -if __name__ == '__main__': - if len(argv) != 7: - prog = basename(argv[0]) - print 'Syntax: %s ' \ - ' ' % prog - print - print 'Where:' - print ' The prefix to use for form uploaded' - print ' objects. For example:' - print ' /v1/account/container/object_prefix_ would' - print ' ensure all form uploads have that path' - print ' prepended to the browser-given file name.' 
- print ' The URL to redirect the browser to after' - print ' the uploads have completed.' - print ' The maximum file size per file uploaded.' - print ' The maximum number of uploaded files' - print ' allowed.' - print ' The number of seconds from now to allow' - print ' the form post to begin.' - print ' The X-Account-Meta-Temp-URL-Key for the' - print ' account.' - print - print 'Example output:' - print ' Expires: 1323842228' - print ' Signature: 18de97e47345a82c4dbfb3b06a640dbb' - exit(1) - path, redirect, max_file_size, max_file_count, seconds, key = argv[1:] - try: - max_file_size = int(max_file_size) - except ValueError: - max_file_size = -1 - if max_file_size < 0: - print 'Please use a value greater than or equal to 0.' - exit(1) - try: - max_file_count = int(max_file_count) - except ValueError: - max_file_count = 0 - if max_file_count < 1: - print 'Please use a positive value.' - exit(1) - try: - expires = int(time() + int(seconds)) - except ValueError: - expires = 0 - if expires < 1: - print 'Please use a positive value.' - exit(1) - parts = path.split('/', 4) - # Must be four parts, ['', 'v1', 'a', 'c'], must be a v1 request, have - # account and container values, and optionally have an object prefix. - if len(parts) < 4 or parts[0] or parts[1] != 'v1' or not parts[2] or \ - not parts[3]: - print ' must point to a container at least.' - print 'For example: /v1/account/container' - print ' Or: /v1/account/container/object_prefix' - exit(1) - sig = hmac.new(key, '%s\n%s\n%s\n%s\n%s' % (path, redirect, max_file_size, - max_file_count, expires), - sha1).hexdigest() - print ' Expires:', expires - print 'Signature:', sig diff --git a/bin/swift-get-nodes b/bin/swift-get-nodes deleted file mode 100755 index fcf45ae77b..0000000000 --- a/bin/swift-get-nodes +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import optparse -import sys -import urllib - -from swift.common.ring import Ring -from swift.common.utils import hash_path - - -parser = optparse.OptionParser() -parser.add_option('-a', '--all', action='store_true', - help='Show all handoff nodes') -parser.add_option('-p', '--partition', metavar='PARTITION', - help='Show nodes for a given partition') -(options, args) = parser.parse_args() - -if (len(args) < 2 or len(args) > 4) and \ - (options.partition is None or not args): - print 'Usage: %s [-a] [] []' \ - % sys.argv[0] - print ' Or: %s [-a] -p partition' % sys.argv[0] - print 'Shows the nodes responsible for the item specified.' 
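For reference, the bin/swift-form-signature script removed above derives its FormPost signature as an HMAC-SHA1 over the newline-joined path, redirect, max_file_size, max_file_count and expires values. A minimal Python 3 sketch of that same calculation follows; the path, key and limits are illustrative placeholders, not values taken from this patch.

    import hmac
    from hashlib import sha1
    from time import time

    # Illustrative inputs -- in practice these describe the form being signed.
    path = '/v1/AUTH_test/container/object_prefix_'
    redirect = ''
    max_file_size = 104857600
    max_file_count = 10
    expires = int(time() + 600)
    key = 'mykey'  # the account's X-Account-Meta-Temp-URL-Key

    # Same newline-joined payload the removed script passes to hmac.new()
    hmac_body = '%s\n%s\n%s\n%s\n%s' % (path, redirect, max_file_size,
                                        max_file_count, expires)
    signature = hmac.new(key.encode('utf-8'), hmac_body.encode('utf-8'),
                         sha1).hexdigest()
    print('Expires:', expires)
    print('Signature:', signature)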
- print 'Example:' - print ' $ %s /etc/swift/account.ring.gz MyAccount' % sys.argv[0] - print ' Partition 5743883' - print ' Hash 96ae332a60b58910784e4417a03e1ad0' - print ' 10.1.1.7:8000 sdd1' - print ' 10.1.9.2:8000 sdb1' - print ' 10.1.5.5:8000 sdf1' - print ' 10.1.5.9:8000 sdt1 # [Handoff]' - sys.exit(1) - -ringloc = None -account = None -container = None -obj = None - -if len(args) == 4: - # Account, Container and Object - ring_file, account, container, obj = args - ring = Ring(ring_file) - hash_str = hash_path(account, container, obj) - part, nodes = ring.get_nodes(account, container, obj) - target = "%s/%s/%s" % (account, container, obj) - loc = 'objects' -elif len(args) == 3: - # Account, Container - ring_file, account, container = args - ring = Ring(ring_file) - hash_str = hash_path(account, container) - part, nodes = ring.get_nodes(account, container) - target = "%s/%s" % (account, container) - loc = 'containers' -elif len(args) == 2: - # Account - ring_file, account = args - ring = Ring(ring_file) - hash_str = hash_path(account) - part, nodes = ring.get_nodes(account) - target = "%s" % (account) - loc = 'accounts' -elif len(args) == 1: - # Partition - ring_file = args[0] - ring = Ring(ring_file) - hash_str = None - part = int(options.partition) - nodes = ring.get_part_nodes(part) - target = '' - loc = ring_file.rsplit('/', 1)[-1].split('.', 1)[0] - if loc in ('account', 'container', 'object'): - loc += 's' - else: - loc = '' - -more_nodes = [] -for more_node in ring.get_more_nodes(part): - more_nodes.append(more_node) - if not options.all and len(more_nodes) >= ring.replica_count: - break - -print '\nAccount \t%s' % account -print 'Container\t%s' % container -print 'Object \t%s\n' % obj -print '\nPartition\t%s' % part -print 'Hash \t%s\n' % hash_str - -for node in nodes: - print 'Server:Port Device\t%s:%s %s' % (node['ip'], node['port'], - node['device']) -for mnode in more_nodes: - print 'Server:Port Device\t%s:%s %s\t [Handoff]' \ - % (mnode['ip'], mnode['port'], mnode['device']) -print "\n" -for node in nodes: - print 'curl -I -XHEAD "http://%s:%s/%s/%s/%s"' \ - % (node['ip'], node['port'], node['device'], part, - urllib.quote(target)) -for mnode in more_nodes: - print 'curl -I -XHEAD "http://%s:%s/%s/%s/%s" # [Handoff]' \ - % (mnode['ip'], mnode['port'], mnode['device'], part, - urllib.quote(target)) -print "\n" -for node in nodes: - if hash_str: - print 'ssh %s "ls -lah /srv/node/%s/%s/%s/%s/%s/"' % ( - node['ip'], node['device'], loc, part, hash_str[-3:], hash_str) - else: - print 'ssh %s "ls -lah /srv/node/%s/%s/%s/"' % ( - node['ip'], node['device'], loc, part) -for mnode in more_nodes: - if hash_str: - print 'ssh %s "ls -lah /srv/node/%s/%s/%s/%s/%s/" # [Handoff]' % ( - mnode['ip'], mnode['device'], loc, part, hash_str[-3:], hash_str) - else: - print 'ssh %s "ls -lah /srv/node/%s/%s/%s/" # [Handoff]' % ( - mnode['ip'], mnode['device'], loc, part) diff --git a/bin/swift-init b/bin/swift-init deleted file mode 100755 index 17dcbd0be1..0000000000 --- a/bin/swift-init +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -from optparse import OptionParser - -from swift.common.manager import Server, Manager, UnknownCommandError - -USAGE = """%prog [ ...] [options] - -Commands: -""" + '\n'.join(["%16s: %s" % x for x in Manager.list_commands()]) - - -def main(): - parser = OptionParser(USAGE) - parser.add_option('-v', '--verbose', action="store_true", - default=False, help="display verbose output") - parser.add_option('-w', '--no-wait', action="store_false", dest="wait", - default=True, help="won't wait for server to start " - "before returning") - parser.add_option('-o', '--once', action="store_true", - default=False, help="only run one pass of daemon") - # this is a negative option, default is options.daemon = True - parser.add_option('-n', '--no-daemon', action="store_false", dest="daemon", - default=True, help="start server interactively") - parser.add_option('-g', '--graceful', action="store_true", - default=False, help="send SIGHUP to supporting servers") - parser.add_option('-c', '--config-num', metavar="N", type="int", - dest="number", default=0, - help="send command to the Nth server only") - options, args = parser.parse_args() - - if len(args) < 2: - parser.print_help() - print 'ERROR: specify server(s) and command' - return 1 - - command = args[-1] - servers = args[:-1] - - # this is just a silly swap for me cause I always try to "start main" - commands = dict(Manager.list_commands()).keys() - if command not in commands and servers[0] in commands: - servers.append(command) - command = servers.pop(0) - - manager = Manager(servers) - try: - status = manager.run_command(command, **options.__dict__) - except UnknownCommandError: - parser.print_help() - print 'ERROR: unknown command, %s' % command - status = 1 - - return 1 if status else 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/swift-object-auditor b/bin/swift-object-auditor deleted file mode 100755 index 2a2d38bfcd..0000000000 --- a/bin/swift-object-auditor +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
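The bin/swift-init wrapper deleted above is a thin shell around swift.common.manager.Manager, with the CLI flags passed straight through as keyword arguments to run_command(). Below is a rough sketch of the programmatic equivalent of `swift-init object-auditor start`, assuming Manager keeps the interface used by the removed script; the server name and flag values are illustrative.

    # Sketch only: mirrors manager.run_command(command, **options.__dict__)
    # from the removed bin/swift-init. Kwargs correspond to -o/-w/-n/-g/-c/-v.
    from swift.common.manager import Manager

    manager = Manager(['object-auditor'])
    status = manager.run_command('start', once=False, wait=True, daemon=True,
                                 graceful=False, number=0, verbose=False)
    raise SystemExit(1 if status else 0)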
- -from swift.obj.auditor import ObjectAuditor -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon -from optparse import OptionParser - -if __name__ == '__main__': - parser = OptionParser("%prog CONFIG [options]") - parser.add_option('-z', '--zero_byte_fps', - help='Audit only zero byte files at specified files/sec') - conf_file, options = parse_options(parser=parser, once=True) - run_daemon(ObjectAuditor, conf_file, **options) diff --git a/bin/swift-object-info b/bin/swift-object-info deleted file mode 100755 index bf612cc5b7..0000000000 --- a/bin/swift-object-info +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import cPickle as pickle -from datetime import datetime -from hashlib import md5 - -from swift.common.ring import Ring -from swift.obj.server import read_metadata -from swift.common.utils import hash_path - -if __name__ == '__main__': - if len(sys.argv) <= 1: - print "Usage: %s OBJECT_FILE" % sys.argv[0] - sys.exit(1) - try: - ring = Ring('/etc/swift/', ring_name='object') - except Exception: - ring = None - datafile = sys.argv[1] - fp = open(datafile, 'rb') - metadata = read_metadata(fp) - path = metadata.pop('name', '') - content_type = metadata.pop('Content-Type', '') - ts = metadata.pop('X-Timestamp', '') - etag = metadata.pop('ETag', '') - length = metadata.pop('Content-Length', '') - if path: - print 'Path: %s' % path - account, container, obj = path.split('/', 3)[1:] - print ' Account: %s' % account - print ' Container: %s' % container - print ' Object: %s' % obj - obj_hash = hash_path(account, container, obj) - print ' Object hash: %s' % obj_hash - if ring is not None: - print 'Ring locations:' - part, nodes = ring.get_nodes(account, container, obj) - for node in nodes: - print (' %s:%s - /srv/node/%s/objects/%s/%s/%s/%s.data' % - (node['ip'], node['port'], node['device'], part, - obj_hash[-3:], obj_hash, ts)) - else: - print 'Path: Not found in metadata' - if content_type: - print 'Content-Type: %s' % content_type - else: - print 'Content-Type: Not found in metadata' - if ts: - print 'Timestamp: %s (%s)' % (datetime.fromtimestamp(float(ts)), ts) - else: - print 'Timestamp: Not found in metadata' - h = md5() - file_len = 0 - while True: - data = fp.read(64 * 1024) - if not data: - break - h.update(data) - file_len += len(data) - h = h.hexdigest() - if etag: - if h == etag: - print 'ETag: %s (valid)' % etag - else: - print "Etag: %s doesn't match file hash of %s!" 
% (etag, h) - else: - print 'ETag: Not found in metadata' - if length: - if file_len == int(length): - print 'Content-Length: %s (valid)' % length - else: - print "Content-Length: %s doesn't match file length of %s" % ( - length, file_len) - else: - print 'Content-Length: Not found in metadata' - print 'User Metadata: %s' % metadata - fp.close() diff --git a/bin/swift-object-replicator b/bin/swift-object-replicator deleted file mode 100755 index 2f01a209a3..0000000000 --- a/bin/swift-object-replicator +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.obj.replicator import ObjectReplicator -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon -from optparse import OptionParser - -if __name__ == '__main__': - parser = OptionParser("%prog CONFIG [options]") - parser.add_option('-d', '--devices', - help='Replicate only given devices. ' - 'Comma-separated list') - parser.add_option('-p', '--partitions', - help='Replicate only given partitions. ' - 'Comma-separated list') - conf_file, options = parse_options(parser=parser, once=True) - run_daemon(ObjectReplicator, conf_file, **options) diff --git a/bin/swift-object-updater b/bin/swift-object-updater deleted file mode 100755 index c7f04c965d..0000000000 --- a/bin/swift-object-updater +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from swift.obj.updater import ObjectUpdater -from swift.common.utils import parse_options -from swift.common.daemon import run_daemon - -if __name__ == '__main__': - conf_file, options = parse_options(once=True) - run_daemon(ObjectUpdater, conf_file, **options) diff --git a/bin/swift-oldies b/bin/swift-oldies deleted file mode 100755 index 15f34e5281..0000000000 --- a/bin/swift-oldies +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -import optparse -import os -import signal -import subprocess -import sys - - -if __name__ == '__main__': - parser = optparse.OptionParser(usage='''%prog [options] - -Lists old Swift processes. 
- '''.strip()) - parser.add_option('-a', '--age', dest='hours', type='int', default=720, - help='look for processes at least HOURS old; ' - 'default: 720 (30 days)') - (options, args) = parser.parse_args() - - listing = [] - for line in subprocess.Popen( - ['ps', '-eo', 'etime,pid,args', '--no-headers'], - stdout=subprocess.PIPE).communicate()[0].split('\n'): - if not line: - continue - hours = 0 - try: - etime, pid, args = line.split(None, 2) - except ValueError: - sys.exit('Could not process ps line %r' % line) - if not args.startswith('/usr/bin/python /usr/bin/swift-') and \ - not args.startswith('/usr/bin/python /usr/local/bin/swift-'): - continue - args = args.split('-', 1)[1] - etime = etime.split('-') - if len(etime) == 2: - hours = int(etime[0]) * 24 - etime = etime[1] - elif len(etime) == 1: - etime = etime[0] - else: - sys.exit('Could not process etime value from %r' % line) - etime = etime.split(':') - if len(etime) == 3: - hours += int(etime[0]) - elif len(etime) != 2: - sys.exit('Could not process etime value from %r' % line) - if hours >= options.hours: - listing.append((str(hours), pid, args)) - - if not listing: - exit() - - hours_len = len('Hours') - pid_len = len('PID') - args_len = len('Command') - for hours, pid, args in listing: - hours_len = max(hours_len, len(hours)) - pid_len = max(pid_len, len(pid)) - args_len = max(args_len, len(args)) - args_len = min(args_len, 78 - hours_len - pid_len) - - print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ - ('Hours', 'PID', 'Command') - for hours, pid, args in listing: - print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ - (hours, pid, args[:args_len]) diff --git a/bin/swift-recon b/bin/swift-recon deleted file mode 100755 index b72cc2841d..0000000000 --- a/bin/swift-recon +++ /dev/null @@ -1,765 +0,0 @@ -#! /usr/bin/env python -""" - cmdline utility to perform cluster reconnaissance -""" - - -from eventlet.green import urllib2 -from swift.common.ring import Ring -from urlparse import urlparse -try: - import simplejson as json -except ImportError: - import json -from hashlib import md5 -import eventlet -import optparse -import time -import sys -import os - - -class Scout(object): - """ - Obtain swift recon information - """ - - def __init__(self, recon_type, verbose=False, suppress_errors=False, - timeout=5): - self.recon_type = recon_type - self.verbose = verbose - self.suppress_errors = suppress_errors - self.timeout = timeout - - def scout_host(self, base_url, recon_type): - """ - Perform the actual HTTP request to obtain swift recon telemtry. - - :param base_url: the base url of the host you wish to check. str of the - format 'http://127.0.0.1:6000/recon/' - :param recon_type: the swift recon check to request. - :returns: tuple of (recon url used, response body, and status) - """ - url = base_url + recon_type - try: - body = urllib2.urlopen(url, timeout=self.timeout).read() - content = json.loads(body) - if self.verbose: - print "-> %s: %s" % (url, content) - status = 200 - except urllib2.HTTPError as err: - if not self.suppress_errors or self.verbose: - print "-> %s: %s" % (url, err) - content = err - status = err.code - except urllib2.URLError as err: - if not self.suppress_errors or self.verbose: - print "-> %s: %s" % (url, err) - content = err - status = -1 - return url, content, status - - def scout(self, host): - """ - Obtain telemetry from a host running the swift recon middleware. 
- - :param host: host to check - :returns: tuple of (recon url used, response body, and status) - """ - base_url = "http://%s:%s/recon/" % (host[0], host[1]) - url, content, status = self.scout_host(base_url, self.recon_type) - return url, content, status - - -class SwiftRecon(object): - """ - Retrieve and report cluster info from hosts running recon middleware. - """ - - def __init__(self): - self.verbose = False - self.suppress_errors = False - self.timeout = 5 - self.pool_size = 30 - self.pool = eventlet.GreenPool(self.pool_size) - self.check_types = ['account', 'container', 'object'] - self.server_type = 'object' - - def _gen_stats(self, stats, name=None): - """ compute various stats from a list of values """ - cstats = [x for x in stats if x is not None] - if len(cstats) > 0: - ret_dict = {'low': min(cstats), 'high': max(cstats), - 'total': sum(cstats), 'reported': len(cstats), - 'number_none': len(stats) - len(cstats), 'name': name} - ret_dict['average'] = \ - ret_dict['total'] / float(len(cstats)) - ret_dict['perc_none'] = \ - ret_dict['number_none'] * 100.0 / len(stats) - else: - ret_dict = {'reported': 0} - return ret_dict - - def _print_stats(self, stats): - """ - print out formatted stats to console - - :param stats: dict of stats generated by _gen_stats - """ - print '[%(name)s] low: %(low)d, high: %(high)d, avg: ' \ - '%(average).1f, total: %(total)d, ' \ - 'Failed: %(perc_none).1f%%, no_result: %(number_none)d, ' \ - 'reported: %(reported)d' % stats - - def _ptime(self, timev=None): - """ - :param timev: a unix timestamp or None - :returns: a pretty string of the current time or provided time - """ - if timev: - return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev)) - else: - return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - - def get_devices(self, zone_filter, swift_dir, ring_name): - """ - Get a list of hosts in the ring - - :param zone_filter: Only list zones matching given filter - :param swift_dir: Directory of swift config, usually /etc/swift - :param ring_name: Name of the ring, such as 'object' - :returns: a set of tuples containing the ip and port of hosts - """ - ring_data = Ring(swift_dir, ring_name=ring_name) - if zone_filter: - ips = set((n['ip'], n['port']) for n in ring_data.devs - if n and n['zone'] == zone_filter) - else: - ips = set((n['ip'], n['port']) for n in ring_data.devs if n) - return ips - - def get_ringmd5(self, hosts, ringfile): - """ - Compare ring md5sum's with those on remote host - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - :param ringfile: The local ring file to compare the md5sum with. - """ - stats = {} - matches = 0 - errors = 0 - md5sum = md5() - with open(ringfile, 'rb') as f: - block = f.read(4096) - while block: - md5sum.update(block) - block = f.read(4096) - ring_sum = md5sum.hexdigest() - recon = Scout("ringmd5", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking ring md5sums" % self._ptime() - if self.verbose: - print "-> On disk %s md5sum: %s" % (ringfile, ring_sum) - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - stats[url] = response[ringfile] - if response[ringfile] != ring_sum: - print "!! %s (%s) doesn't match on disk md5sum" % \ - (url, response[ringfile]) - else: - matches = matches + 1 - if self.verbose: - print "-> %s matches." % url - else: - errors = errors + 1 - print "%s/%s hosts matched, %s error[s] while checking hosts." 
\ - % (matches, len(hosts), errors) - print "=" * 79 - - def async_check(self, hosts): - """ - Obtain and print async pending statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - scan = {} - recon = Scout("async", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking async pendings" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - scan[url] = response['async_pending'] - stats = self._gen_stats(scan.values(), 'async_pending') - if stats['reported'] > 0: - self._print_stats(stats) - else: - print "[async_pending] - No hosts returned valid data." - print "=" * 79 - - def umount_check(self, hosts): - """ - Check for and print unmounted drives - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {} - recon = Scout("unmounted", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Getting unmounted drives from %s hosts..." % \ - (self._ptime(), len(hosts)) - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - stats[url] = [] - for i in response: - stats[url].append(i['device']) - for host in stats: - node = urlparse(host).netloc - for entry in stats[host]: - print "Not mounted: %s on %s" % (entry, node) - print "=" * 79 - - def expirer_check(self, hosts): - """ - Obtain and print expirer statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {'object_expiration_pass': [], 'expired_last_pass': []} - recon = Scout("expirer/%s" % self.server_type, self.verbose, - self.suppress_errors, self.timeout) - print "[%s] Checking on expirers" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - stats['object_expiration_pass'].append( - response.get('object_expiration_pass')) - stats['expired_last_pass'].append( - response.get('expired_last_pass')) - for k in stats: - if stats[k]: - computed = self._gen_stats(stats[k], name=k) - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[%s] - No hosts returned valid data." % k - else: - print "[%s] - No hosts returned valid data." % k - print "=" * 79 - - def replication_check(self, hosts): - """ - Obtain and print replication statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {'replication_time': [], 'failure': [], 'success': [], - 'attempted': []} - recon = Scout("replication/%s" % self.server_type, self.verbose, - self.suppress_errors, self.timeout) - print "[%s] Checking on replication" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - stats['replication_time'].append( - response.get('replication_time')) - repl_stats = response['replication_stats'] - if repl_stats: - for stat_key in ['attempted', 'failure', 'success']: - stats[stat_key].append(repl_stats.get(stat_key)) - for k in stats: - if stats[k]: - if k != 'replication_time': - computed = self._gen_stats(stats[k], - name='replication_%s' % k) - else: - computed = self._gen_stats(stats[k], name=k) - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[%s] - No hosts returned valid data." % k - else: - print "[%s] - No hosts returned valid data." 
% k - print "=" * 79 - - def object_replication_check(self, hosts): - """ - Obtain and print replication statistics from object servers - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {} - recon = Scout("replication", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking on replication" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - stats[url] = response['object_replication_time'] - times = [x for x in stats.values() if x is not None] - if len(stats) > 0 and len(times) > 0: - computed = self._gen_stats(times, 'replication_time') - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[replication_time] - No hosts returned valid data." - else: - print "[replication_time] - No hosts returned valid data." - print "=" * 79 - - def updater_check(self, hosts): - """ - Obtain and print updater statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = [] - recon = Scout("updater/%s" % self.server_type, self.verbose, - self.suppress_errors, self.timeout) - print "[%s] Checking updater times" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - if response['%s_updater_sweep' % self.server_type]: - stats.append(response['%s_updater_sweep' % - self.server_type]) - if len(stats) > 0: - computed = self._gen_stats(stats, name='updater_last_sweep') - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[updater_last_sweep] - No hosts returned valid data." - else: - print "[updater_last_sweep] - No hosts returned valid data." - print "=" * 79 - - def auditor_check(self, hosts): - """ - Obtain and print obj auditor statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - scan = {} - adone = '%s_auditor_pass_completed' % self.server_type - afail = '%s_audits_failed' % self.server_type - apass = '%s_audits_passed' % self.server_type - asince = '%s_audits_since' % self.server_type - recon = Scout("auditor/%s" % self.server_type, self.verbose, - self.suppress_errors, self.timeout) - print "[%s] Checking auditor stats" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - scan[url] = response - if len(scan) < 1: - print "Error: No hosts available" - return - stats = {} - stats[adone] = [scan[i][adone] for i in scan - if scan[i][adone] is not None] - stats[afail] = [scan[i][afail] for i in scan - if scan[i][afail] is not None] - stats[apass] = [scan[i][apass] for i in scan - if scan[i][apass] is not None] - stats[asince] = [scan[i][asince] for i in scan - if scan[i][asince] is not None] - for k in stats: - if len(stats[k]) < 1: - print "[%s] - No hosts returned valid data." % k - else: - if k != asince: - computed = self._gen_stats(stats[k], k) - if computed['reported'] > 0: - self._print_stats(computed) - if len(stats[asince]) >= 1: - low = min(stats[asince]) - high = max(stats[asince]) - total = sum(stats[asince]) - average = total / len(stats[asince]) - print '[last_pass] oldest: %s, newest: %s, avg: %s' % \ - (self._ptime(low), self._ptime(high), self._ptime(average)) - print "=" * 79 - - def object_auditor_check(self, hosts): - """ - Obtain and print obj auditor statistics - - :param hosts: set of hosts to check. 
in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - all_scan = {} - zbf_scan = {} - atime = 'audit_time' - bprocessed = 'bytes_processed' - passes = 'passes' - errors = 'errors' - quarantined = 'quarantined' - recon = Scout("auditor/object", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking auditor stats " % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - if response['object_auditor_stats_ALL']: - all_scan[url] = response['object_auditor_stats_ALL'] - if response['object_auditor_stats_ZBF']: - zbf_scan[url] = response['object_auditor_stats_ZBF'] - if len(all_scan) > 0: - stats = {} - stats[atime] = [all_scan[i][atime] for i in all_scan] - stats[bprocessed] = [all_scan[i][bprocessed] for i in all_scan] - stats[passes] = [all_scan[i][passes] for i in all_scan] - stats[errors] = [all_scan[i][errors] for i in all_scan] - stats[quarantined] = [all_scan[i][quarantined] for i in all_scan] - for k in stats: - if None in stats[k]: - stats[k] = [x for x in stats[k] if x is not None] - if len(stats[k]) < 1: - print "[Auditor %s] - No hosts returned valid data." % k - else: - computed = self._gen_stats(stats[k], - name='ALL_%s_last_path' % k) - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[ALL_auditor] - No hosts returned valid data." - else: - print "[ALL_auditor] - No hosts returned valid data." - if len(zbf_scan) > 0: - stats = {} - stats[atime] = [zbf_scan[i][atime] for i in zbf_scan] - stats[bprocessed] = [zbf_scan[i][bprocessed] for i in zbf_scan] - stats[errors] = [zbf_scan[i][errors] for i in zbf_scan] - stats[quarantined] = [zbf_scan[i][quarantined] for i in zbf_scan] - for k in stats: - if None in stats[k]: - stats[k] = [x for x in stats[k] if x is not None] - if len(stats[k]) < 1: - print "[Auditor %s] - No hosts returned valid data." % k - else: - computed = self._gen_stats(stats[k], - name='ZBF_%s_last_path' % k) - if computed['reported'] > 0: - self._print_stats(computed) - else: - print "[ZBF_auditor] - No hosts returned valid data." - else: - print "[ZBF_auditor] - No hosts returned valid data." - print "=" * 79 - - def load_check(self, hosts): - """ - Obtain and print load average statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - load1 = {} - load5 = {} - load15 = {} - recon = Scout("load", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking load averages" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - load1[url] = response['1m'] - load5[url] = response['5m'] - load15[url] = response['15m'] - stats = {"1m": load1, "5m": load5, "15m": load15} - for item in stats: - if len(stats[item]) > 0: - computed = self._gen_stats(stats[item].values(), - name='%s_load_avg' % item) - self._print_stats(computed) - else: - print "[%s_load_avg] - No hosts returned valid data." % item - print "=" * 79 - - def quarantine_check(self, hosts): - """ - Obtain and print quarantine statistics - - :param hosts: set of hosts to check. 
in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - objq = {} - conq = {} - acctq = {} - recon = Scout("quarantined", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking quarantine" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - objq[url] = response['objects'] - conq[url] = response['containers'] - acctq[url] = response['accounts'] - stats = {"objects": objq, "containers": conq, "accounts": acctq} - for item in stats: - if len(stats[item]) > 0: - computed = self._gen_stats(stats[item].values(), - name='quarantined_%s' % item) - self._print_stats(computed) - else: - print "No hosts returned valid data." - print "=" * 79 - - def socket_usage(self, hosts): - """ - Obtain and print /proc/net/sockstat statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - inuse4 = {} - mem = {} - inuse6 = {} - timewait = {} - orphan = {} - recon = Scout("sockstat", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking socket usage" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - inuse4[url] = response['tcp_in_use'] - mem[url] = response['tcp_mem_allocated_bytes'] - inuse6[url] = response['tcp6_in_use'] - timewait[url] = response['time_wait'] - orphan[url] = response['orphan'] - stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, - "tcp6_in_use": inuse6, "time_wait": timewait, - "orphan": orphan} - for item in stats: - if len(stats[item]) > 0: - computed = self._gen_stats(stats[item].values(), item) - self._print_stats(computed) - else: - print "No hosts returned valid data." - print "=" * 79 - - def disk_usage(self, hosts, top=0): - """ - Obtain and print disk usage statistics - - :param hosts: set of hosts to check. in the format of: - set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) - """ - stats = {} - highs = [] - lows = [] - raw_total_used = [] - raw_total_avail = [] - percents = {} - top_percents = [(None, 0)] * top - recon = Scout("diskusage", self.verbose, self.suppress_errors, - self.timeout) - print "[%s] Checking disk usage now" % self._ptime() - for url, response, status in self.pool.imap(recon.scout, hosts): - if status == 200: - hostusage = [] - for entry in response: - if entry['mounted']: - used = float(entry['used']) / float(entry['size']) \ - * 100.0 - raw_total_used.append(entry['used']) - raw_total_avail.append(entry['avail']) - hostusage.append(round(used, 2)) - for ident, oused in top_percents: - if oused < used: - top_percents.append( - (url + ' ' + entry['device'], used)) - top_percents.sort(key=lambda x: -x[1]) - top_percents.pop() - break - stats[url] = hostusage - - for url in stats: - if len(stats[url]) > 0: - #get per host hi/los for another day - low = min(stats[url]) - high = max(stats[url]) - highs.append(high) - lows.append(low) - for percent in stats[url]: - percents[int(percent)] = percents.get(int(percent), 0) + 1 - else: - print "-> %s: Error. No drive info available." 
% url - - if len(lows) > 0: - low = min(lows) - high = max(highs) - #dist graph shamelessly stolen from https://github.com/gholt/tcod - print "Distribution Graph:" - mul = 69.0 / max(percents.values()) - for percent in sorted(percents): - print '% 3d%%%5d %s' % (percent, percents[percent], - '*' * int(percents[percent] * mul)) - raw_used = sum(raw_total_used) - raw_avail = sum(raw_total_avail) - raw_total = raw_used + raw_avail - avg_used = 100.0 * raw_used / raw_total - print "Disk usage: space used: %s of %s" % (raw_used, raw_total) - print "Disk usage: space free: %s of %s" % (raw_avail, raw_total) - print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \ - (low, high, avg_used) - else: - print "No hosts returned valid data." - print "=" * 79 - if top_percents: - print 'TOP %s' % top - for ident, used in top_percents: - if ident: - url, device = ident.split() - host = urlparse(url).netloc.split(':')[0] - print '%.02f%% %s' % (used, '%-15s %s' % (host, device)) - - def main(self): - """ - Retrieve and report cluster info from hosts running recon middleware. - """ - print "=" * 79 - usage = ''' - usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] - [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] - - \taccount|container|object - Defaults to object server. - - ex: %prog container -l --auditor - ''' - args = optparse.OptionParser(usage) - args.add_option('--verbose', '-v', action="store_true", - help="Print verbose info") - args.add_option('--suppress', action="store_true", - help="Suppress most connection related errors") - args.add_option('--async', '-a', action="store_true", - help="Get async stats") - args.add_option('--replication', '-r', action="store_true", - help="Get replication stats") - args.add_option('--auditor', action="store_true", - help="Get auditor stats") - args.add_option('--updater', action="store_true", - help="Get updater stats") - args.add_option('--expirer', action="store_true", - help="Get expirer stats") - args.add_option('--unmounted', '-u', action="store_true", - help="Check cluster for unmounted devices") - args.add_option('--diskusage', '-d', action="store_true", - help="Get disk usage stats") - args.add_option('--loadstats', '-l', action="store_true", - help="Get cluster load average stats") - args.add_option('--quarantined', '-q', action="store_true", - help="Get cluster quarantine stats") - args.add_option('--md5', action="store_true", - help="Get md5sum of servers ring and compare to " - "local copy") - args.add_option('--sockstat', action="store_true", - help="Get cluster socket usage stats") - args.add_option('--top', type='int', metavar='COUNT', default=0, - help='Also show the top COUNT entries in rank order.') - args.add_option('--all', action="store_true", - help="Perform all checks. 
Equal to -arudlq --md5 " - "--sockstat") - args.add_option('--zone', '-z', type="int", - help="Only query servers in specified zone") - args.add_option('--timeout', '-t', type="int", metavar="SECONDS", - help="Time to wait for a response from a server", - default=5) - args.add_option('--swiftdir', default="/etc/swift", - help="Default = /etc/swift") - options, arguments = args.parse_args() - - if len(sys.argv) <= 1 or len(arguments) > 1: - args.print_help() - sys.exit(0) - - if arguments: - if arguments[0] in self.check_types: - self.server_type = arguments[0] - else: - print "Invalid Server Type" - args.print_help() - sys.exit(1) - else: - self.server_type = 'object' - - swift_dir = options.swiftdir - ring_file = os.path.join(swift_dir, '%s.ring.gz' % self.server_type) - self.verbose = options.verbose - self.suppress_errors = options.suppress - self.timeout = options.timeout - - if options.zone: - hosts = self.get_devices(options.zone, swift_dir, self.server_type) - else: - hosts = self.get_devices(None, swift_dir, self.server_type) - - print "--> Starting reconnaissance on %s hosts" % len(hosts) - print "=" * 79 - - if options.all: - if self.server_type == 'object': - self.async_check(hosts) - self.object_replication_check(hosts) - self.object_auditor_check(hosts) - self.updater_check(hosts) - self.expirer_check(hosts) - elif self.server_type == 'container': - self.replication_check(hosts) - self.auditor_check(hosts) - self.updater_check(hosts) - elif self.server_type == 'account': - self.replication_check(hosts) - self.auditor_check(hosts) - self.umount_check(hosts) - self.load_check(hosts) - self.disk_usage(hosts) - self.get_ringmd5(hosts, ring_file) - self.quarantine_check(hosts) - self.socket_usage(hosts) - else: - if options.async: - if self.server_type == 'object': - self.async_check(hosts) - else: - print "Error: Can't check async's on non object servers." - if options.unmounted: - self.umount_check(hosts) - if options.replication: - if self.server_type == 'object': - self.object_replication_check(hosts) - else: - self.replication_check(hosts) - if options.auditor: - if self.server_type == 'object': - self.object_auditor_check(hosts) - else: - self.auditor_check(hosts) - if options.updater: - if self.server_type == 'account': - print "Error: Can't check updaters on account servers." - else: - self.updater_check(hosts) - if options.expirer: - if self.server_type == 'object': - self.expirer_check(hosts) - else: - print "Error: Can't check expired on non object servers." 
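Every check removed above (auditor, load, quarantine, sockstat, disk usage) follows the same scatter/gather pattern: scout one recon endpoint on each host, collect the per-host values, drop hosts that returned None, and report low/high/average. A minimal illustrative sketch of that aggregation step, using only the standard library (this is not Swift's actual _gen_stats/_print_stats code; the helper name and output format are assumptions):

def summarize(values, name):
    # Drop hosts that returned no data, then report the spread.
    values = [v for v in values if v is not None]
    if not values:
        print("[%s] - No hosts returned valid data." % name)
        return
    print("[%s] low: %s, high: %s, avg: %.2f, total: %s, reported: %d" % (
        name, min(values), max(values),
        sum(values) / float(len(values)), sum(values), len(values)))

# e.g. summarize([0.31, 1.20, None, 0.74], '1m_load_avg')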
- if options.loadstats: - self.load_check(hosts) - if options.diskusage: - self.disk_usage(hosts, options.top) - if options.md5: - self.get_ringmd5(hosts, ring_file) - if options.quarantined: - self.quarantine_check(hosts) - if options.sockstat: - self.socket_usage(hosts) - - -if __name__ == '__main__': - try: - reconnoiter = SwiftRecon() - reconnoiter.main() - except KeyboardInterrupt: - print '\n' diff --git a/bin/swift-recon-cron b/bin/swift-recon-cron deleted file mode 100755 index d8cb010780..0000000000 --- a/bin/swift-recon-cron +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -""" -swift-recon-cron.py -""" - -import os -import sys -from ConfigParser import ConfigParser -from swift.common.utils import get_logger, dump_recon_cache - - -def get_async_count(device_dir, logger): - async_count = 0 - for i in os.listdir(device_dir): - asyncdir = os.path.join(device_dir, i, "async_pending") - if os.path.isdir(asyncdir): - for entry in os.listdir(asyncdir): - if os.path.isdir(os.path.join(asyncdir, entry)): - async_hdir = os.path.join(asyncdir, entry) - async_count += len(os.listdir(async_hdir)) - return async_count - - -def main(): - c = ConfigParser() - try: - conf_path = sys.argv[1] - except Exception: - print "Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1] - print "ex: swift-recon-cron /etc/swift/object-server.conf" - sys.exit(1) - if not c.read(conf_path): - print "Unable to read config file %s" % conf_path - sys.exit(1) - conf = dict(c.items('filter:recon')) - device_dir = conf.get('devices', '/srv/node') - recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') - recon_lock_path = conf.get('recon_lock_path', '/var/lock') - cache_file = os.path.join(recon_cache_path, "object.recon") - lock_dir = os.path.join(recon_lock_path, "swift-recon-object-cron") - conf['log_name'] = conf.get('log_name', 'recon-cron') - logger = get_logger(conf, log_route='recon-cron') - try: - os.mkdir(lock_dir) - except OSError as e: - logger.critical(_(str(e))) - print str(e) - sys.exit(1) - try: - asyncs = get_async_count(device_dir, logger) - except Exception: - logger.exception( - _('Exception during recon-cron while accessing devices')) - - dump_recon_cache({'async_pending': asyncs}, cache_file, logger) - - try: - os.rmdir(lock_dir) - except Exception: - logger.exception(_('Exception remove cronjob lock')) - -if __name__ == '__main__': - main() diff --git a/bin/swift-ring-builder b/bin/swift-ring-builder deleted file mode 100755 index a31021e3c0..0000000000 --- a/bin/swift-ring-builder +++ /dev/null @@ -1,628 +0,0 @@ -#! /usr/bin/env python -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
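The swift-recon-cron helper removed above exists mainly to count async_pending entries under each device and dump that total into the recon cache. Its counting logic boils down to the following Python 3 sketch, mirroring get_async_count() from the deleted script (the /srv/node default comes from that script; error handling is omitted):

import os

def count_async_pendings(device_dir='/srv/node'):
    # For each device, sum the files under async_pending/<suffix>/.
    count = 0
    for device in os.listdir(device_dir):
        asyncdir = os.path.join(device_dir, device, 'async_pending')
        if not os.path.isdir(asyncdir):
            continue
        for suffix in os.listdir(asyncdir):
            suffix_dir = os.path.join(asyncdir, suffix)
            if os.path.isdir(suffix_dir):
                count += len(os.listdir(suffix_dir))
    return count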
- -import cPickle as pickle -from array import array -from errno import EEXIST -from itertools import islice, izip -from os import mkdir -from os.path import basename, dirname, exists, join as pathjoin -from sys import argv, exit -from textwrap import wrap -from time import time - -from swift.common import exceptions -from swift.common.ring import RingBuilder - - -MAJOR_VERSION = 1 -MINOR_VERSION = 3 -EXIT_SUCCESS = 0 -EXIT_WARNING = 1 -EXIT_ERROR = 2 - - -def format_device(dev): - """ - Format a device for display. - """ - if ':' in dev['ip']: - return 'd%(id)sz%(zone)s-[%(ip)s]:%(port)s/%(device)s_"%(meta)s"' % dev - else: - return 'd%(id)sz%(zone)s-%(ip)s:%(port)s/%(device)s_"%(meta)s"' % dev - - -class Commands: - - def unknown(): - print 'Unknown command: %s' % argv[2] - exit(EXIT_ERROR) - - def create(): - """ -swift-ring-builder create - - Creates with 2^ partitions and . - is number of hours to restrict moving a partition more - than once. - """ - if len(argv) < 6: - print Commands.create.__doc__.strip() - exit(EXIT_ERROR) - builder = RingBuilder(int(argv[3]), int(argv[4]), int(argv[5])) - backup_dir = pathjoin(dirname(argv[1]), 'backups') - try: - mkdir(backup_dir) - except OSError, err: - if err.errno != EEXIST: - raise - pickle.dump(builder.to_dict(), open(pathjoin(backup_dir, - '%d.' % time() + basename(argv[1])), 'wb'), protocol=2) - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def default(): - """ -swift-ring-builder - Shows information about the ring and the devices within. - """ - print '%s, build version %d' % (argv[1], builder.version) - zones = 0 - balance = 0 - if builder.devs: - zones = len(set(d['zone'] for d in builder.devs if d is not None)) - balance = builder.get_balance() - print '%d partitions, %d replicas, %d zones, %d devices, %.02f ' \ - 'balance' % (builder.parts, builder.replicas, zones, - len([d for d in builder.devs if d]), balance) - print 'The minimum number of hours before a partition can be ' \ - 'reassigned is %s' % builder.min_part_hours - if builder.devs: - print 'Devices: id zone ip address port name ' \ - 'weight partitions balance meta' - weighted_parts = builder.parts * builder.replicas / \ - sum(d['weight'] for d in builder.devs if d is not None) - for dev in builder.devs: - if dev is None: - continue - if not dev['weight']: - if dev['parts']: - balance = 999.99 - else: - balance = 0 - else: - balance = 100.0 * dev['parts'] / \ - (dev['weight'] * weighted_parts) - 100.0 - print ' %5d %5d %15s %5d %9s %6.02f %10s %7.02f %s' % \ - (dev['id'], dev['zone'], dev['ip'], dev['port'], - dev['device'], dev['weight'], dev['parts'], balance, - dev['meta']) - exit(EXIT_SUCCESS) - - def search(): - """ -swift-ring-builder search - Shows information about matching devices. 
- """ - if len(argv) < 4: - print Commands.search.__doc__.strip() - print - print builder.search_devs.__doc__.strip() - exit(EXIT_ERROR) - devs = builder.search_devs(argv[3]) - if not devs: - print 'No matching devices found' - exit(EXIT_ERROR) - print 'Devices: id zone ip address port name ' \ - 'weight partitions balance meta' - weighted_parts = builder.parts * builder.replicas / \ - sum(d['weight'] for d in builder.devs if d is not None) - for dev in devs: - if not dev['weight']: - if dev['parts']: - balance = 999.99 - else: - balance = 0 - else: - balance = 100.0 * dev['parts'] / \ - (dev['weight'] * weighted_parts) - 100.0 - print ' %5d %5d %15s %5d %9s %6.02f %10s %7.02f %s' % \ - (dev['id'], dev['zone'], dev['ip'], dev['port'], - dev['device'], dev['weight'], dev['parts'], balance, - dev['meta']) - exit(EXIT_SUCCESS) - - def list_parts(): - """ -swift-ring-builder list_parts [] .. - Returns a 2 column list of all the partitions that are assigned to any of - the devices matching the search values given. The first column is the - assigned partition number and the second column is the number of device - matches for that partition. The list is ordered from most number of matches - to least. If there are a lot of devices to match against, this command - could take a while to run. - """ - if len(argv) < 4: - print Commands.list_parts.__doc__.strip() - print - print builder.search_devs.__doc__.strip() - exit(EXIT_ERROR) - devs = [] - for arg in argv[3:]: - devs.extend(builder.search_devs(arg) or []) - if not devs: - print 'No matching devices found' - exit(EXIT_ERROR) - devs = [d['id'] for d in devs] - matches = [array('i') for x in xrange(builder.replicas)] - for part in xrange(builder.parts): - count = len([d for d in builder.get_part_devices(part) - if d['id'] in devs]) - if count: - matches[builder.replicas - count].append(part) - print 'Partition Matches' - for index, parts in enumerate(matches): - for part in parts: - print '%9d %7d' % (part, builder.replicas - index) - exit(EXIT_SUCCESS) - - def add(): - """ -swift-ring-builder add - z-:/_ - [z-:/_ ] ... - - Adds devices to the ring with the given information. No partitions will be - assigned to the new device until after running 'rebalance'. This is so you - can make multiple device changes and rebalance them all just once. 
- """ - if len(argv) < 5 or len(argv) % 2 != 1: - print Commands.add.__doc__.strip() - exit(EXIT_ERROR) - - devs_and_weights = izip(islice(argv, 3, len(argv), 2), - islice(argv, 4, len(argv), 2)) - for devstr, weightstr in devs_and_weights: - if not devstr.startswith('z'): - print 'Invalid add value: %s' % devstr - exit(EXIT_ERROR) - i = 1 - while i < len(devstr) and devstr[i].isdigit(): - i += 1 - zone = int(devstr[1:i]) - rest = devstr[i:] - - if not rest.startswith('-'): - print 'Invalid add value: %s' % devstr - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - i = 1 - if rest[i] == '[': - i += 1 - while i < len(rest) and rest[i] != ']': - i += 1 - i += 1 - ip = rest[1:i].lstrip('[').rstrip(']') - rest = rest[i:] - else: - while i < len(rest) and rest[i] in '0123456789.': - i += 1 - ip = rest[1:i] - rest = rest[i:] - - if not rest.startswith(':'): - print 'Invalid add value: %s' % devstr - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - i = 1 - while i < len(rest) and rest[i].isdigit(): - i += 1 - port = int(rest[1:i]) - rest = rest[i:] - - if not rest.startswith('/'): - print 'Invalid add value: %s' % devstr - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - i = 1 - while i < len(rest) and rest[i] != '_': - i += 1 - device_name = rest[1:i] - rest = rest[i:] - - meta = '' - if rest.startswith('_'): - meta = rest[1:] - - try: - weight = float(weightstr) - except ValueError: - print 'Invalid weight value: %s' % weightstr - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - - if weight < 0: - print 'Invalid weight value (must be positive): %s' % weightstr - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - - for dev in builder.devs: - if dev is None: - continue - if dev['ip'] == ip and dev['port'] == port and \ - dev['device'] == device_name: - print 'Device %d already uses %s:%d/%s.' % \ - (dev['id'], dev['ip'], dev['port'], dev['device']) - print "The on-disk ring builder is unchanged.\n" - exit(EXIT_ERROR) - - builder.add_dev({'zone': zone, 'ip': ip, 'port': port, - 'device': device_name, 'weight': weight, - 'meta': meta}) - new_dev = builder.search_devs( - 'z%s-%s:%s/%s' % (zone, ip, port, device_name))[0]['id'] - if ':' in ip: - print 'Device z%s-[%s]:%s/%s_"%s" with %s weight got id %s' % \ - (zone, ip, port, device_name, meta, weight, new_dev) - else: - print 'Device z%s-%s:%s/%s_"%s" with %s weight got id %s' % \ - (zone, ip, port, device_name, meta, weight, new_dev) - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def set_weight(): - """ -swift-ring-builder set_weight - [ 1: - print 'Matched more than one device:' - for dev in devs: - print ' d%(id)sz%(zone)s-%(ip)s:%(port)s/%(device)s_' \ - '"%(meta)s"' % dev - if raw_input('Are you sure you want to update the weight for ' - 'these %s devices? (y/N) ' % len(devs)) != 'y': - print 'Aborting device modifications' - exit(EXIT_ERROR) - for dev in devs: - builder.set_dev_weight(dev['id'], weight) - print 'd%(id)sz%(zone)s-%(ip)s:%(port)s/%(device)s_' \ - '"%(meta)s" weight set to %(weight)s' % dev - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def set_info(): - """ -swift-ring-builder set_info - :/_ - [ :/_] ... - - For each search-value, resets the matched device's information. - This information isn't used to assign partitions, so you can use - 'write_ring' afterward to rewrite the current ring with the newer - device information. 
Any of the parts are optional in the final - :/_ parameter; just give what you - want to change. For instance set_info d74 _"snet: 5.6.7.8" would - just update the meta data for device id 74. - """ - if len(argv) < 5 or len(argv) % 2 != 1: - print Commands.set_info.__doc__.strip() - print - print builder.search_devs.__doc__.strip() - exit(EXIT_ERROR) - - searches_and_changes = izip(islice(argv, 3, len(argv), 2), - islice(argv, 4, len(argv), 2)) - - for search_value, change_value in searches_and_changes: - devs = builder.search_devs(search_value) - change = [] - if len(change_value) and change_value[0].isdigit(): - i = 1 - while (i < len(change_value) and - change_value[i] in '0123456789.'): - i += 1 - change.append(('ip', change_value[:i])) - change_value = change_value[i:] - elif len(change_value) and change_value[0] == '[': - i = 1 - while i < len(change_value) and change_value[i] != ']': - i += 1 - i += 1 - change.append(('ip', change_value[:i].lstrip('[').rstrip(']'))) - change_value = change_value[i:] - if change_value.startswith(':'): - i = 1 - while i < len(change_value) and change_value[i].isdigit(): - i += 1 - change.append(('port', int(change_value[1:i]))) - change_value = change_value[i:] - if change_value.startswith('/'): - i = 1 - while i < len(change_value) and change_value[i] != '_': - i += 1 - change.append(('device', change_value[1:i])) - change_value = change_value[i:] - if change_value.startswith('_'): - change.append(('meta', change_value[1:])) - change_value = '' - if change_value or not change: - raise ValueError('Invalid set info change value: %s' % - repr(argv[4])) - if not devs: - print("Search value \"%s\" matched 0 devices.\n" - "The on-disk ring builder is unchanged.\n" - % search_value) - exit(EXIT_ERROR) - if len(devs) > 1: - print 'Matched more than one device:' - for dev in devs: - print ' %s' % format_device(dev) - if raw_input('Are you sure you want to update the info for ' - 'these %s devices? (y/N) ' % len(devs)) != 'y': - print 'Aborting device modifications' - exit(EXIT_ERROR) - for dev in devs: - orig_dev_string = format_device(dev) - test_dev = dict(dev) - for key, value in change: - test_dev[key] = value - for check_dev in builder.devs: - if not check_dev or check_dev['id'] == test_dev['id']: - continue - if check_dev['ip'] == test_dev['ip'] and \ - check_dev['port'] == test_dev['port'] and \ - check_dev['device'] == test_dev['device']: - print 'Device %d already uses %s:%d/%s.' % \ - (check_dev['id'], check_dev['ip'], - check_dev['port'], check_dev['device']) - exit(EXIT_ERROR) - for key, value in change: - dev[key] = value - print 'Device %s is now %s' % (orig_dev_string, - format_device(dev)) - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def remove(): - """ -swift-ring-builder remove [search-value ...] - Removes the device(s) from the ring. This should normally just be used for - a device that has failed. For a device you wish to decommission, it's best - to set its weight to 0, wait for it to drain all its data, then use this - remove command. This will not take effect until after running 'rebalance'. - This is so you can make multiple device changes and rebalance them all just - once. 
- """ - if len(argv) < 4: - print Commands.remove.__doc__.strip() - print - print builder.search_devs.__doc__.strip() - exit(EXIT_ERROR) - - for search_value in argv[3:]: - devs = builder.search_devs(search_value) - if not devs: - print("Search value \"%s\" matched 0 devices.\n" - "The on-disk ring builder is unchanged." % search_value) - exit(EXIT_ERROR) - if len(devs) > 1: - print 'Matched more than one device:' - for dev in devs: - print ' d%(id)sz%(zone)s-%(ip)s:%(port)s/%(device)s_' \ - '"%(meta)s"' % dev - if raw_input('Are you sure you want to remove these %s ' - 'devices? (y/N) ' % len(devs)) != 'y': - print 'Aborting device removals' - exit(EXIT_ERROR) - for dev in devs: - try: - builder.remove_dev(dev['id']) - except exceptions.RingBuilderError, e: - print '-' * 79 - print( - "An error occurred while removing device with id %d\n" - "This usually means that you attempted to remove\n" - "the last device in a ring. If this is the case,\n" - "consider creating a new ring instead.\n" - "The on-disk ring builder is unchanged.\n" - "Original exception message: %s" % - (dev['id'], e.message) - ) - print '-' * 79 - exit(EXIT_ERROR) - - print 'd%(id)sz%(zone)s-%(ip)s:%(port)s/%(device)s_' \ - '"%(meta)s" marked for removal and will be removed' \ - ' next rebalance.' % dev - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def rebalance(): - """ -swift-ring-builder rebalance - Attempts to rebalance the ring by reassigning partitions that haven't been - recently reassigned. - """ - devs_changed = builder.devs_changed - try: - last_balance = builder.get_balance() - parts, balance = builder.rebalance() - except exceptions.RingBuilderError, e: - print '-' * 79 - print ("An error has occurred during ring validation. Common\n" - "causes of failure are rings that are empty or do not\n" - "have enough devices to accommodate the replica count.\n" - "Original exception message:\n %s" % e.message - ) - print '-' * 79 - exit(EXIT_ERROR) - if not parts: - print 'No partitions could be reassigned.' - print 'Either none need to be or none can be due to ' \ - 'min_part_hours [%s].' % builder.min_part_hours - exit(EXIT_WARNING) - if not devs_changed and abs(last_balance - balance) < 1: - print 'Cowardly refusing to save rebalance as it did not change ' \ - 'at least 1%.' - exit(EXIT_WARNING) - try: - builder.validate() - except exceptions.RingValidationError, e: - print '-' * 79 - print ("An error has occurred during ring validation. Common\n" - "causes of failure are rings that are empty or do not\n" - "have enough devices to accommodate the replica count.\n" - "Original exception message:\n %s" % e.message - ) - print '-' * 79 - exit(EXIT_ERROR) - print 'Reassigned %d (%.02f%%) partitions. Balance is now %.02f.' % \ - (parts, 100.0 * parts / builder.parts, balance) - status = EXIT_SUCCESS - if balance > 5: - print '-' * 79 - print 'NOTE: Balance of %.02f indicates you should push this ' % \ - balance - print ' ring, wait at least %d hours, and rebalance/repush.' \ - % builder.min_part_hours - print '-' * 79 - status = EXIT_WARNING - ts = time() - builder.get_ring().save( - pathjoin(backup_dir, '%d.' % ts + basename(ring_file))) - pickle.dump(builder.to_dict(), open(pathjoin(backup_dir, - '%d.' % ts + basename(argv[1])), 'wb'), protocol=2) - builder.get_ring().save(ring_file) - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(status) - - def validate(): - """ -swift-ring-builder validate - Just runs the validation routines on the ring. 
- """ - builder.validate() - exit(EXIT_SUCCESS) - - def write_ring(): - """ -swift-ring-builder write_ring - Just rewrites the distributable ring file. This is done automatically after - a successful rebalance, so really this is only useful after one or more - 'set_info' calls when no rebalance is needed but you want to send out the - new device information. - """ - ring_data = builder.get_ring() - if not ring_data._replica2part2dev_id: - if ring_data.devs: - print 'Warning: Writing a ring with no partition ' \ - 'assignments but with devices; did you forget to run ' \ - '"rebalance"?' - else: - print 'Warning: Writing an empty ring' - ring_data.save( - pathjoin(backup_dir, '%d.' % time() + basename(ring_file))) - ring_data.save(ring_file) - exit(EXIT_SUCCESS) - - def pretend_min_part_hours_passed(): - builder.pretend_min_part_hours_passed() - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - def set_min_part_hours(): - """ -swift-ring-builder set_min_part_hours - Changes the to the given . This should be set to - however long a full replication/update cycle takes. We're working on a way - to determine this more easily than scanning logs. - """ - if len(argv) < 4: - print Commands.set_min_part_hours.__doc__.strip() - exit(EXIT_ERROR) - builder.change_min_part_hours(int(argv[3])) - print 'The minimum number of hours before a partition can be ' \ - 'reassigned is now set to %s' % argv[3] - pickle.dump(builder.to_dict(), open(argv[1], 'wb'), protocol=2) - exit(EXIT_SUCCESS) - - -if __name__ == '__main__': - if len(argv) < 2: - print "swift-ring-builder %(MAJOR_VERSION)s.%(MINOR_VERSION)s\n" % \ - globals() - print Commands.default.__doc__.strip() - print - cmds = [c for c, f in Commands.__dict__.iteritems() - if f.__doc__ and c[0] != '_' and c != 'default'] - cmds.sort() - for cmd in cmds: - print Commands.__dict__[cmd].__doc__.strip() - print - print RingBuilder.search_devs.__doc__.strip() - print - for line in wrap(' '.join(cmds), 79, initial_indent='Quick list: ', - subsequent_indent=' '): - print line - print ('Exit codes: 0 = operation successful\n' - ' 1 = operation completed with warnings\n' - ' 2 = error') - exit(EXIT_SUCCESS) - - if exists(argv[1]): - builder = RingBuilder.load(argv[1]) - elif len(argv) < 3 or argv[2] != 'create': - print 'Ring Builder file does not exist: %s' % argv[1] - exit(EXIT_ERROR) - - backup_dir = pathjoin(dirname(argv[1]), 'backups') - try: - mkdir(backup_dir) - except OSError, err: - if err.errno != EEXIST: - raise - - ring_file = argv[1] - if ring_file.endswith('.builder'): - ring_file = ring_file[:-len('.builder')] - ring_file += '.ring.gz' - - if len(argv) == 2: - command = "default" - else: - command = argv[2] - Commands.__dict__.get(command, Commands.unknown)() diff --git a/bin/swift-temp-url b/bin/swift-temp-url deleted file mode 100755 index da7595a753..0000000000 --- a/bin/swift-temp-url +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python - -import hmac -from hashlib import sha1 -from os.path import basename -from sys import argv, exit -from time import time - - -if __name__ == '__main__': - if len(argv) != 5: - prog = basename(argv[0]) - print 'Syntax: %s ' % prog - print - print 'Where:' - print ' The method to allow, GET or PUT.' - print ' Note: HEAD will also be allowed.' - print ' The number of seconds from now to allow requests.' - print ' The full path to the resource.' - print ' Example: /v1/AUTH_account/c/o' - print ' The X-Account-Meta-Temp-URL-Key for the account.' 
- print - print 'Example output:' - print ' /v1/AUTH_account/c/o?temp_url_sig=34d49efc32fe6e3082e411e' \ - 'eeb85bd8a&temp_url_expires=1323482948' - print - print 'This can be used to form a URL to give out for the access ' - print 'allowed. For example:' - print ' echo https://swift-cluster.example.com`%s GET 60 ' \ - '/v1/AUTH_account/c/o mykey`' % prog - print - print 'Might output:' - print ' https://swift-cluster.example.com/v1/AUTH_account/c/o?' \ - 'temp_url_sig=34d49efc32fe6e3082e411eeeb85bd8a&' \ - 'temp_url_expires=1323482948' - exit(1) - method, seconds, path, key = argv[1:] - if method not in ('GET', 'PUT'): - print 'Please use either the GET or PUT method.' - exit(1) - try: - expires = int(time() + int(seconds)) - except ValueError: - expires = 0 - if expires < 1: - print 'Please use a positive value.' - exit(1) - parts = path.split('/', 4) - # Must be five parts, ['', 'v1', 'a', 'c', 'o'], must be a v1 request, have - # account, container, and object values, and the object value can't just - # have '/'s. - if len(parts) != 5 or parts[0] or parts[1] != 'v1' or not parts[2] or \ - not parts[3] or not parts[4].strip('/'): - print ' must point to an object.' - print 'For example: /v1/account/container/object' - exit(1) - sig = hmac.new(key, '%s\n%s\n%s' % (method, expires, path), - sha1).hexdigest() - print '%s?temp_url_sig=%s&temp_url_expires=%s' % (path, sig, expires) diff --git a/bindep.txt b/bindep.txt new file mode 100644 index 0000000000..4510a6059b --- /dev/null +++ b/bindep.txt @@ -0,0 +1,39 @@ +# This is a cross-platform list tracking distribution packages needed by tests; +# see http://docs.openstack.org/infra/bindep/ for additional information. + +build-essential [platform:dpkg] +linux-headers [platform:apk] +gcc [platform:rpm platform:apk] +gettext [!platform:suse] +gettext-runtime [platform:suse] +liberasurecode-dev [platform:dpkg] +# There's no library in CentOS 7 but Fedora and openSUSE have it. +liberasurecode-devel [platform:rpm !platform:centos] +libffi-dev [platform:dpkg platform:apk] +libffi-devel [platform:rpm] +libxml2-dev [platform:dpkg platform:apk] +libxml2-devel [platform:rpm] +libxslt-devel [platform:rpm] +libxslt1-dev [platform:dpkg] +libxslt-dev [platform:apk] +memcached +python3-dev [platform:dpkg platform:apk test] +python3-devel [platform:rpm test] +# python3-devel does not pull in the python3 package on openSUSE so +# we need to be explicit. The python3 package contains the XML module +# which is required by a python3 virtualenv. +# See https://bugzilla.suse.com/show_bug.cgi?id=1046990 +python3 [platform:suse platform:apk test] +rsync +xfsprogs +libssl-dev [platform:dpkg] +openssl-devel [platform:redhat] +openssl-dev [platform:apk] +libopenssl-devel [platform:suse] +py-cffi [platform:apk] +musl-dev [platform:apk] +man-db [pep8] +man [platform:rpm pep8] +# libsrvg2 is required to build docs +librsvg2-tools [doc platform:rpm] +librsvg2-bin [doc platform:dpkg] diff --git a/doc/manpages/account-server.conf.5 b/doc/manpages/account-server.conf.5 index acc1742ba8..f3a9c1851a 100644 --- a/doc/manpages/account-server.conf.5 +++ b/doc/manpages/account-server.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. 
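The swift-temp-url helper removed above boils down to one HMAC-SHA1 signature over "METHOD\nEXPIRES\nPATH", exactly as its last few lines show. A Python 3 equivalent of that computation (the key and path in the usage comment are examples only):

import hmac
from hashlib import sha1
from time import time

def temp_url(method, seconds, path, key):
    # Same signature body as the removed script, with the explicit
    # bytes handling Python 3 requires.
    expires = int(time() + seconds)
    body = '%s\n%s\n%s' % (method, expires, path)
    sig = hmac.new(key.encode('utf-8'), body.encode('utf-8'), sha1).hexdigest()
    return '%s?temp_url_sig=%s&temp_url_expires=%s' % (path, sig, expires)

# e.g. temp_url('GET', 60, '/v1/AUTH_account/c/o', 'mykey')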
@@ -20,7 +20,7 @@ .SH NAME .LP .B account-server.conf -\- configuration file for the openstack-swift account server +\- configuration file for the OpenStack Swift account server @@ -42,7 +42,7 @@ certain number of key/value parameters which are described later. Any line that begins with a '#' symbol is ignored. You can find more information about python-pastedeploy configuration format at -\fIhttp://pythonpaste.org/deploy/#config-format\fR +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR @@ -56,26 +56,91 @@ are acceptable within this section. IP address the account server should bind to. The default is 0.0.0.0 which will make it bind to all available addresses. .IP "\fBbind_port\fR" -TCP port the account server should bind to. The default is 6002. +TCP port the account server should bind to. The default is 6202. +.IP "\fBkeep_idle\fR" +Value to set for socket TCP_KEEPIDLE. The default value is 600. +.IP "\fBbind_timeout\fR" +Timeout to bind socket. The default is 30. .IP \fBbacklog\fR TCP backlog. Maximum number of allowed pending connections. The default value is 4096. .IP \fBworkers\fR -Number of account server workers to fork. The default is 1. +The number of pre-forked processes that will accept connections. Zero means +no fork. The default is auto which will make the server try to match the +number of effective cpu cores if python multiprocessing is available (included +with most python distributions >= 2.6) or fallback to one. It's worth noting +that individual workers will use many eventlet co-routines to service multiple +concurrent requests. +.IP \fBmax_clients\fR +Maximum number of clients one worker can process simultaneously (it will +actually accept(2) N + 1). Setting this to one (1) will only handle one request +at a time, without accepting another request concurrently. The default is 1024. .IP \fBuser\fR The system user that the account server will run as. The default is swift. .IP \fBswift_dir\fR Swift configuration directory. The default is /etc/swift. .IP \fBdevices\fR -Parent directory or where devices are mounted. Default is /srv/node. +Parent directory of where devices are mounted. Default is /srv/node. .IP \fBmount_check\fR Whether or not check if the devices are mounted to prevent accidentally writing to the root device. The default is set to true. +.IP \fBdisable_fallocate\fR +Disable pre-allocate disk space for a file. The default is false. .IP \fBlog_name\fR Label used when logging. The default is swift. .IP \fBlog_facility\fR Syslog log facility. The default is LOG_LOCAL0. .IP \fBlog_level\fR Logging level. The default is INFO. +.IP "\fBlog_address\fR +Logging address. The default is /dev/log. +.IP \fBlog_max_line_length\fR +The following caps the length of log lines to the value given; no limit if +set to 0, the default. +.IP \fBlog_custom_handlers\fR +Comma separated list of functions to call to setup custom log handlers. +functions get passed: conf, name, log_to_console, log_route, fmt, logger, +adapted_logger. The default is empty. +.IP \fBlog_udp_host\fR +If set, log_udp_host will override log_address. +.IP "\fBlog_udp_port\fR +UDP log port, the default is 514. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. +.IP \fBlog_statsd_port\fR +The default is 8125. +.IP \fBlog_statsd_default_sample_rate\fR +The default is 1. +.IP \fBlog_statsd_sample_rate_factor\fR +The default is 1. 
+.IP \fBlog_statsd_metric_prefix\fR +The default is empty. +.IP \fBdb_preallocation\fR +If you don't mind the extra disk space usage in overhead, you can turn this +on to preallocate disk space with SQLite databases to decrease fragmentation. +The default is false. +.IP \fBeventlet_debug\fR +Debug mode for eventlet library. The default is false. +.IP \fBfallocate_reserve\fR +You can set fallocate_reserve to the number of bytes or percentage of disk +space you'd like fallocate to reserve, whether there is space for the given +file size or not. Percentage will be used if the value ends with a '%'. +The default is 1%. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -108,12 +173,33 @@ This is normally \fBegg:swift#account\fR. Label used when logging. The default is account-server. .IP "\fBset log_facility\fR Syslog log facility. The default is LOG_LOCAL0. -.IP "\fB set log_level\fR +.IP "\fBset log_level\fR Logging level. The default is INFO. -.IP "\fB set log_requests\fR +.IP "\fBset log_requests\fR Enables request logging. The default is True. -.IP "\fB set log_address\fR +.IP "\fBset log_address\fR Logging address. The default is /dev/log. +.IP "\fBreplication_server\fR +Configure parameter for creating specific server. +To handle all verbs, including replication verbs, do not specify +"replication_server" (this is the default). To only handle replication, +set to a true value (e.g. "true" or "1"). To handle only non-replication +verbs, set to "false". Unless you have a separate replication network, you +should not specify any value for "replication_server". The default is empty. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -149,6 +235,36 @@ and ensure that swift has read/write. The default is /var/cache/swift. .RE .PD +.RS 0 +.IP "\fB[filter:xprofile]\fR" +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy for the xprofile middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#xprofile\fR. +.IP "\fBprofile_module\fR" +This option enable you to switch profilers which should inherit from python +standard profiler. 
Currently the supported value can be 'cProfile', 'eventlet.green.profile' etc. +.IP "\fBlog_filename_prefix\fR" +This prefix will be used to combine process ID and timestamp to name the +profile data file. Make sure the executing user has permission to write +into this path (missing path segments will be created, if necessary). +If you enable profiling in more than one type of daemon, you must override +it with an unique value like, the default is /var/log/swift/profile/account.profile. +.IP "\fBdump_interval\fR" +The profile data will be dumped to local disk based on above naming rule +in this interval. The default is 5.0. +.IP "\fBdump_timestamp\fR" +Be careful, this option will enable profiler to dump data into the file with +time stamp which means there will be lots of files piled up in the directory. +The default is false +.IP "\fBpath\fR" +This is the path of the URL to access the mini web UI. The default is __profile__. +.IP "\fBflush_at_shutdown\fR" +Clear the data when the wsgi server shutdown. The default is false. +.IP "\fBunwind\fR" +Unwind the iterator of applications. Default is false. +.RE +.PD .SH ADDITIONAL SECTIONS @@ -167,24 +283,16 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. -.IP \fBvm_test_mode\fR -Indicates that you are using a VM environment. The default is no. .IP \fBper_diff\fR -The default is 1000. +Maximum number of database rows that will be sync'd in a single HTTP replication request. The default is 1000. .IP \fBmax_diffs\fR This caps how long the replicator will spend trying to sync a given database per pass so the other databases don't get starved. The default is 100. .IP \fBconcurrency\fR Number of replication workers to spawn. The default is 8. .IP "\fBrun_pause [deprecated]\fR" -Time in seconds to wait between replication passes. The default is 10. +Time in seconds to wait between replication passes. The default is 30. .IP \fBinterval\fR Replaces run_pause with the more standard "interval", which means the replicator won't pause unless it takes less than the interval set. The default is 30. -.IP \fBerror_suppression_interval\fR -How long without an error before a node's error count is reset. This will also be how long before a node is re-enabled after suppression is triggered. -The default is 60 seconds. -.IP \fBerror_suppression_limit\fR -How many errors can accumulate before a node is temporarily ignored. The default -is 10 seconds. .IP \fBnode_timeout\fR Request timeout to external services. The default is 10 seconds. .IP \fBconn_timeout\fR @@ -192,6 +300,29 @@ Connection timeout to external services. The default is 0.5 seconds. .IP \fBreclaim_age\fR Time elapsed in seconds before an account can be reclaimed. The default is 604800 seconds. +.IP \fBrsync_compress\fR +Allow rsync to compress data which is transmitted to destination node +during sync. However, this is applicable only when destination node is in +a different region than the local one. The default is false. +.IP \fBrsync_module\fR +Format of the rsync module where the replicator will send data. See +etc/rsyncd.conf-sample for some usage examples. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. 
+.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE @@ -210,6 +341,24 @@ Logging level. The default is INFO. Logging address. The default is /dev/log. .IP \fBinterval\fR Will audit, at most, 1 account per device per interval. The default is 1800 seconds. +.IP \fBaccounts_per_second\fR +Maximum accounts audited per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 200. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE @@ -234,6 +383,36 @@ Minimum time for a pass to take. The default is 3600 seconds. Request timeout to external services. The default is 10 seconds. .IP \fBconn_timeout\fR Connection timeout to external services. The default is 0.5 seconds. +.IP \fBdelay_reaping\fR +Normally, the reaper begins deleting account information for deleted accounts +immediately; you can set this to delay its work however. The value is in +seconds. The default is 0. The sum of this value and the +container-updater interval should be less than the account-replicator +reclaim_age. This ensures that once the account-reaper has deleted a +container there is sufficient time for the container-updater to report to the +account before the account DB is removed. +.IP \fBreap_warn_after\fR +If the account fails to be reaped due to a persistent error, the +account reaper will log a message such as: + Account has not been reaped since +You can search logs for this message if space is not being reclaimed +after you delete account(s). +Default is 2592000 seconds (30 days). This is in addition to any time +requested by delay_reaping. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. 
The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -243,13 +422,11 @@ Connection timeout to external services. The default is 0.5 seconds. .SH DOCUMENTATION .LP More in depth documentation about the swift-account-server and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html and -.BI http://swift.openstack.org +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" .BR swift-account-server(1), - - diff --git a/doc/manpages/container-reconciler.conf.5 b/doc/manpages/container-reconciler.conf.5 new file mode 100644 index 0000000000..79797b6496 --- /dev/null +++ b/doc/manpages/container-reconciler.conf.5 @@ -0,0 +1,182 @@ +.\" +.\" Author: HCLTech-SSW +.\" Copyright (c) 2010-2017 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH container-reconciler.conf 5 "10/25/2017" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B container-reconciler.conf +\- configuration file for the OpenStack Swift container reconciler + + +.SH SYNOPSIS +.LP +.B container-reconciler.conf + + +.SH DESCRIPTION +.PP +This is the configuration file used by the container reconciler. + +The configuration file follows the python-pastedeploy syntax. The file is divided +into sections, which are enclosed by square brackets. Each section will contain a +certain number of key/value parameters which are described later. + +Any line that begins with a '#' symbol is ignored. + +You can find more information about python-pastedeploy configuration format at +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR + + +.SH GLOBAL SECTION +.PD 1 +.RS 0 +This is indicated by section named [DEFAULT]. Below are the parameters that +are acceptable within this section. + +.IP "\fBlog_address\fR" +Location where syslog sends the logs to. The default is /dev/log. +.IP "\fBlog_custom_handlers \fR" +Comma-separated list of functions to call to setup custom log handlers. +.IP "\fBlog_facility\fR" +Syslog log facility. The default is LOG_LOCAL0. +.IP "\fBlog_level\fR" +Log level used for logging. The default is INFO. +.IP "\fBlog_name\fR" +Label used when logging. The default is swift. +.IP "\fBlog_statsd_default_sample_rate\fR" +Defines the probability of sending a sample for any given event or +timing measurement. The default is 1.0. +.IP "\fBlog_statsd_host\fR" +If not set, the StatsD feature is disabled. The default is localhost. +.IP "\fBlog_statsd_metric_prefix\fR" +Value will be prepended to every metric sent to the StatsD server. +.IP "\fBlog_statsd_port\fR" +The port value for the StatsD server. The default is 8125. +.IP "\fBlog_statsd_sample_rate_factor\fR" +It is not recommended to set this to a value less than 1.0, if frequency of +logging is too high, tune the log_statsd_default_sample_rate instead. 
+The default value is 1.0. +.IP "\fBlog_udp_host\fR" +If not set, the UDP receiver for syslog is disabled. +.IP "\fBlog_udp_port\fR" +Port value for UDP receiver, if enabled. The default is 514. +.IP "\fBswift_dir\fR" +Swift configuration directory. The default is /etc/swift. +.IP "\fBuser\fR" +User to run as. The default is swift. +.RE +.PD + + +.SH CONTAINER RECONCILER SECTION +.PD 1 +.RS 0 +.IP "\fB[container-reconciler]\fR" +.RE +.RS 3 +.IP "\fBinterval\fR" +Minimum time for a pass to take. The default is 30 seconds. +.IP "\fBreclaim_age\fR" +Time elapsed in seconds before an object can be reclaimed. The default is 604800 seconds. +.IP "\fBrequest_tries\fR" +Server errors from requests will be retried by default. The default is 3. +.RE +.PD + + +.SH PIPELINE SECTION +.PD 1 +.RS 0 +.IP "\fB[pipeline:main]\fR" +.RE +.RS 3 +.IP "\fBpipeline\fR" +Pipeline to use for processing operations. The default is "catch_errors proxy-logging cache proxy-server". +.RE +.PD + + +.SH APP SECTION +.PD 1 +.RS 0 +\fBFor details of the available options see proxy-server.conf.5.\fR + +.RS 0 +.IP "\fB[app:proxy-server]\fR" +.RE +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy in the server. +This is normally \fBegg:swift#proxy\fR. +.RE +.PD + + +.SH FILTER SECTIONS +.PD 1 +.RS 0 +Any section that has its name prefixed by "filter:" indicates a filter section. +Filters are used to specify configuration parameters for specific swift middlewares. +Below are the filters available and respective acceptable parameters. + +\fBFor details of the available options for each filter section see proxy-server.conf.5.\fR + +.RS 0 +.IP "\fB[filter:cache]\fR" +.RE +Caching middleware that manages caching in swift. + +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy in the server. +This is normally \fBegg:swift#memcache\fR. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:catch_errors]\fR" +.RE +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy in the server. +This is normally \fBegg:swift#catch_errors\fR. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:proxy-logging]\fR" +.RE +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy in the server. +This is normally \fBegg:swift#proxy_logging\fR. +.RE +.PD + + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift-container-reconciler +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/overview_policies.html. + +.SH "SEE ALSO" +.BR swift-container-reconciler(1) diff --git a/doc/manpages/container-server.conf.5 b/doc/manpages/container-server.conf.5 index 406545b186..000bc3e94e 100644 --- a/doc/manpages/container-server.conf.5 +++ b/doc/manpages/container-server.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ .SH NAME .LP .B container-server.conf -\- configuration file for the openstack-swift container server +\- configuration file for the OpenStack Swift container server @@ -42,7 +42,7 @@ certain number of key/value parameters which are described later. Any line that begins with a '#' symbol is ignored. 
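Because every one of these server and daemon configs shares this INI-style, python-pastedeploy layout, the option tables in these man pages map directly onto sections that can be read with a stock ConfigParser, much as the removed swift-recon-cron did. A minimal sketch, assuming the [container-reconciler] options and defaults documented above (the path is an example):

from configparser import ConfigParser

c = ConfigParser()
c.read('/etc/swift/container-reconciler.conf')
opts = dict(c.items('container-reconciler')) if c.has_section('container-reconciler') else {}
interval = float(opts.get('interval', 30))          # default 30 seconds
reclaim_age = int(opts.get('reclaim_age', 604800))  # default 7 days
request_tries = int(opts.get('request_tries', 3))
print(interval, reclaim_age, request_tries)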
You can find more information about python-pastedeploy configuration format at -\fIhttp://pythonpaste.org/deploy/#config-format\fR +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR @@ -56,20 +56,41 @@ are acceptable within this section. IP address the container server should bind to. The default is 0.0.0.0 which will make it bind to all available addresses. .IP "\fBbind_port\fR" -TCP port the container server should bind to. The default is 6001. +TCP port the container server should bind to. The default is 6201. +.IP "\fBkeep_idle\fR" +Value to set for socket TCP_KEEPIDLE. The default value is 600. +.IP "\fBbind_timeout\fR" +Timeout to bind socket. The default is 30. .IP \fBbacklog\fR TCP backlog. Maximum number of allowed pending connections. The default value is 4096. .IP \fBworkers\fR -Number of container server workers to fork. The default is 1. +The number of pre-forked processes that will accept connections. Zero means +no fork. The default is auto which will make the server try to match the +number of effective cpu cores if python multiprocessing is available (included +with most python distributions >= 2.6) or fallback to one. It's worth noting +that individual workers will use many eventlet co-routines to service multiple +concurrent requests. +.IP \fBmax_clients\fR +Maximum number of clients one worker can process simultaneously (it will +actually accept(2) N + 1). Setting this to one (1) will only handle one request +at a time, without accepting another request concurrently. The default is 1024. +.IP \fBallowed_sync_hosts\fR +This is a comma separated list of hosts allowed in the X-Container-Sync-To +field for containers. This is the old-style of using container sync. It is +strongly recommended to use the new style of a separate +container-sync-realms.conf -- see container-sync-realms.conf-sample +allowed_sync_hosts = 127.0.0.1 .IP \fBuser\fR The system user that the container server will run as. The default is swift. .IP \fBswift_dir\fR Swift configuration directory. The default is /etc/swift. .IP \fBdevices\fR -Parent directory or where devices are mounted. Default is /srv/node. +Parent directory of where devices are mounted. Default is /srv/node. .IP \fBmount_check\fR Whether or not check if the devices are mounted to prevent accidentally writing to the root device. The default is set to true. +.IP \fBdisable_fallocate\fR +Disable pre-allocate disk space for a file. The default is false. .IP \fBlog_name\fR Label used when logging. The default is swift. .IP \fBlog_facility\fR @@ -78,6 +99,54 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. +.IP \fBlog_max_line_length\fR +The following caps the length of log lines to the value given; no limit if +set to 0, the default. +.IP \fBlog_custom_handlers\fR +Comma separated list of functions to call to setup custom log handlers. +functions get passed: conf, name, log_to_console, log_route, fmt, logger, +adapted_logger. The default is empty. +.IP \fBlog_udp_host\fR +If set, log_udp_host will override log_address. +.IP "\fBlog_udp_port\fR +UDP log port, the default is 514. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. +.IP \fBlog_statsd_port\fR +The default is 8125. +.IP \fBlog_statsd_default_sample_rate\fR +The default is 1. 
+.IP \fBlog_statsd_sample_rate_factor\fR +The default is 1. +.IP \fBlog_statsd_metric_prefix\fR +The default is empty. +.IP \fBdb_preallocation\fR +If you don't mind the extra disk space usage in overhead, you can turn this +on to preallocate disk space with SQLite databases to decrease fragmentation. +The default is false. +.IP \fBeventlet_debug\fR +Debug mode for eventlet library. The default is false. +.IP \fBfallocate_reserve\fR +You can set fallocate_reserve to the number of bytes or percentage of disk +space you'd like fallocate to reserve, whether there is space for the given +file size or not. Percentage will be used if the value ends with a '%'. +The default is 1%. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -110,16 +179,39 @@ This is normally \fBegg:swift#container\fR. Label used when logging. The default is container-server. .IP "\fBset log_facility\fR Syslog log facility. The default is LOG_LOCAL0. -.IP "\fB set log_level\fR +.IP "\fBset log_level\fR Logging level. The default is INFO. -.IP "\fB set log_requests\fR +.IP "\fBset log_requests\fR Enables request logging. The default is True. -.IP "\fB set log_address\fR +.IP "\fBset log_address\fR Logging address. The default is /dev/log. .IP \fBnode_timeout\fR Request timeout to external services. The default is 3 seconds. .IP \fBconn_timeout\fR Connection timeout to external services. The default is 0.5 seconds. +.IP \fBallow_versions\fR +The default is false. +.IP \fBreplication_server\fR +Configure parameter for creating specific server. +To handle all verbs, including replication verbs, do not specify +"replication_server" (this is the default). To only handle replication, +set to a True value (e.g. "True" or "1"). To handle only non-replication +verbs, set to "False". Unless you have a separate replication network, you +should not specify any value for "replication_server". +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -155,6 +247,36 @@ and ensure that swift has read/write. The default is /var/cache/swift. 
.RE .PD +.RS 0 +.IP "\fB[filter:xprofile]\fR" +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy for the xprofile middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#xprofile\fR. +.IP "\fBprofile_module\fR" +This option enable you to switch profilers which should inherit from python +standard profiler. Currently the supported value can be 'cProfile', 'eventlet.green.profile' etc. +.IP "\fBlog_filename_prefix\fR" +This prefix will be used to combine process ID and timestamp to name the +profile data file. Make sure the executing user has permission to write +into this path (missing path segments will be created, if necessary). +If you enable profiling in more than one type of daemon, you must override +it with an unique value like, the default is /var/log/swift/profile/account.profile. +.IP "\fBdump_interval\fR" +The profile data will be dumped to local disk based on above naming rule +in this interval. The default is 5.0. +.IP "\fBdump_timestamp\fR" +Be careful, this option will enable profiler to dump data into the file with +time stamp which means there will be lots of files piled up in the directory. +The default is false +.IP "\fBpath\fR" +This is the path of the URL to access the mini web UI. The default is __profile__. +.IP "\fBflush_at_shutdown\fR" +Clear the data when the wsgi server shutdown. The default is false. +.IP "\fBunwind\fR" +Unwind the iterator of applications. Default is false. +.RE +.PD .SH ADDITIONAL SECTIONS @@ -173,16 +295,14 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. -.IP \fBvm_test_mode\fR -Indicates that you are using a VM environment. The default is no. -.IP \fBer_diff\fR -The default is 1000. +.IP \fBper_diff\fR +Maximum number of database rows that will be sync'd in a single HTTP replication request. The default is 1000. .IP \fBmax_diffs\fR This caps how long the replicator will spend trying to sync a given database per pass so the other databases don't get starved. The default is 100. .IP \fBconcurrency\fR Number of replication workers to spawn. The default is 8. .IP "\fBrun_pause [deprecated]\fR" -Time in seconds to wait between replication passes. The default is 10. +Time in seconds to wait between replication passes. The default is 30. .IP \fBinterval\fR Replaces run_pause with the more standard "interval", which means the replicator won't pause unless it takes less than the interval set. The default is 30. .IP \fBnode_timeout\fR @@ -192,6 +312,29 @@ Connection timeout to external services. The default is 0.5 seconds. .IP \fBreclaim_age\fR Time elapsed in seconds before an container can be reclaimed. The default is 604800 seconds. +.IP \fBrsync_compress\fR +Allow rsync to compress data which is transmitted to destination node +during sync. However, this is applicable only when destination node is in +a different region than the local one. The default is false. +.IP \fBrsync_module\fR +Format of the rsync module where the replicator will send data. See +etc/rsyncd.conf-sample for some usage examples. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. 
I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE @@ -210,15 +353,33 @@ Logging address. The default is /dev/log. .IP \fBinterval\fR Minimum time for a pass to take. The default is 300 seconds. .IP \fBconcurrency\fR -Number of reaper workers to spawn. The default is 4. +Number of updater workers to spawn. The default is 4. .IP \fBnode_timeout\fR Request timeout to external services. The default is 3 seconds. .IP \fBconn_timeout\fR Connection timeout to external services. The default is 0.5 seconds. -.IP \fBslowdown = 0.01\fR -Slowdown will sleep that amount between containers. The default is 0.01 seconds. +.IP \fBcontainers_per_second\fR +Maximum containers updated per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 50. +.IP "\fBslowdown [deprecated]\fR" +Slowdown will sleep that amount between containers. The default is 0.01 seconds. Deprecated in favor of containers_per_second .IP \fBaccount_suppression_time\fR Seconds to suppress updating an account that has generated an error. The default is 60 seconds. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -237,6 +398,24 @@ Logging level. The default is INFO. Logging address. The default is /dev/log. .IP \fBinterval\fR Will audit, at most, 1 container per device per interval. The default is 1800 seconds. +.IP \fBcontainers_per_second\fR +Maximum containers audited per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 200. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. 
+Ignored if IOPRIO_CLASS_IDLE is set. .RE @@ -259,6 +438,26 @@ If you need to use an HTTP Proxy, set it here; defaults to no proxy. Will audit, at most, each container once per interval. The default is 300 seconds. .IP \fBcontainer_time\fR Maximum amount of time to spend syncing each container per pass. The default is 60 seconds. +.IP \fBconn_timeout\fR +Connection timeout to external services. The default is 5 seconds. +.IP \fBrequest_tries\fR +Server errors from requests will be retried by default. The default is 3. +.IP \fBinternal_client_conf_path\fR +Internal client config file path. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -268,13 +467,11 @@ Maximum amount of time to spend syncing each container per pass. The default is .SH DOCUMENTATION .LP More in depth documentation about the swift-container-server and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html and -.BI http://swift.openstack.org +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" -.BR swift-container-server(1), - - +.BR swift-container-server(1) diff --git a/doc/manpages/container-sync-realms.conf.5 b/doc/manpages/container-sync-realms.conf.5 new file mode 100644 index 0000000000..e96b400115 --- /dev/null +++ b/doc/manpages/container-sync-realms.conf.5 @@ -0,0 +1,138 @@ +.\" +.\" Author: HCLTech-SSW +.\" Copyright (c) 2010-2017 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH container-sync-realms.conf 5 "10/09/2017" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B container-sync-realms.conf +\- configuration file for the OpenStack Swift container sync realms + + + +.SH SYNOPSIS +.LP +.B container-sync-realms.conf + + + +.SH DESCRIPTION +.PP +This is the configuration file used by the Object storage Swift to perform container to container +synchronization. This configuration file is used to configure clusters to allow/accept sync +requests to/from other clusters. Using this configuration file, the user specifies where +to sync their container to along with a secret synchronization key. 
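As a rough illustration (assuming the python-swiftclient CLI and the realm/cluster names shown in the EXAMPLE section later in this manpage; the account, container and key values are placeholders), an end user would point a source container at a peer cluster and set the per-container secret roughly like this:

    swift post -t '//realm1/clustername2/AUTH_account/dest_container' -k 'usersecretkey' source_container

The container-sync daemon then sends requests to the endpoint registered for clustername2 in this file, signing each request with the realm key together with the user's per-container sync key, as described for the key options below.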
+ +You can find more information about container to container synchronization at +\fIhttps://docs.openstack.org/swift/latest/overview_container_sync.html\fR + +The configuration file follows the python-pastedeploy syntax. The file is divided +into sections, which are enclosed by square brackets. Each section will contain a +certain number of key/value parameters which are described later. + +Any line that begins with a '#' symbol is ignored. + +You can find more information about python-pastedeploy configuration format at +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR + + + +.SH GLOBAL SECTION +.PD 1 +.RS 0 +This is indicated by section named [DEFAULT]. Below are the parameters that +are acceptable within this section. + +.IP "\fBmtime_check_interval\fR" +The number of seconds between checking the modified time of this config file for changes +and therefore reloading it. The default value is 300. +.RE +.PD + + + +.SH REALM SECTIONS +.PD 1 +.RS 0 +Each section name is the name of a sync realm, for example [realm1]. +A sync realm is a set of clusters that have agreed to allow container syncing with each other. +Realm names will be considered case insensitive. Below are the parameters that are acceptable +within this section. + +.IP "\fBcluster_clustername1\fR" +Any values in the realm section whose name begin with cluster_ will indicate the name and +endpoint of a cluster and will be used by external users in their container's +X-Container-Sync-To metadata header values with the format as "realm_name/cluster_name/container_name". +The Realm and cluster names are considered to be case insensitive. +.IP "\fBcluster_clustername2\fR" +Any values in the realm section whose name begin with cluster_ will indicate the name and +endpoint of a cluster and will be used by external users in their container's +X-Container-Sync-To metadata header values with the format as "realm_name/cluster_name/container_name". +The Realm and cluster names are considered to be case insensitive. + +The endpoint is what the container sync daemon will use when sending out +requests to that cluster. Keep in mind this endpoint must be reachable by all +container servers, since that is where the container sync daemon runs. Note +that the endpoint ends with /v1/ and that the container sync daemon will then +add the account/container/obj name after that. + +.IP "\fBkey\fR" +The key is the overall cluster-to-cluster key used in combination with the external +users' key that they set on their containers' X-Container-Sync-Key metadata header +values. These keys will be used to sign each request the container sync daemon makes +and used to validate each incoming container sync request. +.IP "\fBkey2\fR" +The key2 is optional and is an additional key incoming requests will be checked +against. This is so you can rotate keys if you wish; you move the existing +key to key2 and make a new key value. 
+.RE +.PD + +.SH EXAMPLE +.nf +.RS 0 +[DEFAULT] +mtime_check_interval = 300 + + +[realm1] +key = realm1key +key2 = realm1key2 +cluster_clustername1 = https://host1/v1/ +cluster_clustername2 = https://host2/v1/ + +[realm2] +key = realm2key +key2 = realm2key2 +cluster_clustername3 = https://host3/v1/ +cluster_clustername4 = https://host4/v1/ +.RE +.fi + + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift-container-sync +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/overview_container_sync.html +and +.BI https://docs.openstack.org/swift/latest/ + + +.SH "SEE ALSO" +.BR swift-container-sync(1) diff --git a/doc/manpages/dispersion.conf.5 b/doc/manpages/dispersion.conf.5 index 8355304473..7698ee338e 100644 --- a/doc/manpages/dispersion.conf.5 +++ b/doc/manpages/dispersion.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,42 +14,58 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH dispersion.conf 5 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B dispersion.conf -\- configuration file for the openstack-swift dispersion tools +\- configuration file for the OpenStack Swift dispersion tools .SH SYNOPSIS .LP .B dispersion.conf -.SH DESCRIPTION +.SH DESCRIPTION .PP This is the configuration file used by the dispersion populate and report tools. -The file format consists of the '[dispersion]' module as the header and available parameters. -Any line that begins with a '#' symbol is ignored. +The file format consists of the '[dispersion]' module as the header and available parameters. +Any line that begins with a '#' symbol is ignored. .SH PARAMETERS -.PD 1 +.PD 1 .RS 0 +.IP "\fBauth_version\fR" +Authentication system API version. The default is 1.0. .IP "\fBauth_url\fR" -Authentication system URL -.IP "\fBauth_user\fR" +Authentication system URL +.IP "\fBauth_user\fR" Authentication system account/user name .IP "\fBauth_key\fR" -Authentication system account/user password +Authentication system account/user password +.IP "\fBproject_name\fR" +Project name in case of keystone auth version 3 +.IP "\fBproject_domain_name\fR" +Project domain name in case of keystone auth version 3 +.IP "\fBuser_domain_name\fR" +User domain name in case of keystone auth version 3 +.IP "\fBendpoint_type\fR" +The default is 'publicURL'. +.IP "\fBkeystone_api_insecure\fR" +The default is false. .IP "\fBswift_dir\fR" -Location of openstack-swift configuration and ring files +Location of OpenStack Swift configuration and ring files .IP "\fBdispersion_coverage\fR" -Percentage of partition coverage to use. The default is 1. +Percentage of partition coverage to use. The default is 1.0. .IP "\fBretries\fR" -Maximum number of attempts +Maximum number of attempts. The defaul is 5. .IP "\fBconcurrency\fR" Concurrency to use. The default is 25. +.IP "\fBcontainer_populate\fR" +The default is true. +.IP "\fBobject_populate\fR" +The default is true. .IP "\fBdump_json\fR" Whether to output in json format. The default is no. .IP "\fBcontainer_report\fR" @@ -60,30 +76,34 @@ Whether to run the object report. The default is yes. 
.PD .SH SAMPLE -.PD 0 +.PD 0 .RS 0 .IP "[dispersion]" .IP "auth_url = https://127.0.0.1:443/auth/v1.0" .IP "auth_user = dpstats:dpstats" .IP "auth_key = dpstats" .IP "swift_dir = /etc/swift" -.IP "# dispersion_coverage = 1" +.IP "# keystone_api_insecure = no" +.IP "# project_name = dpstats" +.IP "# project_domain_name = default" +.IP "# user_domain_name = default" +.IP "# dispersion_coverage = 1.0" .IP "# retries = 5" .IP "# concurrency = 25" .IP "# dump_json = no" .IP "# container_report = yes" .IP "# object_report = yes" .RE -.PD +.PD + - .SH DOCUMENTATION .LP More in depth documentation about the swift-dispersion utilities and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html#cluster-health -and -.BI http://swift.openstack.org +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html#dispersion-report +and +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/object-expirer.conf.5 b/doc/manpages/object-expirer.conf.5 index 8de307ee22..42ca4e0756 100644 --- a/doc/manpages/object-expirer.conf.5 +++ b/doc/manpages/object-expirer.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2012 OpenStack, LLC. +.\" Copyright (c) 2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,13 +14,13 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH object-expirer.conf 5 "03/15/2012" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B object-expirer.conf -\- configuration file for the openstack-swift object exprier daemon +\- configuration file for the OpenStack Swift object expirer daemon @@ -30,54 +30,100 @@ -.SH DESCRIPTION +.SH DESCRIPTION .PP -This is the configuration file used by the object expirer daemon. The daemon's -function is to query the internal hidden expiring_objects_account to discover +This is the configuration file used by the object expirer daemon. The daemon's +function is to query the internal hidden expiring_objects_account to discover objects that need to be deleted and to then delete them. The configuration file follows the python-pastedeploy syntax. The file is divided -into sections, which are enclosed by square brackets. Each section will contain a -certain number of key/value parameters which are described later. +into sections, which are enclosed by square brackets. Each section will contain a +certain number of key/value parameters which are described later. -Any line that begins with a '#' symbol is ignored. +Any line that begins with a '#' symbol is ignored. -You can find more information about python-pastedeploy configuration format at -\fIhttp://pythonpaste.org/deploy/#config-format\fR +You can find more information about python-pastedeploy configuration format at +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR .SH GLOBAL SECTION -.PD 1 +.PD 1 .RS 0 -This is indicated by section named [DEFAULT]. Below are the parameters that -are acceptable within this section. +This is indicated by section named [DEFAULT]. Below are the parameters that +are acceptable within this section. -.IP \fBswift_dir\fR +.IP \fBswift_dir\fR Swift configuration directory. The default is /etc/swift. -.IP \fBuser\fR -The system user that the object server will run as. The default is swift. 
-.IP \fBlog_name\fR +.IP \fBuser\fR +The system user that the object server will run as. The default is swift. +.IP \fBlog_name\fR Label used when logging. The default is swift. -.IP \fBlog_facility\fR +.IP \fBlog_facility\fR Syslog log facility. The default is LOG_LOCAL0. -.IP \fBlog_level\fR +.IP \fBlog_level\fR Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. +.IP \fBlog_max_line_length\fR +The following caps the length of log lines to the value given; no limit if +set to 0, the default. +.IP \fBlog_custom_handlers\fR +Comma separated list of functions to call to setup custom log handlers. +functions get passed: conf, name, log_to_console, log_route, fmt, logger, +adapted_logger. The default is empty. +.IP \fBlog_udp_host\fR +If set, log_udp_host will override log_address. +.IP "\fBlog_udp_port\fR +UDP log port, the default is 514. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. +.IP \fBlog_statsd_port\fR +The default is 8125. +.IP \fBlog_statsd_default_sample_rate\fR +The default is 1. +.IP \fBlog_statsd_sample_rate_factor\fR +The default is 1. +.IP \fBlog_statsd_metric_prefix\fR +The default is empty. +.IP \fBdelay_reaping_account/container\fR +Normally, the expirer begins reaping expired objects immediately. You can add +options prefixed with "delay_reaping_" in the form of +"delay_reaping_[/]" to cause the expirer to delay processing of +tasks in those account or account/container. The [/] part of the +config option names should url-quote the paths. The value is in seconds. The +default is no delay for any tasks. +.IP \fBround_robin_task_cache_size\fR +Number of tasks objects to cache before processing. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD .SH PIPELINE SECTION -.PD 1 +.PD 1 .RS 0 This is indicated by section name [pipeline:main]. Below are the parameters that -are acceptable within this section. +are acceptable within this section. .IP "\fBpipeline\fR" -It is used when you need to apply a number of filters. It is a list of filters +It is used when you need to apply a number of filters. It is a list of filters ended by an application. The default should be \fB"catch_errors cache proxy-server"\fR .RE .PD @@ -85,24 +131,38 @@ ended by an application. The default should be \fB"catch_errors cache proxy-serv .SH APP SECTION -.PD 1 +.PD 1 .RS 0 This is indicated by section name [app:object-server]. Below are the parameters that are acceptable within this section. .IP "\fBuse\fR" -Entry point for paste.deploy for the object server. This is the reference to the installed python egg. -The default is \fBegg:swift#proxy\fR. See proxy-server.conf-sample for options or See proxy-server.conf manpage. 
+Entry point for paste.deploy for the object server. This is the reference to the installed python egg. +The default is \fBegg:swift#proxy\fR. See proxy-server.conf-sample for options or See proxy-server.conf manpage. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD .SH FILTER SECTION -.PD 1 +.PD 1 .RS 0 Any section that has its name prefixed by "filter:" indicates a filter section. Filters are used to specify configuration parameters for specific swift middlewares. -Below are the filters available and respective acceptable parameters. +Below are the filters available and respective acceptable parameters. .RS 0 .IP "\fB[filter:cache]\fR" @@ -117,8 +177,8 @@ The default is \fBegg:swift#memcache\fR. See proxy-server.conf-sample for option .RE -.RS 0 -.IP "\fB[filter:catch_errors]\fR" +.RS 0 +.IP "\fB[filter:catch_errors]\fR" .RE .RS 3 .IP \fBuse\fR @@ -126,17 +186,79 @@ Entry point for paste.deploy for the catch_errors middleware. This is the refere The default is \fBegg:swift#catch_errors\fR. See proxy-server.conf-sample for options or See proxy-server.conf manpage. .RE +.RS 0 +.IP "\fB[filter:proxy-logging]\fR" +.RE + +Logging for the proxy server now lives in this middleware. +If the access_* variables are not set, logging directives from [DEFAULT] +without "access_" will be used. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the proxy_logging middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#proxy_logging\fR. See proxy-server.conf-sample for options or See proxy-server.conf manpage. +.RE + .PD +.SH OBJECT EXPIRER SECTION +.PD 1 +.RS 0 +.IP "\fB[object-expirer]\fR" +.RE +.RS 3 +.IP \fBinterval\fR +Replaces run_pause with the more standard "interval", which means the replicator won't pause unless it takes less than the interval set. The default is 300. +.IP \fBreport_interval\fR +The default is 300 seconds. +.IP \fBrequest_tries\fR +The number of times the expirer's internal client will +attempt any given request in the event of failure. The default is 3. +.IP \fBconcurrency\fR +Number of expirer workers to spawn. The default is 1. +.IP \fBprocesses\fR +Processes is how many parts to divide the work into, one part per process that will be doing the work. +Processes set 0 means that a single process will be doing all the work. +Processes can also be specified on the command line and will override the config value. +The default is 0. +.IP \fBprocess\fR +Process is which of the parts a particular process will work on process can also be specified +on the command line and will override the config value process is "zero based", if you want +to use 3 processes, you should run processes with process set to 0, 1, and 2. The default is 0. 
+.IP \fBreclaim_age\fR +The expirer will re-attempt expiring if the source object is not available +up to reclaim_age seconds before it gives up and deletes the entry in the +queue. The default is 604800 seconds. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. + +.RE +.PD + .SH DOCUMENTATION .LP More in depth documentation about the swift-object-expirer and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html -and -.BI http://swift.openstack.org +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html +and +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/object-server.conf.5 b/doc/manpages/object-server.conf.5 index 4087cde138..a8c743e6d4 100644 --- a/doc/manpages/object-server.conf.5 +++ b/doc/manpages/object-server.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ .SH NAME .LP .B object-server.conf -\- configuration file for the openstack-swift object server +\- configuration file for the OpenStack Swift object server @@ -33,7 +33,8 @@ .SH DESCRIPTION .PP This is the configuration file used by the object server and other object -background services, such as; replicator, updater and auditor. +background services, such as; replicator, reconstructor, updater, auditor, and +expirer. The configuration file follows the python-pastedeploy syntax. The file is divided into sections, which are enclosed by square brackets. Each section will contain a @@ -42,7 +43,7 @@ certain number of key/value parameters which are described later. Any line that begins with a '#' symbol is ignored. You can find more information about python-pastedeploy configuration format at -\fIhttp://pythonpaste.org/deploy/#config-format\fR +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR @@ -56,20 +57,39 @@ are acceptable within this section. IP address the object server should bind to. The default is 0.0.0.0 which will make it bind to all available addresses. .IP "\fBbind_port\fR" -TCP port the object server should bind to. The default is 6000. +TCP port the object server should bind to. The default is 6200. +.IP "\fBkeep_idle\fR" +Value to set for socket TCP_KEEPIDLE. The default value is 600. +.IP "\fBbind_timeout\fR" +Timeout to bind socket. The default is 30. .IP \fBbacklog\fR TCP backlog. Maximum number of allowed pending connections. The default value is 4096. 
.IP \fBworkers\fR -Number of object server workers to fork. The default is 1. +The number of pre-forked processes that will accept connections. Zero means +no fork. The default is auto which will make the server try to match the +number of effective cpu cores if python multiprocessing is available (included +with most python distributions >= 2.6) or fallback to one. It's worth noting +that individual workers will use many eventlet co-routines to service multiple +concurrent requests. +.IP \fBmax_clients\fR +Maximum number of clients one worker can process simultaneously (it will +actually accept(2) N + 1). Setting this to one (1) will only handle one request +at a time, without accepting another request concurrently. The default is 1024. .IP \fBuser\fR The system user that the object server will run as. The default is swift. .IP \fBswift_dir\fR Swift configuration directory. The default is /etc/swift. .IP \fBdevices\fR -Parent directory or where devices are mounted. Default is /srv/node. +Parent directory of where devices are mounted. Default is /srv/node. .IP \fBmount_check\fR Whether or not check if the devices are mounted to prevent accidentally writing to the root device. The default is set to true. +.IP \fBdisable_fallocate\fR +Disable pre-allocate disk space for a file. The default is false. +.IP \fBservers_per_port\fR +Make object-server run this many worker processes per unique port of "local" +ring devices across all storage policies. The default value of 0 disables this +feature. .IP \fBlog_name\fR Label used when logging. The default is swift. .IP \fBlog_facility\fR @@ -78,6 +98,70 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. +.IP \fBlog_max_line_length\fR +The following caps the length of log lines to the value given; no limit if +set to 0, the default. +.IP \fBlog_custom_handlers\fR +Comma separated list of functions to call to setup custom log handlers. +functions get passed: conf, name, log_to_console, log_route, fmt, logger, +adapted_logger. The default is empty. +.IP \fBlog_udp_host\fR +If set, log_udp_host will override log_address. +.IP "\fBlog_udp_port\fR +UDP log port, the default is 514. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. +.IP \fBlog_statsd_port\fR +The default is 8125. +.IP \fBlog_statsd_default_sample_rate\fR +The default is 1. +.IP \fBlog_statsd_sample_rate_factor\fR +The default is 1. +.IP \fBlog_statsd_metric_prefix\fR +The default is empty. +.IP \fBeventlet_debug\fR +Debug mode for eventlet library. The default is false. +.IP \fBfallocate_reserve\fR +You can set fallocate_reserve to the number of bytes or percentage of disk +space you'd like fallocate to reserve, whether there is space for the given +file size or not. Percentage will be used if the value ends with a '%'. +The default is 1%. +.IP \fBnode_timeout\fR +Request timeout to external services. The default is 3 seconds. +.IP \fBconn_timeout\fR +Connection timeout to external services. The default is 0.5 seconds. +.IP \fBcontainer_update_timeout\fR +Time to wait while sending a container update on object update. The default is 1 second. +.IP \fBclient_timeout\fR +Time to wait while receiving each chunk of data from a client or another +backend node. The default is 60. +.IP \fBnetwork_chunk_size\fR +The default is 65536. 
+.IP \fBdisk_chunk_size\fR +The default is 65536. +.IP \fBreclaim_age\fR +Time elapsed in seconds before an object can be reclaimed. The default is +604800 seconds. +.IP \fBcommit_window\fR +Time in seconds during which a newly written non-durable data file will not be +reclaimed. The value should be greater than zero and much less than +reclaim_age. The default is 60.0 seconds. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -106,20 +190,85 @@ that are acceptable within this section. .IP "\fBuse\fR" Entry point for paste.deploy for the object server. This is the reference to the installed python egg. This is normally \fBegg:swift#object\fR. -.IP "\fBset log_name\fR +.IP "\fBset log_name\fR" Label used when logging. The default is object-server. -.IP "\fBset log_facility\fR +.IP "\fBset log_facility\fR" Syslog log facility. The default is LOG_LOCAL0. -.IP "\fB set log_level\fR +.IP "\fBset log_level\fR" Logging level. The default is INFO. -.IP "\fB set log_requests\fR +.IP "\fBset log_requests\fR" Enables request logging. The default is True. -.IP "\fB set log_address\fR +.IP "\fBset log_address\fR" Logging address. The default is /dev/log. +.IP "\fBmax_upload_time\fR" +The default is 86400. +.IP "\fBslow\fR" +The default is 0. +.IP "\fBkeep_cache_size\fR" +Objects smaller than this are not evicted from the buffercache once read. The default is 5242880. +.IP "\fBkeep_cache_private\fR" +If true, objects for authenticated GET requests may be kept in buffer cache +if small enough. The default is false. +.IP "\fBmb_per_sync\fR" +On PUTs, sync data every n MB. The default is 512. +.IP "\fBallowed_headers\fR" +Comma separated list of headers that can be set in metadata on an object. +This list is in addition to X-Object-Meta-* headers and cannot include Content-Type, etag, Content-Length, or deleted. +The default is 'Content-Disposition, Content-Encoding, X-Delete-At, X-Object-Manifest, X-Static-Large-Object, Cache-Control, Content-Language, Expires, X-Robots-Tag'. +.IP "\fBreplication_server\fR" +Configure parameter for creating specific server +To handle all verbs, including replication verbs, do not specify +"replication_server" (this is the default). To only handle replication, +set to a True value (e.g. "True" or "1"). To handle only non-replication +verbs, set to "False". Unless you have a separate replication network, you +should not specify any value for "replication_server". +.IP "\fBreplication_concurrency\fR" +Set to restrict the number of concurrent incoming SSYNC requests +Set to 0 for unlimited (the default is 4). Note that SSYNC requests are only used +by the object reconstructor or the object replicator when configured to use ssync. 
+.IP "\fBreplication_concurrency_per_device\fR" +Set to restrict the number of concurrent incoming SSYNC requests per device; +set to 0 for unlimited requests per devices. This can help control I/O to each +device. This does not override replication_concurrency described above, so you +may need to adjust both parameters depending on your hardware or network +capacity. Defaults to 1. +.IP "\fBreplication_lock_timeout\fR" +Number of seconds to wait for an existing replication device lock before +giving up. The default is 15. +.IP "\fBreplication_failure_threshold\fR" +.IP "\fBreplication_failure_ratio\fR" +These two settings control when the SSYNC subrequest handler will +abort an incoming SSYNC attempt. An abort will occur if there are at +least threshold number of failures and the value of failures / successes +exceeds the ratio. The defaults of 100 and 1.0 means that at least 100 +failures have to occur and there have to be more failures than successes for +an abort to occur. +.IP "\fBsplice\fR" +Use splice() for zero-copy object GETs. This requires Linux kernel +version 3.0 or greater. If you set "splice = yes" but the kernel +does not support it, error messages will appear in the object server +logs at startup, but your object servers should continue to function. +The default is false. .IP \fBnode_timeout\fR Request timeout to external services. The default is 3 seconds. .IP \fBconn_timeout\fR Connection timeout to external services. The default is 0.5 seconds. +.IP \fBcontainer_update_timeout\fR +Time to wait while sending a container update on object update. The default is 1 second. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -153,9 +302,41 @@ This is normally \fBegg:swift#recon\fR. The recon_cache_path simply sets the directory where stats for a few items will be stored. Depending on the method of deployment you may need to create this directory manually and ensure that swift has read/write. The default is /var/cache/swift. +.IP "\fBrecon_lock_path\fR" +The default is /var/lock. .RE .PD +.RS 0 +.IP "\fB[filter:xprofile]\fR" +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy for the xprofile middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#xprofile\fR. +.IP "\fBprofile_module\fR" +This option enable you to switch profilers which should inherit from python +standard profiler. Currently the supported value can be 'cProfile', 'eventlet.green.profile' etc. +.IP "\fBlog_filename_prefix\fR" +This prefix will be used to combine process ID and timestamp to name the +profile data file. Make sure the executing user has permission to write +into this path (missing path segments will be created, if necessary). 
+If you enable profiling in more than one type of daemon, you must override +it with an unique value like, the default is /var/log/swift/profile/account.profile. +.IP "\fBdump_interval\fR" +The profile data will be dumped to local disk based on above naming rule +in this interval. The default is 5.0. +.IP "\fBdump_timestamp\fR" +Be careful, this option will enable profiler to dump data into the file with +time stamp which means there will be lots of files piled up in the directory. +The default is false +.IP "\fBpath\fR" +This is the path of the URL to access the mini web UI. The default is __profile__. +.IP "\fBflush_at_shutdown\fR" +Clear the data when the wsgi server shutdown. The default is false. +.IP "\fBunwind\fR" +Unwind the iterator of applications. Default is false. +.RE +.PD .SH ADDITIONAL SECTIONS @@ -174,35 +355,130 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. -.IP \fBvm_test_mode\fR -Indicates that you are using a VM environment. The default is no. .IP \fBdaemonize\fR Whether or not to run replication as a daemon. The default is yes. -.IP \fBrun_pause\fR +.IP "\fBrun_pause [deprecated]\fR" +Time in seconds to wait between replication passes. The default is 30. +.IP \fBinterval\fR Time in seconds to wait between replication passes. The default is 30. .IP \fBconcurrency\fR Number of replication workers to spawn. The default is 1. .IP \fBstats_interval\fR Interval in seconds between logging replication statistics. The default is 300. +.IP \fBsync_method\fR +The sync method to use; default is rsync but you can use ssync to try the +EXPERIMENTAL all-swift-code-no-rsync-callouts method. Once ssync is verified +as having performance comparable to, or better than, rsync, we plan to +deprecate rsync so we can move on with more features for replication. .IP \fBrsync_timeout\fR Max duration of a partition rsync. The default is 900 seconds. .IP \fBrsync_io_timeout\fR Passed to rsync for I/O OP timeout. The default is 30 seconds. +.IP \fBrsync_compress\fR +Allow rsync to compress data which is transmitted to destination node +during sync. However, this is applicable only when destination node is in +a different region than the local one. +NOTE: Objects that are already compressed (for example: .tar.gz, .mp3) might +slow down the syncing process. The default is false. +.IP \fBrsync_module\fR +Format of the rsync module where the replicator will send data. See +etc/rsyncd.conf-sample for some usage examples. The default is empty. +.IP \fBnode_timeout\fR +Request timeout to external services. The default is 10 seconds. +.IP \fBrsync_bwlimit\fR +Passed to rsync for bandwidth limit in kB/s. The default is 0 (unlimited). .IP \fBhttp_timeout\fR Max duration of an HTTP request. The default is 60 seconds. .IP \fBlockup_timeout\fR Attempts to kill all workers if nothing replicates for lockup_timeout seconds. The default is 1800 seconds. -.IP \fBreclaim_age\fR -Time elapsed in seconds before an object can be reclaimed. The default is -604800 seconds. -.IP \fBrecon_enable\fR -Enable logging of replication stats for recon. The default is on. +.IP \fBring_check_interval\fR +The default is 15. +.IP \fBrsync_error_log_line_length\fR +Limits how long rsync error log lines are. 0 (default) means to log the entire line. +.IP "\fBrecon_cache_path\fR" +The recon_cache_path simply sets the directory where stats for a few items will be stored. 
+Depending on the method of deployment you may need to create this directory manually +and ensure that swift has read/write.The default is /var/cache/swift. +.IP "\fBhandoffs_first\fR" +The flag to replicate handoffs prior to canonical partitions. +It allows one to force syncing and deleting handoffs quickly. +If set to a True value(e.g. "True" or "1"), partitions +that are not supposed to be on the node will be replicated first. +The default is false. +.IP "\fBhandoff_delete\fR" +The number of replicas which are ensured in swift. +If the number less than the number of replicas is set, object-replicator +could delete local handoffs even if all replicas are not ensured in the +cluster. Object-replicator would remove local handoff partition directories +after syncing partition when the number of successful responses is greater +than or equal to this number. By default(auto), handoff partitions will be +removed when it has successfully replicated to all the canonical nodes. + +The handoffs_first and handoff_delete are options for a special case +such as disk full in the cluster. These two options SHOULD NOT BE +CHANGED, except for such an extreme situations. (e.g. disks filled up +or are about to fill up. Anyway, DO NOT let your drives fill up). +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. +.RE + + +.RS 0 +.IP "\fB[object-reconstructor]\fR" +.RE +.RS 3 +.IP \fBlog_name\fR +Label used when logging. The default is object-reconstructor. +.IP \fBlog_facility\fR +Syslog log facility. The default is LOG_LOCAL0. +.IP \fBlog_level\fR +Logging level. The default is INFO. +.IP \fBlog_address\fR +Logging address. The default is /dev/log. +.IP \fBdaemonize\fR +Whether or not to run replication as a daemon. The default is yes. +.IP "\fBrun_pause [deprecated]\fR" +Time in seconds to wait between replication passes. The default is 30. +.IP \fBinterval\fR +Time in seconds to wait between replication passes. The default is 30. +.IP \fBconcurrency\fR +Number of replication workers to spawn. The default is 1. +.IP \fBstats_interval\fR +Interval in seconds between logging replication statistics. The default is 300. +.IP \fBnode_timeout\fR +Request timeout to external services. The default is 10 seconds. +.IP \fBhttp_timeout\fR +Max duration of an HTTP request. The default is 60 seconds. +.IP \fBlockup_timeout\fR +Attempts to kill all workers if nothing replicates for lockup_timeout seconds. The +default is 1800 seconds. +.IP \fBring_check_interval\fR +The default is 15. .IP "\fBrecon_cache_path\fR" The recon_cache_path simply sets the directory where stats for a few items will be stored. Depending on the method of deployment you may need to create this directory manually and ensure that swift has read/write.The default is /var/cache/swift. 
+.IP "\fBhandoffs_first\fR" +The flag to replicate handoffs prior to canonical partitions. +It allows one to force syncing and deleting handoffs quickly. +If set to a True value(e.g. "True" or "1"), partitions +that are not supposed to be on the node will be replicated first. +The default is false. .RE +.PD .RS 0 @@ -220,13 +496,31 @@ Logging address. The default is /dev/log. .IP \fBinterval\fR Minimum time for a pass to take. The default is 300 seconds. .IP \fBconcurrency\fR -Number of reaper workers to spawn. The default is 1. +Number of updater workers to spawn. The default is 1. .IP \fBnode_timeout\fR Request timeout to external services. The default is 10 seconds. -.IP \fBconn_timeout\fR -Connection timeout to external services. The default is 0.5 seconds. -.IP \fBslowdown = 0.01\fR -Slowdown will sleep that amount between objects. The default is 0.01 seconds. +.IP \fBobjects_per_second\fR +Maximum objects updated per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 50. +.IP "\fBslowdown [deprecated]\fR" +Slowdown will sleep that amount between objects. The default is 0.01 seconds. Deprecated in favor of objects_per_second. +.IP "\fBrecon_cache_path\fR" +The recon_cache_path simply sets the directory where stats for a few items will be stored. +Depending on the method of deployment you may need to create this directory manually +and ensure that swift has read/write. The default is /var/cache/swift. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -244,31 +538,125 @@ Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. +.IP \fBdisk_chunk_size\fR +The default is 65536. .IP \fBfiles_per_second\fR Maximum files audited per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 20. .IP \fBbytes_per_second\fR Maximum bytes audited per second. Should be tuned according to individual system specs. 0 is unlimited. The default is 10000000. +.IP \fBconcurrency\fR +Number of auditor workers to spawn. The default is 1. .IP \fBlog_time\fR The default is 3600 seconds. .IP \fBzero_byte_files_per_second\fR The default is 50. +.IP "\fBrecon_cache_path\fR" +The recon_cache_path simply sets the directory where stats for a few items will be stored. +Depending on the method of deployment you may need to create this directory manually +and ensure that swift has read/write. The default is /var/cache/swift. +.IP \fBobject_size_stats\fR +Takes a comma separated list of ints. If set, the object auditor will +increment a counter for every object whose size is <= to the given break +points and report the result after a full scan. +.IP \fBrsync_tempfile_timeout\fR +Time elapsed in seconds before rsync tempfiles will be unlinked. 
Config value of "auto" +will try to use object-replicator's rsync_timeout + 900 or fall-back to 86400 (1 day). +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE +.RS 0 +.IP "\fB[object-expirer]\fR" +.RE +.RS 3 +.IP \fBlog_name\fR +Label used when logging. The default is object-expirer. +.IP \fBlog_facility\fR +Syslog log facility. The default is LOG_LOCAL0. +.IP \fBlog_level\fR +Logging level. The default is INFO. +.IP \fBlog_address\fR +Logging address. The default is /dev/log. +.IP \fBinterval\fR +Minimum time for a pass to take. The default is 300 seconds. +.IP \fBreport_interval\fR +Minimum time for a pass to report. The default is 300 seconds. +.IP \fBrequest_tries\fR +The number of times the expirer's internal client will +attempt any given request in the event of failure. The default is 3. +.IP \fBconcurrency\fR +Number of expirer workers to spawn. The default is 1. +.IP \fBdequeue_from_legacy\fR +The flag to execute legacy style expirer tasks. The default is false. +.IP \fBprocesses\fR +Processes can only be used in conjunction with `dequeue_from_legacy`. +Processes is how many parts to divide the legacy work into, one part per process that will be doing the work. +Processes set 0 means that a single process will be doing all the legacy work. +Processes can also be specified on the command line and will override the config value. +The default is 0. +.IP \fBprocess\fR +Process can only be used in conjunction with `dequeue_from_legacy`. +Process is which of the parts a particular legacy process will work on process can also be specified +on the command line and will override the config value process is "zero based", if you want +to use 3 processes, you should run processes with process set to 0, 1, and 2. The default is 0. +.IP \fBreclaim_age\fR +The expirer will re-attempt expiring if the source object is not available up +to reclaim_age seconds before it gives up and deletes the task in the queue. +The default is 604800 seconds (= 1 week). +.IP \fBdelay_reaping_account/container\fR +Normally, the expirer begins reaping expired objects immediately. You can add +options prefixed with "delay_reaping_" in the form of +"delay_reaping_[/]" to cause the expirer to delay processing of +tasks in those account or account/container. The [/] part of the +config option names should url-quote the paths. The value is in seconds. The +default is no delay for any tasks. +.IP \fBround_robin_task_cache_size\fR +Number of tasks objects to cache before processing. +.IP \fBrecon_cache_path\fR +Path to recon cache directory. The default is /var/cache/swift +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. 
+.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. +.RE +.PD + + .SH DOCUMENTATION .LP More in depth documentation about the swift-object-server and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html and -.BI http://swift.openstack.org +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" .BR swift-object-server(1), - - diff --git a/doc/manpages/proxy-server.conf.5 b/doc/manpages/proxy-server.conf.5 index 2d077e0e45..6fe7c93163 100644 --- a/doc/manpages/proxy-server.conf.5 +++ b/doc/manpages/proxy-server.conf.5 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ .SH NAME .LP .B proxy-server.conf -\- configuration file for the openstack-swift proxy server +\- configuration file for the OpenStack Swift proxy server @@ -41,7 +41,7 @@ certain number of key/value parameters which are described later. Any line that begins with a '#' symbol is ignored. You can find more information about python-pastedeploy configuration format at -\fIhttp://pythonpaste.org/deploy/#config-format\fR +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR @@ -56,12 +56,38 @@ IP address the proxy server should bind to. The default is 0.0.0.0 which will ma it bind to all available addresses. .IP "\fBbind_port\fR" TCP port the proxy server should bind to. The default is 80. +.IP "\fBkeep_idle\fR" +Value to set for socket TCP_KEEPIDLE. The default value is 600. +.IP "\fBbind_timeout\fR" +Timeout to bind socket. The default is 30. .IP \fBbacklog\fR TCP backlog. Maximum number of allowed pending connections. The default value is 4096. +.IP \fBadmin_key\fR +Key to use for admin calls that are HMAC signed. Default is empty, +which will disable admin calls to /info. +.IP \fBdisallowed_sections\fR +Allows the ability to withhold sections from showing up in the public calls +to /info. You can withhold subsections by separating the dict level with a +".". The following would cause the sections 'container_quotas' and 'tempurl' +to not be listed, and the key max_failed_deletes would be removed from +bulk_delete. Default value is 'swift.valid_api_versions' which allows all +registered features to be listed via HTTP GET /info except +swift.valid_api_versions information .IP \fBworkers\fR -Number of container server workers to fork. The default is 1. +The number of pre-forked processes that will accept connections. Zero means +no fork. The default is auto which will make the server try to match the +number of effective cpu cores if python multiprocessing is available (included +with most python distributions >= 2.6) or fallback to one. 
It's worth noting +that individual workers will use many eventlet co-routines to service multiple +concurrent requests. +.IP \fBmax_clients\fR +Maximum number of clients one worker can process simultaneously (it will +actually accept(2) N + 1). Setting this to one (1) will only handle one request +at a time, without accepting another request concurrently. The default is 1024. .IP \fBuser\fR -The system user that the container server will run as. The default is swift. +The system user that the proxy server will run as. The default is swift. +.IP \fBexpose_info\fR +Enables exposing configuration settings via HTTP GET /info. The default is true. .IP \fBswift_dir\fR Swift configuration directory. The default is /etc/swift. .IP \fBcert_file\fR @@ -78,6 +104,62 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP \fBlog_address\fR Logging address. The default is /dev/log. +.IP \fBlog_max_line_length\fR +To cap the length of log lines to the value given. No limit if set to 0, the default. +.IP \fBlog_headers\fR +The default is false. +.IP \fBlog_custom_handlers\fR +Comma separated list of functions to call to setup custom log handlers. +functions get passed: conf, name, log_to_console, log_route, fmt, logger, +adapted_logger. The default is empty. +.IP \fBlog_udp_host\fR +If set, log_udp_host will override log_address. +.IP "\fBlog_udp_port\fR +UDP log port, the default is 514. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. +.IP \fBlog_statsd_port\fR +The default is 8125. +.IP \fBlog_statsd_default_sample_rate\fR +The default is 1. +.IP \fBlog_statsd_sample_rate_factor\fR +The default is 1. +.IP \fBlog_statsd_metric_prefix\fR +The default is empty. +.IP \fBclient_timeout\fR +Time to wait while receiving each chunk of data from a client or another +backend node. The default is 60. +.IP \fBeventlet_debug\fR +Debug mode for eventlet library. The default is false. +.IP \fBtrans_id_suffix\fR +This optional suffix (default is empty) that would be appended to the swift transaction +id allows one to easily figure out from which cluster that X-Trans-Id belongs to. +This is very useful when one is managing more than one swift cluster. +.IP \fBcors_allow_origin\fR +List of origin hosts that are allowed for CORS requests in addition to what +the container has set. Use a comma separated list of full URL (http://foo.bar:1234,https://foo.bar) +.IP \fBstrict_cors_mode\fR +If True (default) then CORS requests are only allowed if their Origin header +matches an allowed origin. Otherwise, any Origin is allowed. +.IP \fBcors_expose_headers\fR +Comma separated list of headers to expose through Access-Control-Expose-Headers, +in addition to the defaults and any headers set in container metadata. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Work only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. 
The higher the value, the lower +the I/O priority of the process. Work only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set. .RE .PD @@ -91,8 +173,13 @@ are acceptable within this section. .IP "\fBpipeline\fR" It is used when you need apply a number of filters. It is a list of filters -ended by an application. The normal pipeline is "catch_errors healthcheck -cache ratelimit tempauth proxy-logging proxy-server". +ended by an application. The normal pipeline is "catch_errors gatekeeper healthcheck proxy-logging cache container_sync bulk tempurl ratelimit tempauth container-quotas account-quotas slo dlo versioned_writes proxy-logging proxy-server". + +Note: The double proxy-logging in the pipeline is not a mistake. The +left-most proxy-logging is there to log requests that were handled in +middleware and never made it through to the right-most middleware (and +proxy server). Double logging is prevented for normal requests. See +proxy-logging docs. .RE .PD @@ -114,6 +201,7 @@ This is normally \fBegg:swift#healthcheck\fR. An optional filesystem path which, if present, will cause the healthcheck URL to return "503 Service Unavailable" with a body of "DISABLED BY FILE". .RE +.PD .RS 0 @@ -141,20 +229,35 @@ systems are in use for one Swift cluster. The default is AUTH. .IP \fBauth_prefix\fR The auth prefix will cause requests beginning with this prefix to be routed to the auth subsystem, for granting tokens, etc. The default is /auth/. +.IP \fBrequire_group\fR +The require_group parameter names a group that must be presented by +either X-Auth-Token or X-Service-Token. Usually this parameter is +used only with multiple reseller prefixes (e.g., SERVICE_require_group=blah). +By default, no group is needed. Do not use .admin. .IP \fBtoken_life\fR This is the time in seconds before the token expires. The default is 86400. -.IP \fBallowed_sync_hosts\fR -This is a comma separated list of hosts allowed to send X-Container-Sync-Key requests. +.IP \fBallow_overrides\fR +This allows middleware higher in the WSGI pipeline to override auth +processing, useful for middleware such as tempurl and formpost. If you know +you're not going to use such middleware and you want a bit of extra security, +you can set this to false. The default is true. +.IP \fBstorage_url_scheme\fR +This specifies what scheme to return with storage urls: +http, https, or default (chooses based on what the server is running as) +This can be useful with an SSL load balancer in front of a non-SSL server. .IP \fBuser__\fR Lastly, you need to list all the accounts/users you want here. The format is: user__ = [group] [group] [...] [storage_url] +or if you want underscores in or , you can base64 encode them +(with no equal signs) and use this format: +user64__ = [group] [group] [...] [storage_url] There are special groups of: \fI.reseller_admin\fR who can do anything to any account for this auth and also \fI.admin\fR who can do anything within the account. If neither of these groups are specified, the user can only access containers that have been explicitly allowed for them by a \fI.admin\fR or \fI.reseller_admin\fR. -The trailing optional storage_url allows you to specify an alternate url to hand +The trailing optional storage_url allows you to specify an alternate URL to hand back to the user upon authentication. 
If not specified, this defaults to \fIhttp[s]://:/v1/_\fR where http or https depends on whether cert_file is specified in the [DEFAULT] section, and are based @@ -173,6 +276,113 @@ Here are example entries, required for running the tests: .RE .PD +.RS 0 +.IP "\fB[filter:authtoken]\fR" +.RE + +To enable Keystone authentication you need to have the auth token +middleware first to be configured. Here is an example below, please +refer to the keystone's documentation for details about the +different settings. + +You'll need to have as well the keystoneauth middleware enabled +and have it in your main pipeline so instead of having tempauth in +there you can change it to: authtoken keystoneauth + +The auth credentials ("project_domain_name", "user_domain_name", "username", +"project_name", "password") must match the Keystone credentials for the Swift +service. The example values shown here assume a user named "swift" with admin +role on a project named "service", both being in the Keystone domain with id +"default". Refer to the KeystoneMiddleware documentation at +.BI https://docs.openstack.org/keystonemiddleware/latest/middlewarearchitecture.html#configuration +for other examples. + +.PD 0 +.RS 10 +.IP "paste.filter_factory = keystonemiddleware.auth_token:filter_factory" +.IP "www_authenticate_uri = http://keystonehost:5000" +.IP "auth_url = http://keystonehost:5000" +.IP "auth_plugin = password" +.IP "project_domain_id = default" +.IP "user_domain_id = default" +.IP "project_name = service" +.IP "username = swift" +.IP "password = password" +.IP "" +.IP "# delay_auth_decision defaults to False, but leaving it as false will" +.IP "# prevent other auth systems, staticweb, tempurl, formpost, and ACLs from" +.IP "# working. This value must be explicitly set to True." +.IP "delay_auth_decision = False" +.IP +.IP "cache = swift.cache" +.IP "include_service_catalog = False" +.RE +.PD + + +.RS 0 +.IP "\fB[filter:keystoneauth]\fR" +.RE + +Keystone authentication middleware. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the keystoneauth middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#keystoneauth\fR. +.IP \fBreseller_prefix\fR +The reseller_prefix option lists account namespaces that this middleware is +responsible for. The prefix is placed before the Keystone project id. +For example, for project 12345678, and prefix AUTH, the account is +named AUTH_12345678 (i.e., path is /v1/AUTH_12345678/...). +Several prefixes are allowed by specifying a comma-separated list +as in: "reseller_prefix = AUTH, SERVICE". The empty string indicates a +single blank/empty prefix. If an empty prefix is required in a list of +prefixes, a value of '' (two single quote characters) indicates a +blank/empty prefix. Except for the blank/empty prefix, an underscore ('_') +character is appended to the value unless already present. +.IP \fBoperator_roles\fR +The user must have at least one role named by operator_roles on a +project in order to create, delete and modify containers and objects +and to set and read privileged headers such as ACLs. +If there are several reseller prefix items, you can prefix the +parameter so it applies only to those accounts (for example +the parameter SERVICE_operator_roles applies to the /v1/SERVICE_ +path). If you omit the prefix, the option applies to all reseller +prefix items. For the blank/empty prefix, prefix with '' (do not put +underscore after the two single quote characters). 
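To make the prefix handling above concrete, here is a minimal keystoneauth sketch; the role names and the per-prefix SERVICE_ override are illustrative, not required:

    [filter:keystoneauth]
    use = egg:swift#keystoneauth
    # projects appear as AUTH_<project_id> and SERVICE_<project_id> accounts
    reseller_prefix = AUTH, SERVICE
    operator_roles = admin, swiftoperator
    # roles that apply only to the SERVICE_ reseller prefix (example role name)
    SERVICE_operator_roles = service_operator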
+.IP \fBreseller_admin_role\fR +The reseller admin role has the ability to create and delete accounts. +.IP \fBallow_overrides\fR +This allows middleware higher in the WSGI pipeline to override auth +processing, useful for middleware such as tempurl and formpost. If you know +you're not going to use such middleware and you want a bit of extra security, +you can set this to false. +.IP \fBservice_roles\fR +If the service_roles parameter is present, an X-Service-Token must be +present in the request that when validated, grants at least one role listed +in the parameter. The X-Service-Token may be scoped to any project. +If there are several reseller prefix items, you can prefix the +parameter so it applies only to those accounts (for example +the parameter SERVICE_service_roles applies to the /v1/SERVICE_ +path). If you omit the prefix, the option applies to all reseller +prefix items. For the blank/empty prefix, prefix with '' (do not put +underscore after the two single quote characters). +By default, no service_roles are required. +.IP \fBdefault_domain_id\fR +For backwards compatibility, keystoneauth will match names in cross-tenant +access control lists (ACLs) when both the requesting user and the tenant +are in the default domain i.e the domain to which existing tenants are +migrated. The default_domain_id value configured here should be the same as +the value used during migration of tenants to keystone domains. +.IP \fBallow_names_in_acls\fR +For a new installation, or an installation in which keystone projects may +move between domains, you should disable backwards compatible name matching +in ACLs by setting allow_names_in_acls to false: +.RE +.PD + + .RS 0 .IP "\fB[filter:cache]\fR" .RE @@ -191,27 +401,18 @@ Syslog log facility. The default is LOG_LOCAL0. Logging level. The default is INFO. .IP "\fBset log_address\fR" Logging address. The default is /dev/log. -.IP "\fBset log_headers\fR " +.IP "\fBset log_headers\fR" Enables the ability to log request headers. The default is False. +.IP \fBmemcache_max_connections\fR +Sets the maximum number of connections to each memcached server per worker. .IP \fBmemcache_servers\fR -If not set in the configuration file, the value for memcache_servers will be read from /etc/swift/memcache.conf (see memcache.conf-sample) or lacking that file, it will default to the value below. You can specify multiple servers separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211. This can be a list separated by commas. The default is 127.0.0.1:11211. -.IP \fBmemcache_serialization_support\fR -This sets how memcache values are serialized and deserialized: -.RE - -.PD 0 -.RS 10 -.IP "0 = older, insecure pickle serialization" -.IP "1 = json serialization but pickles can still be read (still insecure)" -.IP "2 = json serialization only (secure and the default)" +If not set in the configuration file, the value for memcache_servers will be +read from /etc/swift/memcache.conf (see memcache.conf-sample) or lacking that +file, it will default to 127.0.0.1:11211. You can specify multiple servers +separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211. (IPv6 +addresses must follow rfc3986 section-3.2.2, i.e. [::1]:11211) .RE - -.RS 10 -To avoid an instant full cache flush, existing installations should upgrade with 0, then set to 1 and reload, then after some time (24 hours) set to 2 and reload. In the future, the ability to use pickle serialization will be removed. 
- -If not set in the configuration file, the value for memcache_serialization_support will be read from /etc/swift/memcache.conf if it exists (see memcache.conf-sample). Otherwise, the default value as indicated above will be used. -.RE - +.PD .RS 0 @@ -251,22 +452,23 @@ rate but better average accuracy. The default is 5. .IP \fBaccount_ratelimit\fR If set, will limit PUT and DELETE requests to /account_name/container_name. Number is in requests per second. If set to 0 means disabled. The default is 0. -.IP \fBaccount_whitelist\fR -Comma separated lists of account names that will not be rate limited. The default is ''. -.IP \fBaccount_blacklist\fR -Comma separated lists of account names that will not be allowed. Returns a 497 response. -The default is ''. .IP \fBcontainer_ratelimit_size\fR When set with container_limit_x = r: for containers of size x, limit requests per second to r. Will limit PUT, DELETE, and POST requests to /a/c/o. The default is ''. +.IP \fBcontainer_listing_ratelimit_size\fR +Similarly to the above container-level write limits, the following will limit +container GET (listing) requests. .RE +.PD + .RS 0 .IP "\fB[filter:domain_remap]\fR" .RE -Middleware that translates container and account parts of a domain to path parameters that the proxy server understands. The container.account.storageurl/object gets translated to container.account.storageurl/path_root/account/container/object and account.storageurl/path_root/container/object gets translated to account.storageurl/path_root/account/container/object +Middleware that translates container and account parts of a domain to path parameters that the proxy server understands. +The container.account.storageurl/object gets translated to container.account.storageurl/path_root/account/container/object and account.storageurl/path_root/container/object gets translated to account.storageurl/path_root/account/container/object .RS 3 .IP \fBuse\fR @@ -274,12 +476,16 @@ Entry point for paste.deploy for the domain_remap middleware. This is the refere This is normally \fBegg:swift#domain_remap\fR. .IP "\fBset log_name\fR" Label used when logging. The default is domain_remap. +.IP "\fBset log_facility\fR" +Syslog log facility. The default is LOG_LOCAL0. +.IP "\fBset log_level\fR " +Logging level. The default is INFO. .IP "\fBset log_address\fR" Logging address. The default is /dev/log. -.IP "\fBset log_headers\fR" +.IP "\fBset log_headers\fR " Enables the ability to log request headers. The default is False. .IP \fBstorage_domain\fR -The domain to be used by the middleware. +The domain to be used by the middleware. Multiple domains can be specified separated by a comma. .IP \fBpath_root\fR The path root value for the storage URL. The default is v1. .IP \fBreseller_prefixes\fR @@ -287,11 +493,15 @@ Browsers can convert a host header to lowercase, so check that reseller prefix on the account is the correct case. This is done by comparing the items in the reseller_prefixes config option to the found prefix. If they match except for case, the item from reseller_prefixes will be used -instead of the found reseller prefix. The reseller_prefixes list is exclusive. -If defined, any request with an account prefix not in that list will be ignored -by this middleware. Defaults to 'AUTH'. +instead of the found reseller prefix. When none match, the default reseller +prefix is used. When no default reseller prefix is configured, any request with +an account prefix not in that list will be ignored by this middleware. +Defaults to 'AUTH'. 
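A short domain_remap sketch using the options above; the storage domain is a placeholder for your own, and the comment simply restates the translation described earlier:

    [filter:domain_remap]
    use = egg:swift#domain_remap
    storage_domain = example.com
    path_root = v1
    reseller_prefixes = AUTH
    # requests for <container>.<account>.example.com/<object> are remapped to
    # /v1/<account>/<container>/<object> before reaching the proxy server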
+.IP \fBdefault_reseller_prefix\fR +The default reseller prefix. This is used when none of the configured +reseller_prefixes match. When not set, no reseller prefix is added. .RE - +.PD .RS 0 @@ -312,7 +522,7 @@ Logging address. The default is /dev/log. .IP "\fBset log_headers\fR" Enables the ability to log request headers. The default is False. .RE - +.PD .RS 0 @@ -340,8 +550,12 @@ The domain to be used by the middleware. .IP \fBlookup_depth\fR How deep in the CNAME chain to look for something that matches the storage domain. The default is 1. +.IP \fBnameservers\fR +Specify the nameservers to use to do the CNAME resolution. If unset, the system +configuration is used. Multiple nameservers can be specified separated by a comma. +Default is unset. .RE - +.PD .RS 0 @@ -354,8 +568,6 @@ Note: Put staticweb just after your auth filter(s) in the pipeline .IP \fBuse\fR Entry point for paste.deploy for the staticweb middleware. This is the reference to the installed python egg. This is normally \fBegg:swift#staticweb\fR. -.IP \fBcache_timeout\fR -Seconds to cache container x-container-meta-web-* header values. The default is 300 seconds. .IP "\fBset log_name\fR" Label used when logging. The default is staticweb. .IP "\fBset log_facility\fR" @@ -366,23 +578,22 @@ Logging level. The default is INFO. Logging address. The default is /dev/log. .IP "\fBset log_headers\fR" Enables the ability to log request headers. The default is False. -.IP "\fBset access_log_name\fR" -Label used when logging. The default is staticweb. -.IP "\fBset access_log_facility\fR" -Syslog log facility. The default is LOG_LOCAL0. -.IP "\fBset access_log_level\fR " -Logging level. The default is INFO. .RE - +.PD .RS 0 .IP "\fB[filter:tempurl]\fR" .RE -Note: Put tempurl just before your auth filter(s) in the pipeline +Note: Put tempurl before slo, dlo, and your auth filter(s) in the pipeline .RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the tempurl middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#tempurl\fR. +.IP \fBmethods\fR +The methods allowed with Temp URLs. The default is 'GET HEAD PUT POST DELETE'. .IP \fBincoming_remove_headers\fR The headers to remove from incoming requests. Simply a whitespace delimited list of header names and names can optionally end with '*' to indicate a prefix match. incoming_allow_headers is a list of exceptions to these removals. .IP \fBincoming_allow_headers\fR @@ -391,9 +602,8 @@ The headers allowed as exceptions to incoming_remove_headers. Simply a whitespac The headers to remove from outgoing responses. Simply a whitespace delimited list of header names and names can optionally end with '*' to indicate a prefix match. outgoing_allow_headers is a list of exceptions to these removals. .IP "\fBoutgoing_allow_headers\fR" The headers allowed as exceptions to outgoing_remove_headers. Simply a whitespace delimited list of header names and names can optionally end with '*' to indicate a prefix match. -.IP "\fBset log_level\fR " .RE - +.PD .RS 0 @@ -407,6 +617,7 @@ Note: Put formpost just before your auth filter(s) in the pipeline Entry point for paste.deploy for the formpost middleware. This is the reference to the installed python egg. This is normally \fBegg:swift#formpost\fR. .RE +.PD @@ -421,16 +632,29 @@ Note: Just needs to be placed before the proxy-server in the pipeline. Entry point for paste.deploy for the name_check middleware. This is the reference to the installed python egg. This is normally \fBegg:swift#name_check\fR. 
.IP \fBforbidden_chars\fR -Characters that will not be allowed in a name. +Characters that will not be allowed in a name. The default is '"`<>. .IP \fBmaximum_length\fR -Maximum number of characters that can be in the name. +Maximum number of characters that can be in the name. The default is 255. .IP \fBforbidden_regexp\fR -Python regular expressions of substrings that will not be allowed in a name. +Python regular expressions of substrings that will not be allowed in a name. The default is /\./|/\.\./|/\.$|/\.\.$. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:list-endpoints]\fR" +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the list_endpoints middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#list_endpoints\fR. +.IP \fBlist_endpoints_path\fR +The default is '/endpoints/'. .RE +.PD .RS 0 -.IP "\fB[filter:proxy_logging]\fR" +.IP "\fB[filter:proxy-logging]\fR" .RE Logging for the proxy server now lives in this middleware. @@ -456,25 +680,304 @@ unset. Default is 514. .IP \fBaccess_log_statsd_host\fR You can use log_statsd_* from [DEFAULT], or override them here. -Default is localhost. +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBaccess_log_statsd_port\fR Default is 8125. .IP \fBaccess_log_statsd_default_sample_rate\fR Default is 1. -.IP \fBaccess_log_statsd_metric_prefix = +.IP \fBaccess_log_statsd_sample_rate_factor\fR +The default is 1. +.IP \fBaccess_log_statsd_metric_prefix\fR Default is "" (empty-string) .IP \fBaccess_log_headers\fR Default is False. +.IP \fBaccess_log_headers_only\fR +If access_log_headers is True and access_log_headers_only is set only +these headers are logged. Multiple headers can be defined as comma separated +list like this: access_log_headers_only = Host, X-Object-Meta-Mtime +.IP \fBreveal_sensitive_prefix\fR +By default, the X-Auth-Token is logged. To obscure the value, +set reveal_sensitive_prefix to the number of characters to log. +For example, if set to 12, only the first 12 characters of the +token appear in the log. An unauthorized access of the log file +won't allow unauthorized usage of the token. However, the first +12 or so characters is unique enough that you can trace/debug +token usage. Set to 0 to suppress the token completely (replaced +by '...' in the log). The default is 16 chars. +Note: reveal_sensitive_prefix will not affect the value logged with access_log_headers=True. .IP \fBlog_statsd_valid_http_methods\fR What HTTP methods are allowed for StatsD logging (comma-sep); request methods not in this list will have "BAD_METHOD" for the portion of the metric. Default is "GET,HEAD,POST,PUT,DELETE,COPY,OPTIONS". +.IP \fBlog_anonymization_method\fR +Hashing algorithm for anonymization. Must be one of algorithms supported by Python's hashlib. Default is MD5. +.IP \fBlog_anonymization_salt\fR +Salt added as prefix before hashing the value to anonymize. Default is empty (no salt). +.IP "\fBlog_msg_template\fR" +Template used to format access logs. All words surrounded by curly brackets will be substituted with the appropriate values. + .RE +.PD 0 +.RS 10 +.IP "Some keywords map to timestamps and can be converted to standard dates formats using the matching transformers: 'datetime', 'asctime' or 'iso8601'." +.IP "Other transformers for timestamps are 's', 'ms', 'us' and 'ns' for seconds, milliseconds, microseconds and nanoseconds." 
+.IP "Python's strftime directives can also be used as tranformers (a, A, b, B, c, d, H, I, j, m, M, p, S, U, w, W, x, X, y, Y, Z)." +.IP "Some keywords map to user data that could be anonymized by using the transformer 'anonymized'." +.IP "Keywords availables are:" +.PD 0 +.RS 7 +.IP "client_ip (anonymizable)" +.IP "remote_addr (anonymizable)" +.IP "method (request method)" +.IP "path (anonymizable)" +.IP "protocol" +.IP "status_int" +.IP "referer (anonymizable)" +.IP "user_agent (anonymizable)" +.IP "auth_token" +.IP "bytes_recvd (number of bytes received)" +.IP "bytes_sent (number of bytes sent)" +.IP "client_etag (anonymizable)" +.IP "transaction_id" +.IP "headers (anonymizable)" +.IP "request_time (difference between start and end timestamps) +.IP "source" +.IP "log_info" +.IP "start_time (timestamp at the receiving, timestamp)" +.IP "end_time (timestamp at the end of the treatment, timestamp)" +.IP "ttfb (duration between request and first bytes is sent)" +.IP "policy_index" +.IP "account (account name, anonymizable)" +.IP "container (container name, anonymizable)" +.IP "object (object name, anonymizable)" +.IP "pid (PID of the process emitting the log line)" +.PD +.RE + +.IP "Example: '{client_ip.anonymized} {remote_addr.anonymized} {start_time.iso8601} {end_time.H}:{end_time.M} {method} acc:{account} cnt:{container} obj:{object.anonymized}'" +.IP "Default: '{client_ip} {remote_addr} {end_time.datetime} {method} {path} {protocol} {status_int} {referer} {user_agent} {auth_token} {bytes_recvd} {bytes_sent} {client_etag} {transaction_id} {headers} {request_time} {source} {log_info} {start_time} {end_time} {policy_index}'" +.IP "Warning: A bad log message template will raise an error in initialization." +.RE +.PD +.RS 0 +.IP "\fB[filter:bulk]\fR" +.RE + +Note: Put before both ratelimit and auth in the pipeline. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the bulk middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#bulk\fR. +.IP \fBmax_containers_per_extraction\fR +The default is 10000. +.IP \fBmax_failed_extractions\fR +The default is 1000. +.IP \fBmax_deletes_per_request\fR +The default is 10000. +.IP \fBmax_failed_deletes\fR +The default is 1000. + +In order to keep a connection active during a potentially long bulk request, +Swift may return whitespace prepended to the actual response body. This +whitespace will be yielded no more than every yield_frequency seconds. +The default is 10. +.IP \fByield_frequency\fR + +.IP \fBdelete_container_retry_count\fR +Note: This parameter is used during a bulk delete of objects and +their container. This would frequently fail because it is very likely +that all replicated objects have not been deleted by the time the middleware got a +successful response. It can be configured the number of retries. And the +number of seconds to wait between each retry will be 1.5**retry +The default is 0. +.RE .PD +.RS 0 +.IP "\fB[filter:slo]\fR" +.RE + +Note: Put after auth and staticweb in the pipeline. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the slo middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#slo\fR. +.IP \fBmax_manifest_segments\fR +The default is 1000. +.IP \fBmax_manifest_size\fR +The default is 2097152. +.IP \fBmin_segment_size\fR +The default is 1048576 +.IP \fBrate_limit_after_segment\fR +Start rate-limiting object segments after the Nth segment of a segmented +object. The default is 10 segments. 
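For reference, a minimal slo filter section spelling out the defaults documented in this section (values shown only to make the knobs concrete):

    [filter:slo]
    use = egg:swift#slo
    max_manifest_segments = 1000
    max_manifest_size = 2097152
    min_segment_size = 1048576
    rate_limit_after_segment = 10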
+.IP \fBrate_limit_segments_per_sec\fR +Once segment rate-limiting kicks in for an object, limit segments served to N +per second. The default is 1. +.IP \fBmax_get_time\fR +Time limit on GET requests (seconds). The default is 86400. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:dlo]\fR" +.RE + +Note: Put after auth and staticweb in the pipeline. +If you don't put it in the pipeline, it will be inserted for you. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the dlo middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#dlo\fR. +.IP \fBrate_limit_after_segment\fR +Start rate-limiting object segments after the Nth segment of a segmented +object. The default is 10 segments. +.IP \fBrate_limit_segments_per_sec\fR +Once segment rate-limiting kicks in for an object, limit segments served to N +per second. The default is 1. +.IP \fBmax_get_time\fR +Time limit on GET requests (seconds). The default is 86400. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:container-quotas]\fR" +.RE + +Note: Put after auth in the pipeline. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the container_quotas middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#container_quotas\fR. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:account-quotas]\fR" +.RE + +Note: Put after auth in the pipeline. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the account_quotas middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#account_quotas\fR. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:gatekeeper]\fR" +.RE + +Note: this middleware requires python-dnspython + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the gatekeeper middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#gatekeeper\fR. +.IP "\fBset log_name\fR" +Label used when logging. The default is gatekeeper. +.IP "\fBset log_facility\fR" +Syslog log facility. The default is LOG_LOCAL0. +.IP "\fBset log_level\fR " +Logging level. The default is INFO. +.IP "\fBset log_address\fR" +Logging address. The default is /dev/log. +.IP "\fBset log_headers\fR" +Enables the ability to log request headers. The default is False. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:container_sync]\fR" +.RE + +Note: this middleware requires python-dnspython + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the container_sync middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#container_sync\fR. +.IP \fBallow_full_urls\fR +Set this to false if you want to disallow any full URL values to be set for +any new X-Container-Sync-To headers. This will keep any new full urls from +coming in, but won't change any existing values already in the cluster. +Updating those will have to be done manually, as knowing what the true realm +endpoint should be cannot always be guessed. The default is true. +.IP \fBcurrent\fR +Set this to specify this clusters //realm/cluster as "current" in /info +.RE +.PD + + +.RS 0 +.IP "\fB[filter:xprofile]\fR" +.RE + +Note: Put it at the beginning of the pipeline to profile all middleware. But it is safer to put this after healthcheck. + +.RS 3 +.IP "\fBuse\fR" +Entry point for paste.deploy for the xprofile middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#xprofile\fR. +.IP "\fBprofile_module\fR" +This option enable you to switch profilers which should inherit from python +standard profiler. 
Currently the supported value can be 'cProfile', 'eventlet.green.profile' etc. +.IP "\fBlog_filename_prefix\fR" +This prefix will be used to combine process ID and timestamp to name the +profile data file. Make sure the executing user has permission to write +into this path (missing path segments will be created, if necessary). +If you enable profiling in more than one type of daemon, you must override +it with an unique value like, the default is /var/log/swift/profile/account.profile. +.IP "\fBdump_interval\fR" +The profile data will be dumped to local disk based on above naming rule +in this interval. The default is 5.0. +.IP "\fBdump_timestamp\fR" +Be careful, this option will enable profiler to dump data into the file with +time stamp which means there will be lots of files piled up in the directory. +The default is false +.IP "\fBpath\fR" +This is the path of the URL to access the mini web UI. The default is __profile__. +.IP "\fBflush_at_shutdown\fR" +Clear the data when the wsgi server shutdown. The default is false. +.IP "\fBunwind\fR" +Unwind the iterator of applications. Default is false. +.RE +.PD + + +.RS 0 +.IP "\fB[filter:versioned_writes]\fR" +.RE + +Note: Put after slo, dlo in the pipeline. +If you don't put it in the pipeline, it will be inserted automatically. + +.RS 3 +.IP \fBuse\fR +Entry point for paste.deploy for the versioned_writes middleware. This is the reference to the installed python egg. +This is normally \fBegg:swift#versioned_writes\fR. +.IP \fBallow_versioned_writes\fR +Enables using versioned writes middleware and exposing configuration settings via HTTP GET /info. +WARNING: Setting this option bypasses the "allow_versions" option +in the container configuration file, which will be eventually +deprecated. See documentation for more details. +.RE +.PD .SH APP SECTION @@ -485,13 +988,13 @@ that are acceptable within this section. .IP \fBuse\fR Entry point for paste.deploy for the proxy server. This is the reference to the installed python egg. This is normally \fBegg:swift#proxy\fR. -.IP \fBset log_name\fR +.IP "\fBset log_name\fR" Label used when logging. The default is proxy-server. -.IP \fBset log_facility\fR +.IP "\fBset log_facility\fR" Syslog log facility. The default is LOG_LOCAL0. -.IP \fB set log_level\fR +.IP "\fBset log_level\fR" Logging level. The default is INFO. -.IP \fB set log_address\fR +.IP "\fBset log_address\fR" Logging address. The default is /dev/log. .IP \fBlog_handoffs\fR Log when handoff locations are used. Default is True. @@ -500,15 +1003,24 @@ Cache timeout in seconds to send memcached for account existence. The default is .IP \fBrecheck_container_existence\fR Cache timeout in seconds to send memcached for container existence. The default is 60 seconds. .IP \fBobject_chunk_size\fR -Chunk size to read from object servers. The default is 8192. +Chunk size to read from object servers. The default is 65536. .IP \fBclient_chunk_size\fR -Chunk size to read from clients. The default is 8192. +Chunk size to read from clients. The default is 65536. .IP \fBnode_timeout\fR Request timeout to external services. The default is 10 seconds. -.IP \fBclient_timeoutt\fR -Timeout to read one chunk from a client. The default is 60 seconds. +.IP \fBrecoverable_node_timeout\fR +How long the proxy server will wait for an initial response and to read a +chunk of data from the object servers while serving GET / HEAD requests. 
+Timeouts from these requests can be recovered from so setting this to +something lower than node_timeout would provide quicker error recovery +while allowing for a longer timeout for non-recoverable requests (PUTs). +Defaults to node_timeout, should be overridden if node_timeout is set to a +high number to prevent client timeouts from firing before the proxy server +has a chance to retry. .IP \fBconn_timeout\fR Connection timeout to external services. The default is 0.5 seconds. +.IP \fBpost_quorum_timeout\fR +How long to wait for requests to finish after a quorum has been established. The default is 0.5 seconds. .IP \fBerror_suppression_interval\fR Time in seconds that must elapse since the last error for a node to be considered no longer error limited. The default is 60 seconds. @@ -518,35 +1030,115 @@ Error count to consider a node error limited. The default is 10. Whether account PUTs and DELETEs are even callable. If set to 'true' any authorized user may create and delete accounts; if 'false' no one, even authorized, can. The default is false. -.IP \fBobject_post_as_copy\fR -Set object_post_as_copy = false to turn on fast posts where only the metadata changes -are stored as new and the original data file is kept in place. This makes for quicker -posts; but since the container metadata isn't updated in this mode, features like -container sync won't be able to sync posts. The default is True. .IP \fBaccount_autocreate\fR If set to 'true' authorized accounts that do not yet exist within the Swift cluster will be automatically created. The default is set to false. +.IP \fBmax_containers_per_account\fR +If set to a positive value, trying to create a container when the account +already has at least this maximum containers will result in a 403 Forbidden. +Note: This is a soft limit, meaning a user might exceed the cap for +recheck_account_existence before the 403s kick in. +.IP \fBmax_containers_whitelist\fR +This is a comma separated list of account hashes that ignore the max_containers_per_account cap. +.IP \fBdeny_host_headers\fR +Comma separated list of Host headers to which the proxy will deny requests. The default is empty. +.IP \fBsorting_method\fR +Storage nodes can be chosen at random (shuffle - default), by using timing +measurements (timing), or by using an explicit match (affinity). +Using timing measurements may allow for lower overall latency, while +using affinity allows for finer control. In both the timing and +affinity cases, equally-sorting nodes are still randomly chosen to +spread load. +The valid values for sorting_method are "affinity", "shuffle", and "timing". +.IP \fBtiming_expiry\fR +If the "timing" sorting_method is used, the timings will only be valid for +the number of seconds configured by timing_expiry. The default is 300. +.IP \fBconcurrent_gets\fR +If "on" then use replica count number of threads concurrently during a GET/HEAD +and return with the first successful response. In the EC case, this parameter +only affects an EC HEAD as an EC GET behaves differently. Default is "off". +.IP \fBconcurrency_timeout\fR +This parameter controls how long to wait before firing off the next +concurrent_get thread. A value of 0 would we fully concurrent, any other number +will stagger the firing of the threads. This number should be between 0 and +node_timeout. The default is the value of conn_timeout (0.5). +.IP \fBrequest_node_count\fR +Set to the number of nodes to contact for a normal request. 
You can use '* replicas' +at the end to have it use the number given times the number of +replicas for the ring being used for the request. The default is '2 * replicas'. +.IP \fBread_affinity\fR +Specifies which backend servers to prefer on reads. Format is a comma +separated list of affinity descriptors of the form <selection>=<priority>. +The <selection> may be r<N> for selecting nodes in region N or r<N>z<M> for +selecting nodes in region N, zone M. The <priority> value should be a whole +number that represents the priority to be given to the selection; lower numbers +are higher priority. Default is empty, meaning no preference. + +Example: first read from region 1 zone 1, then region 1 zone 2, then anything +in region 2, then everything else: + +.PD 0 +.RS 10 +.IP "read_affinity = r1z1=100, r1z2=200, r2=300" +.RE +.PD +.IP \fBwrite_affinity\fR +Specifies which backend servers to prefer on writes. Format is a comma +separated list of affinity descriptors of the form r<N> for region N or +r<N>z<M> for region N, zone M. If this is set, then when handling an object +PUT request, some number (see setting write_affinity_node_count) of local +backend servers will be tried before any nonlocal ones. Default is empty, +meaning no preference. + +Example: try to write to regions 1 and 2 before writing to any other +nodes: + +.PD 0 +.RS 10 +write_affinity = r1, r2 +.RE +.PD +.IP \fBwrite_affinity_node_count\fR +The number of local (as governed by the write_affinity setting) nodes to +attempt to contact first on writes, before any non-local ones. The value +should be an integer number, or use '* replicas' at the end to have it use +the number given times the number of replicas for the ring being used for the +request. The default is '2 * replicas'. +.IP \fBswift_owner_headers\fR +These are the headers whose values will only be shown to swift_owners. The +exact definition of a swift_owner is up to the auth system in use, but +usually indicates administrative responsibilities. +The default is 'x-container-read, x-container-write, x-container-sync-key, x-container-sync-to, x-account-meta-temp-url-key, x-account-meta-temp-url-key-2, x-container-meta-temp-url-key, x-container-meta-temp-url-key-2, x-account-access-control'. .IP \fBrate_limit_after_segment\fR Start rate-limiting object segments after the Nth segment of a segmented object. The default is 10 segments. .IP \fBrate_limit_segments_per_sec\fR Once segment rate-limiting kicks in for an object, limit segments served to N per second. The default is 1. +.IP \fBnice_priority\fR +Modify scheduling priority of server processes. Niceness values range from -20 +(most favorable to the process) to 19 (least favorable to the process). +The default does not modify priority. +.IP \fBionice_class\fR +Modify I/O scheduling class of server processes. I/O niceness class values +are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and IOPRIO_CLASS_IDLE (idle). +The default does not modify class and priority. +Works only with ionice_priority. +.IP \fBionice_priority\fR +Modify I/O scheduling priority of server processes. I/O niceness priority +is a number which goes from 0 to 7. The higher the value, the lower +the I/O priority of the process. Works only with ionice_class. +Ignored if IOPRIO_CLASS_IDLE is set.
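Putting the sorting and affinity options together, a sketch of a two-region setup (region numbers and priorities are illustrative); read_affinity is only consulted when sorting_method is set to affinity:

    [app:proxy-server]
    use = egg:swift#proxy
    sorting_method = affinity
    # prefer region 1 for reads, then region 2, then everything else
    read_affinity = r1=100, r2=200
    # try region 1 nodes first on writes
    write_affinity = r1
    write_affinity_node_count = 2 * replicas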
.RE .PD - - .SH DOCUMENTATION .LP More in depth documentation about the swift-proxy-server and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html and -.BI http://swift.openstack.org - +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" -.BR swift-proxy-server(1), - - +.BR swift-proxy-server(1) diff --git a/doc/manpages/swift-account-audit.1 b/doc/manpages/swift-account-audit.1 new file mode 100644 index 0000000000..64d60c584b --- /dev/null +++ b/doc/manpages/swift-account-audit.1 @@ -0,0 +1,63 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-ACCOUNT-AUDIT "1" "August 2016" "OpenStack Swift" +.SH NAME +swift\-account\-audit \- manually audit OpenStack Swift accounts + +.SH SYNOPSIS +.PP +.B swift\-account\-audit\/ +\fI[options]\fR \fI[url 1]\fR \fI[url 2]\fR \fI...\fR + +.SH DESCRIPTION +.PP +The swift-account-audit cli tool can be used to audit the data for an account. +It crawls the account, checking that all containers and objects can be found. + +You can also feed a list of URLs to the script through stdin. + +.SH OPTIONS +.TP +\fB\-c\fR \fIconcurrency\fR +Set the concurrency, default 50 +.TP +\fB\-r\fR \fIring dir\fR +Ring locations, default \fI/etc/swift\fR +.TP +\fB\-e\fR \fIfilename\fR +File for writing a list of inconsistent URLs +.TP +\fB\-d\fR +Also download files and verify md5 + +.SH EXAMPLES +.nf +/usr/bin/swift\-account\-audit\/ AUTH_88ad0b83\-b2c5\-4fa1\-b2d6\-60c597202076 +/usr/bin/swift\-account\-audit\/ AUTH_88ad0b83\-b2c5\-4fa1\-b2d6\-60c597202076/container/object +/usr/bin/swift\-account\-audit\/ \fB\-e\fR errors.txt AUTH_88ad0b83\-b2c5\-4fa1\-b2d6\-60c597202076/container +/usr/bin/swift\-account\-audit\/ < errors.txt +/usr/bin/swift\-account\-audit\/ \fB\-c\fR 25 \fB\-d\fR < errors.txt +.fi + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-account\-audit +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-account-auditor.1 b/doc/manpages/swift-account-auditor.1 index 86e60fbb4c..9983ac61de 100644 --- a/doc/manpages/swift-account-auditor.1 +++ b/doc/manpages/swift-account-auditor.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,24 +14,24 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-account-auditor 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-account-auditor -\- Openstack-swift account auditor +.B swift-account-auditor +\- OpenStack Swift account auditor .SH SYNOPSIS .LP -.B swift-account-auditor +.B swift-account-auditor [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The account auditor crawls the local account system checking the integrity of accounts -objects. If corruption is found (in the case of bit rot, for example), the file is +The account auditor crawls the local account system checking the integrity of accounts +objects. If corruption is found (in the case of bit rot, for example), the file is quarantined, and replication will replace the bad file from another replica. The options are as follows: @@ -46,17 +46,17 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - + .SH DOCUMENTATION .LP -More in depth documentation in regards to -.BI swift-account-auditor -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +More in depth documentation in regards to +.BI swift-account-auditor +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" .BR account-server.conf(5) diff --git a/doc/manpages/swift-account-info.1 b/doc/manpages/swift-account-info.1 new file mode 100644 index 0000000000..41ab188214 --- /dev/null +++ b/doc/manpages/swift-account-info.1 @@ -0,0 +1,69 @@ +.\" +.\" Author: Madhuri Kumari +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH swift-account-info 1 "10/25/2016" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B swift-account-info +\- OpenStack Swift account-info tool + +.SH SYNOPSIS +.LP +.B swift-account-info + [options] + +.SH DESCRIPTION +.PP +This is a very simple swift tool that allows a swiftop engineer to retrieve +information about an account that is located on the storage node. One calls +the tool with a given db file as it is stored on the storage node system. 
+It will then return several information about that account such as; + +.PD 0 +.IP "- Account" +.IP "- Account hash " +.IP "- Created timestamp " +.IP "- Put timestamp " +.IP "- Delete timestamp " +.IP "- Container Count " +.IP "- Object count " +.IP "- Bytes used " +.IP "- Chexor " +.IP "- ID" +.IP "- User Metadata " +.IP "- Ring Location" +.PD + +.SH OPTIONS +.TP +\fB\-h, --help \fR +Shows the help message and exit +.TP +\fB\-d SWIFT_DIR, --swift-dir=SWIFT_DIR\fR +Pass location of swift configuration file if different from the default +location /etc/swift + +.SH DOCUMENTATION +.LP +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ + +.SH "SEE ALSO" + +.BR swift-container-info(1), +.BR swift-get-nodes(1), +.BR swift-object-info(1) diff --git a/doc/manpages/swift-account-reaper.1 b/doc/manpages/swift-account-reaper.1 index 4dc1f72929..225aee34a3 100644 --- a/doc/manpages/swift-account-reaper.1 +++ b/doc/manpages/swift-account-reaper.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,24 +14,24 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-account-reaper 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-account-reaper -\- Openstack-swift account reaper +\- OpenStack Swift account reaper .SH SYNOPSIS .LP -.B swift-account-reaper +.B swift-account-reaper [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP Removes data from status=DELETED accounts. These are accounts that have been asked to be removed by the reseller via services remove_storage_account -XMLRPC call. +XMLRPC call. .PP The account is not deleted immediately by the services call, but instead the account is simply marked for deletion by setting the status column in @@ -51,18 +51,18 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - + .SH DOCUMENTATION .LP -More in depth documentation in regards to -.BI swift-object-auditor -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +More in depth documentation in regards to +.BI swift-object-auditor +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-account-replicator.1 b/doc/manpages/swift-account-replicator.1 index 7df81ed8ff..896f2eb307 100644 --- a/doc/manpages/swift-account-replicator.1 +++ b/doc/manpages/swift-account-replicator.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,31 +14,31 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-account-replicator 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-account-replicator -\- Openstack-swift account replicator +.B swift-account-replicator +\- OpenStack Swift account replicator .SH SYNOPSIS .LP -.B swift-account-replicator +.B swift-account-replicator [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -Replication is designed to keep the system in a consistent state in the face of -temporary error conditions like network outages or drive failures. The replication -processes compare local data with each remote copy to ensure they all contain the -latest version. Account replication uses a combination of hashes and shared high +Replication is designed to keep the system in a consistent state in the face of +temporary error conditions like network outages or drive failures. The replication +processes compare local data with each remote copy to ensure they all contain the +latest version. Account replication uses a combination of hashes and shared high water marks to quickly compare subsections of each partition. .PP -Replication updates are push based. Account replication push missing records over +Replication updates are push based. Account replication push missing records over HTTP or rsync whole database files. The replicator also ensures that data is removed -from the system. When an account item is deleted a tombstone is set as the latest -version of the item. The replicator will see the tombstone and ensure that the item +from the system. When an account item is deleted a tombstone is set as the latest +version of the item. The replicator will see the tombstone and ensure that the item is removed from the entire system. The options are as follows: @@ -53,18 +53,18 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE -.PD +.PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-account-replicator -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-account-server.1 b/doc/manpages/swift-account-server.1 index 21ba86f7a9..1d4b2992e1 100644 --- a/doc/manpages/swift-account-server.1 +++ b/doc/manpages/swift-account-server.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,33 +14,33 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-account-server 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-account-server -\- Openstack-swift account server +\- OpenStack Swift account server .SH SYNOPSIS .LP .B swift-account-server [CONFIG] [-h|--help] [-v|--verbose] -.SH DESCRIPTION +.SH DESCRIPTION .PP The Account Server's primary job is to handle listings of containers. The listings are stored as sqlite database files, and replicated across the cluster similar to how -objects are. +objects are. 
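A typical invocation, assuming the conventional configuration path (adjust to your deployment):

    swift-account-server /etc/swift/account-server.conf
    swift-account-server /etc/swift/account-server.conf -v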
.SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-account-server -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html -and -.BI http://docs.openstack.org +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org .SH "SEE ALSO" diff --git a/doc/manpages/swift-config.1 b/doc/manpages/swift-config.1 new file mode 100644 index 0000000000..d1ad1af8c6 --- /dev/null +++ b/doc/manpages/swift-config.1 @@ -0,0 +1,51 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-CONFIG "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-config \- OpenStack Swift config parser + +.SH SYNOPSIS +.B swift\-config +[\fIoptions\fR] \fISERVER\fR + +.SH DESCRIPTION +.PP +Combine Swift configuration files and print result. + +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-c\fR \fIN\fR, \fB\-\-config\-num\fR=\fIN\fR +Parse config for the \fIN\fRth server only +.TP +\fB\-s\fR \fISECTION\fR, \fB\-\-section\fR=\fISECTION\fR +Only display matching sections +.TP +\fB\-w\fR, \fB\-\-wsgi\fR +Use wsgi/paste parser instead of readconf + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-config +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-container-auditor.1 b/doc/manpages/swift-container-auditor.1 index 2718f043d5..eae68fc006 100644 --- a/doc/manpages/swift-container-auditor.1 +++ b/doc/manpages/swift-container-auditor.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,24 +14,24 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-container-auditor 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-container-auditor -\- Openstack-swift container auditor +.B swift-container-auditor +\- OpenStack Swift container auditor .SH SYNOPSIS .LP -.B swift-container-auditor +.B swift-container-auditor [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The container auditor crawls the local container system checking the integrity of container -objects. If corruption is found (in the case of bit rot, for example), the file is +The container auditor crawls the local container system checking the integrity of container +objects. 
If corruption is found (in the case of bit rot, for example), the file is quarantined, and replication will replace the bad file from another replica. The options are as follows: @@ -46,18 +46,18 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to -.BI swift-container-auditor -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +More in depth documentation in regards to +.BI swift-container-auditor +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-container-info.1 b/doc/manpages/swift-container-info.1 new file mode 100644 index 0000000000..ddacd7cf17 --- /dev/null +++ b/doc/manpages/swift-container-info.1 @@ -0,0 +1,74 @@ +.\" +.\" Author: Madhuri Kumari +.\" Copyright (c) 2010-2011 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH swift-container-info 1 "10/25/2016" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B swift-container-info +\- OpenStack Swift container-info tool + +.SH SYNOPSIS +.LP +.B swift-container-info + [options] + +.SH DESCRIPTION +.PP +This is a very simple swift tool that allows a swiftop engineer to retrieve +information about a container that is located on the storage node. +One calls the tool with a given container db file as +it is stored on the storage node system. +It will then return several information about that container such as; + +.PD 0 +.IP "- Account it belongs to" +.IP "- Container " +.IP "- Created timestamp " +.IP "- Put timestamp " +.IP "- Delete timestamp " +.IP "- Object count " +.IP "- Bytes used " +.IP "- Reported put timestamp " +.IP "- Reported delete timestamp " +.IP "- Reported object count " +.IP "- Reported bytes used " +.IP "- Hash " +.IP "- ID " +.IP "- User metadata " +.IP "- X-Container-Sync-Point 1 " +.IP "- X-Container-Sync-Point 2 " +.IP "- Location on the ring " +.PD + +.SH OPTIONS +.TP +\fB\-h, --help \fR +Shows the help message and exit +.TP +\fB\-d SWIFT_DIR, --swift-dir=SWIFT_DIR\fR +Pass location of swift configuration file if different from the default +location /etc/swift + +.SH DOCUMENTATION +.LP +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ + +.SH "SEE ALSO" +.BR swift-get-nodes(1), +.BR swift-object-info(1) diff --git a/doc/manpages/swift-container-reconciler.1 b/doc/manpages/swift-container-reconciler.1 new file mode 100644 index 0000000000..eaee41bbfb --- /dev/null +++ b/doc/manpages/swift-container-reconciler.1 @@ -0,0 +1,58 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. 
+.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-CONTAINER-RECONCILER "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-container\-reconciler \- OpenStack Swift container reconciler + +.SH SYNOPSIS +.B swift\-container\-reconciler +\fICONFIG \fR[\fIoptions\fR] + +.SH DESCRIPTION +.PP +This daemon will take objects that are in the wrong storage policy and +move them to the right ones, or delete requests that went to the wrong +storage policy and apply them to the right ones. It operates on a +queue similar to the object-expirer's queue. + +Discovering that the object is in the wrong policy is done in the container +replicator; the container reconciler is the daemon that handles them once they +happen. + +Like the object expirer, you only need to run one of these per cluster + +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Log to console +.TP +\fB\-o\fR, \fB\-\-once\fR +Only run one pass of daemon +.PP + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-container\-reconciler +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-container-replicator.1 b/doc/manpages/swift-container-replicator.1 index 0469c78ff2..b0103f1633 100644 --- a/doc/manpages/swift-container-replicator.1 +++ b/doc/manpages/swift-container-replicator.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,31 +14,31 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-container-replicator 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-container-replicator -\- Openstack-swift container replicator +.B swift-container-replicator +\- OpenStack Swift container replicator .SH SYNOPSIS .LP -.B swift-container-replicator +.B swift-container-replicator [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -Replication is designed to keep the system in a consistent state in the face of -temporary error conditions like network outages or drive failures. The replication -processes compare local data with each remote copy to ensure they all contain the -latest version. Container replication uses a combination of hashes and shared high +Replication is designed to keep the system in a consistent state in the face of +temporary error conditions like network outages or drive failures. The replication +processes compare local data with each remote copy to ensure they all contain the +latest version. Container replication uses a combination of hashes and shared high water marks to quickly compare subsections of each partition. .PP -Replication updates are push based. 
Container replication push missing records over +Replication updates are push based. Container replication push missing records over HTTP or rsync whole database files. The replicator also ensures that data is removed -from the system. When an container item is deleted a tombstone is set as the latest -version of the item. The replicator will see the tombstone and ensure that the item +from the system. When an container item is deleted a tombstone is set as the latest +version of the item. The replicator will see the tombstone and ensure that the item is removed from the entire system. The options are as follows: @@ -53,18 +53,18 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-container-replicator -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-container-server.1 b/doc/manpages/swift-container-server.1 index 20cc5aebe4..0271bdac69 100644 --- a/doc/manpages/swift-container-server.1 +++ b/doc/manpages/swift-container-server.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,37 +14,37 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-container-server 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-container-server -\- Openstack-swift container server +\- OpenStack Swift container server .SH SYNOPSIS .LP .B swift-container-server [CONFIG] [-h|--help] [-v|--verbose] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The Container Server's primary job is to handle listings of objects. It doesn't know -where those objects are, just what objects are in a specific container. The listings -are stored as sqlite database files, and replicated across the cluster similar to how -objects are. Statistics are also tracked that include the total number of objects, and +The Container Server's primary job is to handle listings of objects. It doesn't know +where those objects are, just what objects are in a specific container. The listings +are stored as sqlite database files, and replicated across the cluster similar to how +objects are. Statistics are also tracked that include the total number of objects, and total storage usage for that container. 
.SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-container-server -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html -and -.BI http://docs.openstack.org +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org -.LP +.LP .SH "SEE ALSO" .BR container-server.conf(5) diff --git a/doc/manpages/swift-container-sync.1 b/doc/manpages/swift-container-sync.1 index 11669ec1b3..1c1616faa5 100644 --- a/doc/manpages/swift-container-sync.1 +++ b/doc/manpages/swift-container-sync.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,25 +14,25 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-container-sync 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-container-sync -\- Openstack-swift container sync +\- OpenStack Swift container sync .SH SYNOPSIS .LP .B swift-container-sync [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP Swift has a feature where all the contents of a container can be mirrored to another container through background synchronization. Swift cluster operators configure their cluster to allow/accept sync requests to/from other clusters, -and the user specifies where to sync their container to along with a secret +and the user specifies where to sync their container to along with a secret synchronization key. .PP The swift-container-sync does the job of sending updates to the remote container. @@ -42,14 +42,14 @@ newer rows since the last sync will trigger PUTs or DELETEs to the other contain .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-container-sync -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/overview_container_sync.html -and -.BI http://docs.openstack.org +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/overview_container_sync.html +and +.BI https://docs.openstack.org -.LP +.LP .SH "SEE ALSO" .BR container-server.conf(5) diff --git a/doc/manpages/swift-container-updater.1 b/doc/manpages/swift-container-updater.1 index 9ec709ce70..c69b533629 100644 --- a/doc/manpages/swift-container-updater.1 +++ b/doc/manpages/swift-container-updater.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,31 +14,31 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-container-updater 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-container-updater -\- Openstack-swift container updater +\- OpenStack Swift container updater .SH SYNOPSIS .LP -.B swift-container-updater +.B swift-container-updater [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The container updater is responsible for updating container information in the account database. +The container updater is responsible for updating container information in the account database. It will walk the container path in the system looking for container DBs and sending updates -to the account server as needed as it goes along. +to the account server as needed as it goes along. -There are times when account data can not be immediately updated. This usually occurs -during failure scenarios or periods of high load. This is where an eventual consistency -window will most likely come in to play. +There are times when account data can not be immediately updated. This usually occurs +during failure scenarios or periods of high load. This is where an eventual consistency +window will most likely come in to play. -In practice, the consistency window is only as large as the frequency at which -the updater runs and may not even be noticed as the proxy server will route +In practice, the consistency window is only as large as the frequency at which +the updater runs and may not even be noticed as the proxy server will route listing requests to the first account server which responds. The server under load may not be the one that serves subsequent listing requests – one of the other two replicas may handle the listing. @@ -55,17 +55,17 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - + .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-container-updater -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-dispersion-populate.1 b/doc/manpages/swift-dispersion-populate.1 index 27188cef1b..6584bfb211 100644 --- a/doc/manpages/swift-dispersion-populate.1 +++ b/doc/manpages/swift-dispersion-populate.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,26 +14,26 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-dispersion-populate 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-dispersion-populate -\- Openstack-swift dispersion populate +\- OpenStack Swift dispersion populate .SH SYNOPSIS .LP -.B swift-dispersion-populate +.B swift-dispersion-populate [--container-suffix-start] [--object-suffix-start] [--container-only|--object-only] [--insecure] [conf_file] -.SH DESCRIPTION +.SH DESCRIPTION .PP This is one of the swift-dispersion utilities that is used to evaluate the -overall cluster health. This is accomplished by checking if a set of +overall cluster health. 
This is accomplished by checking if a set of deliberately distributed containers and objects are currently in their proper places within the cluster. -.PP +.PP For instance, a common deployment has three replicas of each object. The health of that object can be measured by checking if each replica is in its proper place. If only 2 of the 3 is in place the object's health @@ -48,17 +48,36 @@ we need to run the \fBswift-dispersion-report\fR tool to check the health of eac of these containers and objects. .PP -These tools need direct access to the entire cluster and to the ring files. -Installing them on a proxy server will probably do or a box used for swift -administration purposes that also contains the common swift packages and ring. -Both \fBswift-dispersion-populate\fR and \fBswift-dispersion-report\fR use the +These tools need direct access to the entire cluster and to the ring files. +Installing them on a proxy server will probably do or a box used for swift +administration purposes that also contains the common swift packages and ring. +Both \fBswift-dispersion-populate\fR and \fBswift-dispersion-report\fR use the same configuration file, /etc/swift/dispersion.conf . The account used by these tool should be a dedicated account for the dispersion stats and also have admin -privileges. +privileges. + +.SH OPTIONS +.RS 0 +.PD 1 +.IP "\fB--insecure\fR" +Allow accessing insecure keystone server. The keystone's certificate will not +be verified. +.IP "\fB--container-suffix-start=NUMBER\fR" +Start container suffix at NUMBER and resume population at this point; default: 0 +.IP "\fB--object-suffix-start=NUMBER\fR" +Start object suffix at NUMBER and resume population at this point; default: 0 +.IP "\fB--object-only\fR" +Only run object population +.IP "\fB--container-only\fR" +Only run container population +.IP "\fB--no-overlap\fR" +Increase coverage by amount in dispersion_coverage option with no overlap of existing partitions (if run more than once) +.IP "\fB-P, --policy-name\fR" +Specify storage policy name .SH CONFIGURATION -.PD 0 -Example \fI/etc/swift/dispersion.conf\fR: +.PD 0 +Example \fI/etc/swift/dispersion.conf\fR: .RS 3 .IP "[dispersion]" @@ -66,30 +85,37 @@ Example \fI/etc/swift/dispersion.conf\fR: .IP "auth_user = dpstats:dpstats" .IP "auth_key = dpstats" .IP "swift_dir = /etc/swift" -.IP "# dispersion_coverage = 1" +.IP "# project_name = dpstats" +.IP "# project_domain_name = default" +.IP "# user_domain_name = default" +.IP "# dispersion_coverage = 1.0" .IP "# retries = 5" .IP "# concurrency = 25" +.IP "# endpoint_type = publicURL" .RE -.PD - .SH EXAMPLE -.PP +.PD + +.SH EXAMPLE +.PP .PD 0 $ swift-dispersion-populate .RS 1 .IP "Created 2621 containers for dispersion reporting, 38s, 0 retries" .IP "Created 2621 objects for dispersion reporting, 27s, 0 retries" -.RE +.RE + .PD - + + .SH DOCUMENTATION .LP More in depth documentation about the swift-dispersion utilities and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html#cluster-health -and -.BI http://swift.openstack.org +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html#dispersion-report +and +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" .BR swift-dispersion-report(1), -.BR dispersion.conf (5) +.BR dispersion.conf(5) diff --git a/doc/manpages/swift-dispersion-report.1 b/doc/manpages/swift-dispersion-report.1 index 2b0ffcd796..f789720616 100644 --- a/doc/manpages/swift-dispersion-report.1 +++ 
b/doc/manpages/swift-dispersion-report.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,45 +14,45 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-dispersion-report 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-dispersion-report -\- Openstack-swift dispersion report +\- OpenStack Swift dispersion report .SH SYNOPSIS .LP -.B swift-dispersion-report [-d|--debug] [-j|--dump-json] [-p|--partitions] [--container-only|--object-only] [conf_file] +.B swift-dispersion-report [-d|--debug] [-j|--dump-json] [-p|--partitions] [--container-only|--object-only] [--insecure] [conf_file] -.SH DESCRIPTION +.SH DESCRIPTION .PP This is one of the swift-dispersion utilities that is used to evaluate the -overall cluster health. This is accomplished by checking if a set of +overall cluster health. This is accomplished by checking if a set of deliberately distributed containers and objects are currently in their proper places within the cluster. -.PP +.PP For instance, a common deployment has three replicas of each object. The health of that object can be measured by checking if each replica is in its proper place. If only 2 of the 3 is in place the object's health can be said to be at 66.66%, where 100% would be perfect. .PP -Once the \fBswift-dispersion-populate\fR has been used to populate the -dispersion account, one should run the \fBswift-dispersion-report\fR tool +Once the \fBswift-dispersion-populate\fR has been used to populate the +dispersion account, one should run the \fBswift-dispersion-report\fR tool repeatedly for the life of the cluster, in order to check the health of each of these containers and objects. .PP -These tools need direct access to the entire cluster and to the ring files. -Installing them on a proxy server will probably do or a box used for swift -administration purposes that also contains the common swift packages and ring. -Both \fBswift-dispersion-populate\fR and \fBswift-dispersion-report\fR use the +These tools need direct access to the entire cluster and to the ring files. +Installing them on a proxy server will probably do or a box used for swift +administration purposes that also contains the common swift packages and ring. +Both \fBswift-dispersion-populate\fR and \fBswift-dispersion-report\fR use the same configuration file, /etc/swift/dispersion.conf . The account used by these tool should be a dedicated account for the dispersion stats and also have admin -privileges. +privileges. .SH OPTIONS .RS 0 @@ -60,33 +60,28 @@ privileges. .IP "\fB-d, --debug\fR" output any 404 responses to standard error -.SH OPTIONS -.RS 0 -.PD 1 .IP "\fB-j, --dump-json\fR" output dispersion report in json format -.SH OPTIONS -.RS 0 -.PD 1 .IP "\fB-p, --partitions\fR" output the partition numbers that have any missing replicas -.SH OPTIONS -.RS 0 -.PD 1 .IP "\fB--container-only\fR" Only run the container report -.SH OPTIONS -.RS 0 -.PD 1 .IP "\fB--object-only\fR" Only run the object report +.IP "\fB--insecure\fR" +Allow accessing insecure keystone server. The keystone's certificate will not +be verified. 
+ +.IP "\fB-P, --policy-name\fR" +Specify storage policy name + .SH CONFIGURATION -.PD 0 -Example \fI/etc/swift/dispersion.conf\fR: +.PD 0 +Example \fI/etc/swift/dispersion.conf\fR: .RS 3 .IP "[dispersion]" @@ -94,17 +89,23 @@ Example \fI/etc/swift/dispersion.conf\fR: .IP "auth_user = dpstats:dpstats" .IP "auth_key = dpstats" .IP "swift_dir = /etc/swift" -.IP "# dispersion_coverage = 1" +.IP "# project_name = dpstats" +.IP "# project_domain_name = default" +.IP "# user_domain_name = default" +.IP "# dispersion_coverage = 1.0" .IP "# retries = 5" .IP "# concurrency = 25" .IP "# dump_json = no" +.IP "# endpoint_type = publicURL" .RE -.PD - .SH EXAMPLE -.PP +.PD + +.SH EXAMPLE +.PP .PD 0 -$ swift-dispersion-report - +$ swift-dispersion-report + + .RS 1 .IP "Queried 2622 containers for dispersion reporting, 31s, 0 retries" .IP "100.00% of container copies found (7866 of 7866)" @@ -113,18 +114,20 @@ $ swift-dispersion-report .IP "Queried 2621 objects for dispersion reporting, 22s, 0 retries" .IP "100.00% of object copies found (7863 of 7863)" .IP "Sample represents 1.00% of the object partition space" -.RE +.RE + .PD - + + .SH DOCUMENTATION .LP More in depth documentation about the swift-dispersion utilities and -also Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/admin_guide.html#cluster-health -and -.BI http://swift.openstack.org +also OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html#dispersion-report +and +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" .BR swift-dispersion-populate(1), -.BR dispersion.conf (5) +.BR dispersion.conf(5) diff --git a/doc/manpages/swift-drive-audit.1 b/doc/manpages/swift-drive-audit.1 new file mode 100644 index 0000000000..cf8902061b --- /dev/null +++ b/doc/manpages/swift-drive-audit.1 @@ -0,0 +1,38 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-DRIVE-AUDIT "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-drive\-audit \- OpenStack Swift drive audit cron job + +.SH SYNOPSIS +.B swift\-drive\-audit +\fICONFIG\fR + +.SH DESCRIPTION +.PP +Tool that can be run by using cron to watch for bad drives. If errors are +detected, it unmounts the bad drive, so that Swift can work around it. + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-drive\-audit +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-form-signature.1 b/doc/manpages/swift-form-signature.1 new file mode 100644 index 0000000000..2ce640b90c --- /dev/null +++ b/doc/manpages/swift-form-signature.1 @@ -0,0 +1,67 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. 
+.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-FORM-SIGNATURE "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-form\-signature \- compute the expires and signature for OpenStack Swift Form POST middleware + +.SH SYNOPSIS +.B swift\-form\-signature +\fIpath\fR \fIredirect\fR \fImax_file_size\fR \fImax_file_count\fR +\fIseconds\fR \fIkey\fR + +.SH DESCRIPTION +.PP +Tool to compute expires and signature values which can be used to upload +objects directly to the Swift from a browser by using the form POST middleware. + +.SH OPTIONS +.TP +.I path +The prefix to use for form uploaded +objects. For example: +\fI/v1/account/container/object_prefix_\fP would +ensure all form uploads have that path +prepended to the browser\-given file name. +.TP +.I redirect +The URL to redirect the browser to after +the uploads have completed. +.TP +.I max_file_size +The maximum file size per file uploaded. +.TP +.I max_file_count +The maximum number of uploaded files +allowed. +.TP +.I seconds +The number of seconds from now to allow +the form post to begin. +.TP +.I key +The X\-Account\-Meta\-Temp\-URL\-Key for the +account. + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-form\-signature +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-get-nodes.1 b/doc/manpages/swift-get-nodes.1 index d9409c58f5..73d1475ed2 100644 --- a/doc/manpages/swift-get-nodes.1 +++ b/doc/manpages/swift-get-nodes.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,33 +14,61 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" -.TH swift-get-nodes 1 "8/26/2011" "Linux" "OpenStack Swift" +.\" +.TH swift-get-nodes 1 "10/25/2016" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-get-nodes -\- Openstack-swift get-nodes tool +\- OpenStack Swift get-nodes tool .SH SYNOPSIS .LP -.B swift-get-nodes -\ [] [] - -.SH DESCRIPTION +.B swift-get-nodes +\ [options] [ []] + +Or + +.B swift-get-nodes +[options] -p + +Or + +.B swift-get-nodes +\ [options] -P policy_name + +.SH DESCRIPTION .PP The swift-get-nodes tool can be used to find out the location where -a particular account, container or object item is located within the -swift cluster nodes. For example, if you have the account hash and a container -name that belongs to that account, you can use swift-get-nodes to lookup +a particular account, container or object item is located within the +swift cluster nodes. For example, if you have the account hash and a container +name that belongs to that account, you can use swift-get-nodes to lookup where the container resides by using the container ring. 
+.SH OPTIONS +.TP +\fB\-h --help \fR +Shows the help message and exit +.TP +\fB\-a, --all\fR +Show all handoff nodes +.TP +\fB\-p PARTITION, --partition=PARTITION\fR +Show nodes for a given partition +.TP +\fB\-P POLICY_NAME, --policy-name=POLICY_NAME \fR +Specify storage policy name +.TP +\fB\-d SWIFT_DIR, --swift-dir=SWIFT_DIR\fR +Pass location of swift configuration file if different from the default +location /etc/swift + .RS 0 .IP "\fIExample:\fR" .RE .RS 4 -.PD 0 +.PD 0 .IP "$ swift-get-nodes /etc/swift/account.ring.gz MyAccount-12ac01446be2" .PD 0 @@ -51,28 +79,35 @@ where the container resides by using the container ring. .IP "Partition 221082" .IP "Hash d7e6ba68cfdce0f0e4ca7890e46cacce" -.IP "Server:Port Device 172.24.24.29:6002 sdd" -.IP "Server:Port Device 172.24.24.27:6002 sdr" -.IP "Server:Port Device 172.24.24.32:6002 sde" -.IP "Server:Port Device 172.24.24.26:6002 sdv [Handoff]" - -.IP "curl -I -XHEAD http://172.24.24.29:6002/sdd/221082/MyAccount-12ac01446be2" -.IP "curl -I -XHEAD http://172.24.24.27:6002/sdr/221082/MyAccount-12ac01446be2" -.IP "curl -I -XHEAD http://172.24.24.32:6002/sde/221082/MyAccount-12ac01446be2" -.IP "curl -I -XHEAD http://172.24.24.26:6002/sdv/221082/MyAccount-12ac01446be2 # [Handoff]" +.IP "Server:Port Device 172.24.24.29:6202 sdd" +.IP "Server:Port Device 172.24.24.27:6202 sdr" +.IP "Server:Port Device 172.24.24.32:6202 sde" +.IP "Server:Port Device 172.24.24.26:6202 sdv [Handoff]" + + +.IP "curl -I -XHEAD http://172.24.24.29:6202/sdd/221082/MyAccount-12ac01446be2" +.IP "curl -I -XHEAD http://172.24.24.27:6202/sdr/221082/MyAccount-12ac01446be2" +.IP "curl -I -XHEAD http://172.24.24.32:6202/sde/221082/MyAccount-12ac01446be2" +.IP "curl -I -XHEAD http://172.24.24.26:6202/sdv/221082/MyAccount-12ac01446be2 # [Handoff]" .IP "ssh 172.24.24.29 ls -lah /srv/node/sdd/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/ " -.IP "ssh 172.24.24.27 ls -lah /srv/node/sdr/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/" .IP "ssh 172.24.24.32 ls -lah /srv/node/sde/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/" .IP "ssh 172.24.24.26 ls -lah /srv/node/sdv/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/ # [Handoff] " -.PD -.RE +.IP "ssh 172.24.24.27 ls -lah /srv/node/sdr/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/" +.IP "ssh 172.24.24.32 ls -lah /srv/node/sde/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/" +.IP "ssh 172.24.24.26 ls -lah /srv/node/sdv/accounts/221082/cce/d7e6ba68cfdce0f0e4ca7890e46cacce/ # [Handoff] " + +.PD +.RE .SH DOCUMENTATION .LP -More documentation about Openstack-Swift can be found at -.BI http://swift.openstack.org/index.html +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" + +.BR swift-account-info(1), +.BR swift-container-info(1), .BR swift-object-info(1), .BR swift-ring-builder(1) diff --git a/doc/manpages/swift-init.1 b/doc/manpages/swift-init.1 index 0d5431bfa0..c056e04fea 100644 --- a/doc/manpages/swift-init.1 +++ b/doc/manpages/swift-init.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,25 +14,25 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-init 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-init -\- Openstack-swift swift-init tool +\- OpenStack Swift swift-init tool .SH SYNOPSIS .LP .B swift-init [ ...] [options] - -.SH DESCRIPTION + +.SH DESCRIPTION .PP The swift-init tool can be used to initialize all swift daemons available as part of -openstack-swift. Instead of calling individual init scripts for each -swift daemon, one can just use swift-init. With swift-init you can initialize -just one swift service, such as the "proxy", or a combination of them. The tool also +OpenStack Swift. Instead of calling individual init scripts for each +swift daemon, one can just use swift-init. With swift-init you can initialize +just one swift service, such as the "proxy", or a combination of them. The tool also allows one to use the keywords such as "all", "main" and "rest" for the argument. @@ -41,17 +41,17 @@ allows one to use the keywords such as "all", "main" and "rest" for the .PD 0 .RS 4 .IP "\fIproxy\fR" "4" -.IP " - Initializes the swift proxy daemon" +.IP " - Initializes the swift proxy daemon" .RE .RS 4 .IP "\fIobject\fR, \fIobject-replicator\fR, \fIobject-auditor\fR, \fIobject-updater\fR" -.IP " - Initialize the swift object daemons above" +.IP " - Initializes the swift object daemons above" .RE .RS 4 .IP "\fIcontainer\fR, \fIcontainer-update\fR, \fIcontainer-replicator\fR, \fIcontainer-auditor\fR" -.IP " - Initialize the swift container daemons above" +.IP " - Initializes the swift container daemons above" .RE .RS 4 @@ -66,16 +66,16 @@ allows one to use the keywords such as "all", "main" and "rest" for the .RS 4 .IP "\fImain\fR" -.IP " - Initializes all the \fBmain\fR swift daemons " +.IP " - Initializes all the \fBmain\fR swift daemons" .IP " (proxy, container, account and object servers)" .RE .RS 4 .IP "\fIrest\fR" -.IP " - Initializes all the other \fBswift background daemons\fR such as" -.IP " (updater, replicator, auditor, reaper)" +.IP " - Initializes all the other \fBswift background daemons\fR" +.IP " (updater, replicator, auditor, reaper, etc)" .RE -.PD +.PD \fBCommands:\fR @@ -87,19 +87,20 @@ allows one to use the keywords such as "all", "main" and "rest" for the .IP "\fIno-wait\fR: \t\t\t spawn server and return immediately" .IP "\fIonce\fR: \t\t\t start server and run one pass on supporting daemons" .IP "\fIreload\fR: \t\t\t graceful shutdown then restart on supporting servers" +.IP "\fIreload-seamless\fR: \t\t reload supporting servers with no downtime" .IP "\fIrestart\fR: \t\t\t stops then restarts server" .IP "\fIshutdown\fR: \t\t allow current requests to finish on supporting servers" .IP "\fIstart\fR: \t\t\t starts a server" .IP "\fIstatus\fR: \t\t\t display status of tracked pids for server" .IP "\fIstop\fR: \t\t\t stops a server" -.PD +.PD .RE \fBOptions:\fR .RS 4 -.PD 0 +.PD 0 .IP "-h, --help \t\t\t show this help message and exit" .IP "-v, --verbose \t\t\t display verbose output" .IP "-w, --no-wait \t\t\t won't wait for server to start before returning @@ -107,14 +108,17 @@ allows one to use the keywords such as "all", "main" and "rest" for the .IP "-n, --no-daemon \t\t start server interactively .IP "-g, --graceful \t\t send SIGHUP to supporting servers .IP "-c N, --config-num=N \t send command to the Nth server only -.PD +.IP "-k N, --kill-wait=N \t wait N seconds for processes to die (default 15) +.IP "-r RUN_DIR, --run-dir=RUN_DIR directory where the pids will be stored (default /var/run/swift) +.IP "--strict return non-zero status code if some config is 
missing. Default mode if server is explicitly named." +.IP "--non-strict return zero status code even if some config is missing. Default mode if server is one of aliases `all`, `main` or `rest`." +.IP "--kill-after-timeout kill daemon and all children after kill-wait period." +.PD .RE - - -.SH DOCUMENTATION -.LP -More documentation about Openstack-Swift can be found at -.BI http://swift.openstack.org/index.html +.SH DOCUMENTATION +.LP +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ diff --git a/doc/manpages/swift-object-auditor.1 b/doc/manpages/swift-object-auditor.1 index 5be922676c..999acc1825 100644 --- a/doc/manpages/swift-object-auditor.1 +++ b/doc/manpages/swift-object-auditor.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,23 +14,23 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-object-auditor 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-object-auditor -\- Openstack-swift object auditor +.B swift-object-auditor +\- OpenStack Swift object auditor .SH SYNOPSIS .LP -.B swift-object-auditor +.B swift-object-auditor [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] [-z|--zero_byte_fps] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The object auditor crawls the local object system checking the integrity of objects. -If corruption is found (in the case of bit rot, for example), the file is +The object auditor crawls the local object system checking the integrity of objects. +If corruption is found (in the case of bit rot, for example), the file is quarantined, and replication will replace the bad file from another replica. The options are as follows: @@ -46,7 +46,7 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .IP "-z ZERO_BYTE_FPS" @@ -56,14 +56,14 @@ The options are as follows: .RE .PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to -.BI swift-object-auditor -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +More in depth documentation in regards to +.BI swift-object-auditor +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-object-expirer.1 b/doc/manpages/swift-object-expirer.1 index 24e1cbb05c..0615f9f99f 100644 --- a/doc/manpages/swift-object-expirer.1 +++ b/doc/manpages/swift-object-expirer.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2012 OpenStack, LLC. +.\" Copyright (c) 2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,31 +14,31 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-object-expirer 1 "3/15/2012" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-object-expirer -\- Openstack-swift object expirer +\- OpenStack Swift object expirer .SH SYNOPSIS .LP -.B swift-object-expirer +.B swift-object-expirer [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The swift-object-expirer offers scheduled deletion of objects. The Swift client would -use the X-Delete-At or X-Delete-After headers during an object PUT or POST and the -cluster would automatically quit serving that object at the specified time and would +The swift-object-expirer offers scheduled deletion of objects. The Swift client would +use the X-Delete-At or X-Delete-After headers during an object PUT or POST and the +cluster would automatically quit serving that object at the specified time and would shortly thereafter remove the object from the system. -The X-Delete-At header takes a Unix Epoch timestamp, in integer form; for example: +The X-Delete-At header takes a Unix Epoch timestamp, in integer form; for example: 1317070737 represents Mon Sep 26 20:58:57 2011 UTC. -The X-Delete-After header takes a integer number of seconds. The proxy server -that receives the request will convert this header into an X-Delete-At header +The X-Delete-After header takes an integer number of seconds. The proxy server +that receives the request will convert this header into an X-Delete-At header using its current time plus the value given. The options are as follows: @@ -53,22 +53,23 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE .PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-object-expirer -can be foud at -.BI http://swift.openstack.org/overview_expiring_objects.html -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +can be found at +.BI https://docs.openstack.org/swift/latest/overview_expiring_objects.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" +.BR object-server.conf(5) .BR object-expirer.conf(5) diff --git a/doc/manpages/swift-object-info.1 b/doc/manpages/swift-object-info.1 index fe38638b03..be7e6e51f6 100644 --- a/doc/manpages/swift-object-info.1 +++ b/doc/manpages/swift-object-info.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,42 +14,60 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" -.TH swift-object-info 1 "8/26/2011" "Linux" "OpenStack Swift" +.\" +.TH swift-object-info 1 "10/25/2016" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-object-info -\- Openstack-swift object-info tool +\- OpenStack Swift object-info tool .SH SYNOPSIS .LP .B swift-object-info -[OBJECT_FILE] + [options] -.SH DESCRIPTION +.SH DESCRIPTION .PP -This is a very simple swift tool that allows a swiftop engineer to retrieve -information about an object that is located on the storage node. One calls -the tool with a given object file as it is stored on the storage node system. 
-It will then return several information about that object such as; +This is a very simple swift tool that allows a swiftop engineer to retrieve +information about an object that is located on the storage node. One calls +the tool with a given object file as it is stored on the storage node system. +It will then return several information about that object such as; .PD 0 -.IP "- Account it belongs to" +.IP "- Account it belongs to" .IP "- Container " .IP "- Object hash " -.IP "- Location on the ring " .IP "- Content Type " .IP "- timestamp " .IP "- Etag " .IP "- Content Length " .IP "- User Metadata " -.PD - +.IP "- Location on the ring " +.PD + +.SH OPTIONS +.TP +\fB\-h --help \fR +Shows the help message and exit +.TP +\fB\-n, --no-check-etag\fR +Don't verify file contents against stored etag +.TP +\fB\-d SWIFT_DIR, --swift-dir=SWIFT_DIR\fR +Pass location of swift configuration file if different from the default +location /etc/swift +.TP +\fB\-P POLICY_NAME, --policy-name=POLICY_NAME \fR +Specify storage policy name + .SH DOCUMENTATION .LP -More documentation about Openstack-Swift can be found at -.BI http://swift.openstack.org/index.html +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" -.BR swift-get-nodes(1), + +.BR swift-account-info(1), +.BR swift-container-info(1), +.BR swift-get-nodes(1) diff --git a/doc/manpages/swift-object-reconstructor.1 b/doc/manpages/swift-object-reconstructor.1 new file mode 100644 index 0000000000..3877d8eaf1 --- /dev/null +++ b/doc/manpages/swift-object-reconstructor.1 @@ -0,0 +1,61 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-OBJECT-RECONSTRUCTOR "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-object\-reconstructor \- OpenStack Swift EC object reconstructor + +.SH SYNOPSIS +.B swift\-object\-reconstructor +\fICONFIG \fR[\fIoptions\fR] + +.SH DESCRIPTION +.PP +Daemon for reconstruction of EC objects. Once a pair of nodes has +determined the need to replace a missing object fragment, instead of +pushing over a copy like replication would do, the reconstructor has to +read in enough surviving fragments from other nodes and perform a local +reconstruction before it has the correct data to push to the other node. + +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-d\fR \fIDEVICES\fR, \fB\-\-devices\fR=\fIDEVICES\fR +Reconstruct only given devices. Comma\-separated list. Only has effect if +\-\-once is used. +.TP +\fB\-p\fR \fIPARTITIONS\fR, \fB\-\-partitions\fR=\fIPARTITIONS\fR +Reconstruct only given partitions. Comma\-separated +list. Only has effect if \-\-once is used. 
+.TP +\fB\-v\fR, \fB\-\-verbose\fR +Log to console +.TP +\fB\-o\fR, \fB\-\-once\fR +Only run one pass of daemon +.PP + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-object\-reconstructor +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-object-relinker.1 b/doc/manpages/swift-object-relinker.1 new file mode 100644 index 0000000000..8b42a6fc5e --- /dev/null +++ b/doc/manpages/swift-object-relinker.1 @@ -0,0 +1,75 @@ +.\" +.\" Copyright (c) 2017 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-OBJECT-RELINKER "1" "December 2017" "OpenStack Swift" + +.SH NAME +\fBswift\-object\-relinker\fR \- relink and cleanup objects to increase partition power +.SH SYNOPSIS +.B swift\-object\-relinker +[\fIoptions\fR] <\fIcommand\fR> + +.SH DESCRIPTION +.PP +The relinker prepares an object server's filesystem for a partition power +change by crawling the filesystem and linking existing objects to future +partition directories. + +More information can be found at +.BI https://docs.openstack.org/swift/latest/ring_partpower.html + +.SH COMMANDS +.TP +\fBrelink\fR +Relink files for partition power increase. + +.TP +\fBcleanup\fR +Remove hard links in the old locations. + +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit + +.TP +\fB\-\-swift-dir\fR \fISWIFT_DIR\fR +Path to swift directory + +.TP +\fB\-\-devices\fR \fIDEVICES\fR +Path to swift device directory + +.TP +\fB\-\-skip\-mount\-check\fR +Don't test if disk is mounted + +.TP +\fB\-\-logfile\fR \fILOGFILE\fR +Set log file name + +.TP +\fB\-\-debug\fR +Enable debug mode + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-object\-relinker +and also about OpenStack Swift as a whole can be found at +.BI http://docs.openstack.org/developer/swift/index.html +and +.BI http://docs.openstack.org diff --git a/doc/manpages/swift-object-replicator.1 b/doc/manpages/swift-object-replicator.1 index 71c0383aa2..62eae6990b 100644 --- a/doc/manpages/swift-object-replicator.1 +++ b/doc/manpages/swift-object-replicator.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,57 +14,64 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
-.\" +.\" .TH swift-object-replicator 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-object-replicator -\- Openstack-swift object replicator +.B swift-object-replicator +\- OpenStack Swift object replicator .SH SYNOPSIS .LP -.B swift-object-replicator +.B swift-object-replicator [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -Replication is designed to keep the system in a consistent state in the face of -temporary error conditions like network outages or drive failures. The replication -processes compare local data with each remote copy to ensure they all contain the -latest version. Object replication uses a hash list to quickly compare subsections +Replication is designed to keep the system in a consistent state in the face of +temporary error conditions like network outages or drive failures. The replication +processes compare local data with each remote copy to ensure they all contain the +latest version. Object replication uses a hash list to quickly compare subsections of each partition. .PP -Replication updates are push based. For object replication, updating is just a matter +Replication updates are push based. For object replication, updating is just a matter of rsyncing files to the peer. The replicator also ensures that data is removed -from the system. When an object item is deleted a tombstone is set as the latest -version of the item. The replicator will see the tombstone and ensure that the item +from the system. When an object item is deleted a tombstone is set as the latest +version of the item. The replicator will see the tombstone and ensure that the item is removed from the entire system. -The options are as follows: +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-d\fR \fIDEVICES\fR, \fB\-\-devices\fR=\fIDEVICES\fR +Replicate only given devices. Comma\-separated list. Only has effect if +\-\-once is used. +.TP +\fB\-p\fR \fIPARTITIONS\fR, \fB\-\-partitions\fR=\fIPARTITIONS\fR +Replicate only given partitions. Comma\-separated +list. Only has effect if \-\-once is used. +.TP +\fB\-i\fR \fIPOLICIES\fR, \fB\-\-policies\fR=\fIPOLICIES\fR +Replicate only given policy indices. Comma\-separated list. Only has effect if +\-\-once is used. +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Log to console +.TP +\fB\-o\fR, \fB\-\-once\fR +Only run one pass of daemon +.PP + -.RS 4 -.PD 0 -.IP "-v" -.IP "--verbose" -.RS 4 -.IP "log to console" -.RE -.IP "-o" -.IP "--once" -.RS 4 -.IP "only run one pass of daemon" -.RE -.PD -.RE - - .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-object-replicator -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-object-server.1 b/doc/manpages/swift-object-server.1 index e3d0d34693..76632a9313 100644 --- a/doc/manpages/swift-object-server.1 +++ b/doc/manpages/swift-object-server.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,40 +14,40 @@ .\" implied. 
.\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-object-server 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-object-server -\- Openstack-swift object server. +\- OpenStack Swift object server. .SH SYNOPSIS .LP .B swift-object-server [CONFIG] [-h|--help] [-v|--verbose] -.SH DESCRIPTION +.SH DESCRIPTION .PP The Object Server is a very simple blob storage server that can store, retrieve -and delete objects stored on local devices. Objects are stored as binary files +and delete objects stored on local devices. Objects are stored as binary files on the filesystem with metadata stored in the file's extended attributes (xattrs). -This requires that the underlying filesystem choice for object servers support -xattrs on files. Some filesystems, like ext3, have xattrs turned off by default. +This requires that the underlying filesystem choice for object servers support +xattrs on files. Some filesystems, like ext3, have xattrs turned off by default. Each object is stored using a path derived from the object name's hash and the operation's timestamp. Last write always wins, and ensures that the latest object version will be served. A deletion is also treated as a version of the file (a 0 byte file ending with -".ts", which stands for tombstone). This ensures that deleted files are replicated +".ts", which stands for tombstone). This ensures that deleted files are replicated correctly and older versions don't magically reappear due to failure scenarios. .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-object-server -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html -and -.BI http://docs.openstack.org +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org .SH "SEE ALSO" diff --git a/doc/manpages/swift-object-updater.1 b/doc/manpages/swift-object-updater.1 index 254b3499ba..da175674e2 100644 --- a/doc/manpages/swift-object-updater.1 +++ b/doc/manpages/swift-object-updater.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2012 OpenStack, LLC. +.\" Copyright (c) 2010-2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,36 +14,36 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-object-updater 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-object-updater -\- Openstack-swift object updater +\- OpenStack Swift object updater .SH SYNOPSIS .LP .B swift-object-updater [CONFIG] [-h|--help] [-v|--verbose] [-o|--once] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The object updater is responsible for updating object information in container listings. -It will check to see if there are any locally queued updates on the filesystem of each -devices, what is also known as async pending file(s), walk each one and update the +The object updater is responsible for updating object information in container listings. +It will check to see if there are any locally queued updates on the filesystem of each +devices, what is also known as async pending file(s), walk each one and update the container listing. 
-For example, suppose a container server is under load and a new object is put -into the system. The object will be immediately available for reads as soon as -the proxy server responds to the client with success. However, the object -server has not been able to update the object listing in the container server. -Therefore, the update would be queued locally for a later update. Container listings, +For example, suppose a container server is under load and a new object is put +into the system. The object will be immediately available for reads as soon as +the proxy server responds to the client with success. However, the object +server has not been able to update the object listing in the container server. +Therefore, the update would be queued locally for a later update. Container listings, therefore, may not immediately contain the object. This is where an eventual consistency -window will most likely come in to play. +window will most likely come in to play. -In practice, the consistency window is only as large as the frequency at which -the updater runs and may not even be noticed as the proxy server will route +In practice, the consistency window is only as large as the frequency at which +the updater runs and may not even be noticed as the proxy server will route listing requests to the first container server which responds. The server under load may not be the one that serves subsequent listing requests – one of the other two replicas may handle the listing. @@ -60,18 +60,18 @@ The options are as follows: .IP "-o" .IP "--once" .RS 4 -.IP "only run one pass of daemon" +.IP "only run one pass of daemon" .RE -.PD +.PD .RE - - + + .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-object-updater -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-oldies.1 b/doc/manpages/swift-oldies.1 new file mode 100644 index 0000000000..4266f42df9 --- /dev/null +++ b/doc/manpages/swift-oldies.1 @@ -0,0 +1,69 @@ +.\" +.\" Author: Paul Dardeau +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH swift-oldies 1 "8/04/2016" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B swift-oldies +\- OpenStack Swift oldies tool + +.SH SYNOPSIS +.LP +.B swift-oldies +[-h|--help] [-a|--age] + + +.SH DESCRIPTION +.PP +Lists Swift processes that have been running more than a specific length of +time (in hours). This is done by scanning the list of currently executing +processes (via ps command) and examining the execution time of those python +processes whose program names begin with 'swift-'. 
+ +Example (see all Swift processes older than two days): +swift-oldies \-a 48 + +The options are as follows: + +.RS 4 +.PD 0 +.IP "-a HOURS" +.IP "--age=HOURS" +.RS 4 +.IP "Look for processes at least HOURS old; default: 720 (30 days)" +.RE +.PD 0 + +.IP "-h" +.IP "--help" +.RS 4 +.IP "Display program help and exit" +.PD +.RE + + +.SH DOCUMENTATION +.LP +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ + + +.SH "SEE ALSO" + +.BR swift-orphans(1) + diff --git a/doc/manpages/swift-orphans.1 b/doc/manpages/swift-orphans.1 index b4b6158bdc..e39513b614 100644 --- a/doc/manpages/swift-orphans.1 +++ b/doc/manpages/swift-orphans.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2012 OpenStack, LLC. +.\" Copyright (c) 2012 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,30 +14,31 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-orphans 1 "3/15/2012" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-orphans -\- Openstack-swift orphans tool +\- OpenStack Swift orphans tool .SH SYNOPSIS .LP -.B swift-orphans -[-h|--help] [-a|--age] [-k|--kill] [-w|--wide] +.B swift-orphans +[-h|--help] [-a|--age] [-k|--kill] [-w|--wide] [-r|--run-dir] -.SH DESCRIPTION +.SH DESCRIPTION .PP Lists and optionally kills orphaned Swift processes. This is done by scanning -/var/run/swift for .pid files and listing any processes that look like Swift -processes but aren't associated with the pids in those .pid files. Any Swift -processes running with the 'once' parameter are ignored, as those are usually -for full-speed audit scans and such. +/var/run/swift or the directory specified to the \-r switch for .pid files and +listing any processes that look like Swift processes but aren't associated with +the pids in those .pid files. Any Swift processes running with the 'once' +parameter are ignored, as those are usually for full-speed audit scans and +such. -Example (sends SIGTERM to all orphaned Swift processes older than two hours): -swift-orphans -a 2 -k TERM +Example (sends SIGTERM to all orphaned Swift processes older than two hours): +swift-orphans \-a 2 \-k TERM The options are as follows: @@ -61,9 +62,9 @@ The options are as follows: .PD .RE - + .SH DOCUMENTATION .LP -More documentation about Openstack-Swift can be found at -.BI http://swift.openstack.org/index.html +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ diff --git a/doc/manpages/swift-proxy-server.1 b/doc/manpages/swift-proxy-server.1 index 2a4b6c36f4..790d3fda05 100644 --- a/doc/manpages/swift-proxy-server.1 +++ b/doc/manpages/swift-proxy-server.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,36 +14,36 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-proxy-server 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP -.B swift-proxy-server -\- Openstack-swift proxy server. +.B swift-proxy-server +\- OpenStack Swift proxy server. 
.SH SYNOPSIS .LP .B swift-proxy-server [CONFIG] [-h|--help] [-v|--verbose] -.SH DESCRIPTION +.SH DESCRIPTION .PP -The Swift Proxy Server is responsible for tying together the rest of the Swift architecture. -For each request, it will look up the location of the account, container, or object in the -ring and route the request accordingly. The public API is also exposed through the Proxy -Server. A large number of failures are also handled in the Proxy Server. For example, +The Swift Proxy Server is responsible for tying together the rest of the Swift architecture. +For each request, it will look up the location of the account, container, or object in the +ring and route the request accordingly. The public API is also exposed through the Proxy +Server. A large number of failures are also handled in the Proxy Server. For example, if a server is unavailable for an object PUT, it will ask the ring for a handoff server and route there instead. When objects are streamed to or from an object server, they are -streamed directly through the proxy server to or from the user the proxy server does +streamed directly through the proxy server to or from the user the proxy server does not spool them. .SH DOCUMENTATION .LP -More in depth documentation in regards to +More in depth documentation in regards to .BI swift-proxy-server -and also about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org/index.html +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ .SH "SEE ALSO" diff --git a/doc/manpages/swift-recon-cron.1 b/doc/manpages/swift-recon-cron.1 new file mode 100644 index 0000000000..8ebbcb2c29 --- /dev/null +++ b/doc/manpages/swift-recon-cron.1 @@ -0,0 +1,38 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-RECON-CRON "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-recon\-cron \- OpenStack Swift recon cron job + +.SH SYNOPSIS +.B swift\-recon\-cron +\fI\fR + +.SH DESCRIPTION +.PP +Tool that can be run by using cron to fill recon cache. Recon data +can be read by \fBswift-recon\fR tool. + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-recon\-cron +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-recon.1 b/doc/manpages/swift-recon.1 index a9745c28da..dc5a19d002 100644 --- a/doc/manpages/swift-recon.1 +++ b/doc/manpages/swift-recon.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,26 +14,26 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. 
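swift-recon-cron is typically driven from cron; the entry below mirrors the crontab line shown in the swift-recon(1) EXAMPLE section further on:

    */5 * * * * swift /usr/bin/swift-recon-cron /etc/swift/object-server.conf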
-.\" +.\" .TH swift-recon 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-recon -\- Openstack-swift recon middleware cli tool +\- OpenStack Swift recon middleware cli tool .SH SYNOPSIS .LP -.B swift-recon -\ [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] - -.SH DESCRIPTION +.B swift-recon +\ [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + +.SH DESCRIPTION .PP The swift-recon cli tool can be used to retrieve various metrics and telemetry information about -a cluster that has been collected by the swift-recon middleware. +a cluster that has been collected by the swift-recon middleware. -In order to make use of the swift-recon middleware, update the object-server.conf file and -enable the recon middleware by adding a pipeline entry and setting its option(s). You can view +In order to make use of the swift-recon middleware, update the object-server.conf file and +enable the recon middleware by adding a pipeline entry and setting its option(s). You can view more information in the example section below. @@ -58,24 +58,48 @@ Get updater stats Get expirer stats .IP "\fB-r, --replication\fR" Get replication stats +.IP "\fB-R, --reconstruction\fR" +Get reconstruction stats .IP "\fB-u, --unmounted\fR" Check cluster for unmounted devices .IP "\fB-d, --diskusage\fR" Get disk usage stats +.IP "\fB--top=COUNT\fR" +Also show the top COUNT entries in rank order +.IP "\fB--lowest=COUNT\fR" +Also show the lowest COUNT entries in rank order +.IP "\fB--human-readable\fR" +Use human readable suffix for disk usage stats .IP "\fB-l, --loadstats\fR" Get cluster load average stats .IP "\fB-q, --quarantined\fR" Get cluster quarantine stats +.IP "\fB--validate-servers\fR" +Validate servers on the ring .IP "\fB--md5\fR" -Get md5sum of servers ring and compare to local cop +Get md5sum of servers ring and compare to local copy +.IP "\fB--sockstat\fR" +Get cluster socket usage stats +.IP "\fB--driveaudit\fR" +Get drive audit error stats +.IP "\fB-T, --time\fR" +Check time synchronization +.IP "\fB--swift-versions\fR" +Check swift version .IP "\fB--all\fR" -Perform all checks. Equivalent to -arudlq --md5 +Perform all checks. Equivalent to \-arudlqT +\-\-md5 \-\-sockstat \-\-auditor \-\-updater \-\-expirer +\-\-driveaudit \-\-validate\-servers \-\-swift-versions +.IP "\fB--region=REGION\fR" +Only query servers in specified region .IP "\fB-z ZONE, --zone=ZONE\fR" Only query servers in specified zone +.IP "\fB-t SECONDS, --timeout=SECONDS\fR" +Time to wait for a response from a server .IP "\fB--swiftdir=PATH\fR" Default = /etc/swift .PD -.RE +.RE @@ -84,16 +108,16 @@ Default = /etc/swift .PD 0 .RS 0 .IP "ubuntu:~$ swift-recon -q --zone 3" -.IP "===============================================================================" +.IP "=================================================================" .IP "[2011-10-18 19:36:00] Checking quarantine dirs on 1 hosts... 
" .IP "[Quarantined objects] low: 4, high: 4, avg: 4, total: 4 " .IP "[Quarantined accounts] low: 0, high: 0, avg: 0, total: 0 " .IP "[Quarantined containers] low: 0, high: 0, avg: 0, total: 0 " -.IP "===============================================================================" +.IP "=================================================================" .RE .RS 0 -Finally if you also wish to track asynchronous pending’s you will need to setup a +Finally if you also wish to track asynchronous pending's you will need to setup a cronjob to run the swift-recon-cron script periodically: .IP "*/5 * * * * swift /usr/bin/swift-recon-cron /etc/swift/object-server.conf" @@ -104,10 +128,10 @@ cronjob to run the swift-recon-cron script periodically: .SH DOCUMENTATION .LP -More documentation about Openstack-Swift can be found at -.BI http://swift.openstack.org/index.html -Also more specific documentation about swift-recon can be found at -.BI http://swift.openstack.org/admin_guide.html#cluster-telemetry-and-monitoring +More documentation about OpenStack Swift can be found at +.BI https://docs.openstack.org/swift/latest/ +Also more specific documentation about swift-recon can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html\#cluster-telemetry-and-monitoring diff --git a/doc/manpages/swift-reconciler-enqueue.1 b/doc/manpages/swift-reconciler-enqueue.1 new file mode 100644 index 0000000000..fe9c3db1fc --- /dev/null +++ b/doc/manpages/swift-reconciler-enqueue.1 @@ -0,0 +1,58 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-RECONCILER-ENQUEUE "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-reconciler\-enqueue \- OpenStack Swift reconciler enqueue +.SH SYNOPSIS +.B swift\-reconciler\-enqueue +\fIpolicy_index\fR \fI/a/c/o\fR \fItimestamp\fR \fR[\fIoptions\fR] + +.SH DESCRIPTION +.PP +This script enqueues an object to be evaluated by the reconciler. + +.SH OPTIONS +.TP +\fIpolicy_index\fR +The policy the object is currently stored in. +.TP +\fI/a/c/o\fR +The full path of the object \- UTF\-8 +.TP +\fItimestamp\fR +The timestamp of the datafile/tombstone. + +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-X\fR \fIOP\fR, \fB\-\-op\fR=\fIOP\fR +The method of the misplaced operation +.TP +\fB\-f\fR, \fB\-\-force\fR +Force an object to be re\-enqueued +.PP + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-reconciler\-enqueue +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-ring-builder-analyzer.1 b/doc/manpages/swift-ring-builder-analyzer.1 new file mode 100644 index 0000000000..6ced40416e --- /dev/null +++ b/doc/manpages/swift-ring-builder-analyzer.1 @@ -0,0 +1,52 @@ +.\" +.\" Copyright (c) 2016 OpenStack Foundation. 
+.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. +.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH SWIFT-RING-BUILDER-ANALYZER "1" "August 2016" "OpenStack Swift" + +.SH NAME +swift\-ring\-builder\-analyzer \- put the OpenStack Swift ring builder through its paces +.SH SYNOPSIS +.B swift\-ring\-builder\-analyzer +[\fIoptions\fR] \fIscenario_path\fR + +.SH DESCRIPTION +.PP +This is a tool to help developers quantify changes to the ring +builder. It takes a scenario (JSON file) describing the builder's +basic parameters (part_power, replicas, etc.) and a number of +"rounds", where each round is a set of operations to perform on the +builder. For each round, the operations are applied, and then the +builder is rebalanced until it reaches a steady state. + +.SH OPTIONS +.TP +.I scenario_path +Path to the scenario file +.TP +\fB\-h\fR, \fB\-\-help\fR +Show this help message and exit +.TP +\fB\-\-check\fR, \fB\-c\fR +Just check the scenario, don't execute it. + +.SH DOCUMENTATION +.LP +More in depth documentation in regards to +.BI swift\-ring\-builder\-analyzer +and also about OpenStack Swift as a whole can be found at +.BI https://docs.openstack.org/swift/latest/ +and +.BI https://docs.openstack.org diff --git a/doc/manpages/swift-ring-builder.1 b/doc/manpages/swift-ring-builder.1 index c8433e8ddf..ac6bb7a1af 100644 --- a/doc/manpages/swift-ring-builder.1 +++ b/doc/manpages/swift-ring-builder.1 @@ -1,6 +1,6 @@ .\" .\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. +.\" Copyright (c) 2010-2011 OpenStack Foundation. .\" .\" Licensed under the Apache License, Version 2.0 (the "License"); .\" you may not use this file except in compliance with the License. @@ -14,26 +14,26 @@ .\" implied. .\" See the License for the specific language governing permissions and .\" limitations under the License. -.\" +.\" .TH swift-ring-builder 1 "8/26/2011" "Linux" "OpenStack Swift" -.SH NAME +.SH NAME .LP .B swift-ring-builder -\- Openstack-swift ring builder +\- OpenStack Swift ring builder .SH SYNOPSIS .LP .B swift-ring-builder <...> -.SH DESCRIPTION +.SH DESCRIPTION .PP -The swift-ring-builder utility is used to create, search and manipulate -the swift storage ring. The ring-builder assigns partitions to devices and +The swift-ring-builder utility is used to create, search and manipulate +the swift storage ring. The ring-builder assigns partitions to devices and writes an optimized Python structure to a gzipped, pickled file on disk for -shipping out to the servers. The server processes just check the modification -time of the file occasionally and reload their in-memory copies of the ring +shipping out to the servers. The server processes just check the modification +time of the file occasionally and reload their in-memory copies of the ring structure as needed. Because of how the ring-builder manages changes to the ring, using a slightly older ring usually just means one of the three replicas for a subset of the partitions will be incorrect, which can be easily worked around. 
@@ -48,15 +48,23 @@ partitions will end up assigned to different devices, and therefore nearly all data stored will have to be replicated to new locations. So, recovery from a builder file loss is possible, but data will definitely be unreachable for an extended time. +.PP +If invoked as 'swift-ring-builder-safe' the directory containing the builder +file provided will be locked (via a .lock file in the files parent directory). +This provides a basic safe guard against multiple instances of the swift-ring-builder +(or other utilities that observe this lock) from attempting to write to or read +the builder/ring files while operations are in progress. This can be useful in +environments where ring management has been automated but the operator still +needs to interact with the rings manually. .SH SEARCH -.PD 0 +.PD 0 .IP "\fB\fR" .RS 5 .IP "Can be of the form:" -.IP "dz-:/_" +.IP "drz-:/_" .IP "Any part is optional, but you must include at least one, examples:" @@ -65,6 +73,7 @@ extended time. .IP "z1 Matches devices in zone 1" .IP "z1-1.2.3.4 Matches devices in zone 1 with the ip 1.2.3.4" .IP "1.2.3.4 Matches devices in any zone with the ip 1.2.3.4" +.IP "r1z1:5678 Matches devices in zone 1 present in region 1 using port 5678" .IP "z1:5678 Matches devices in zone 1 using port 5678" .IP ":5678 Matches devices that use port 5678" .IP "/sdb1 Matches devices with the device name sdb1" @@ -73,12 +82,12 @@ extended time. .IP "[::1] Matches devices in any zone with the ip ::1" .IP "z1-[::1]:5678 Matches devices in zone 1 with ip ::1 and port 5678" .RE - + Most specific example: .RS 3 -d74z1-1.2.3.4:5678/sdb1_"snet: 5.6.7.8" -.RE +d74z1-1.2.3.4:5678/sdb1_"snet: 5.6.7.8" +.RE Nerd explanation: @@ -86,17 +95,22 @@ Nerd explanation: .IP "All items require their single character prefix except the ip, in which case the - is optional unless the device id or zone is also included." .RE .RE -.PD +.PD + +.SH OPTIONS +.TP +.I "\-y, \-\-yes" +Assume a yes response to all questions .SH COMMANDS -.PD 0 +.PD 0 .IP "\fB\fR" .RS 5 -Shows information about the ring and the devices within. +Shows information about the ring and the devices within. .RE @@ -106,17 +120,19 @@ Shows information about matching devices. .RE -.IP "\fBadd\fR z-:/_" +.IP "\fBadd\fR z-:/_ " +.IP "\fBadd\fR rz-:/_ " +.IP "\fBadd\fR -r -z -i -p -d -m -w " .RS 5 -Adds a device to the ring with the given information. No partitions will be -assigned to the new device until after running 'rebalance'. This is so you +Adds a device to the ring with the given information. No partitions will be +assigned to the new device until after running 'rebalance'. This is so you can make multiple device changes and rebalance them all just once. .RE .IP "\fBcreate\fR " .RS 5 -Creates with 2^ partitions and . +Creates with 2^ partitions and . is number of hours to restrict moving a partition more than once. .RE @@ -128,11 +144,11 @@ the devices matching the search values given. The first column is the assigned partition number and the second column is the number of device matches for that partition. The list is ordered from most number of matches to least. If there are a lot of devices to match against, this command -could take a while to run. +could take a while to run. .RE -.IP "\fBrebalence\fR" +.IP "\fBrebalance\fR" .RS 5 Attempts to rebalance the ring by reassigning partitions that haven't been recently reassigned. 
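Putting the create, add and rebalance commands together, a minimal sketch of building a three-replica ring (the builder name, device address, port and weight are illustrative, not taken from this patch):

    swift-ring-builder object.builder create 10 3 1
    swift-ring-builder object.builder add r1z1-10.0.0.1:6200/sdb1 100
    swift-ring-builder object.builder rebalance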
.RE @@ -140,37 +156,37 @@ Attempts to rebalance the ring by reassigning partitions that haven't been recen .IP "\fBremove\fR " .RS 5 -Removes the device(s) from the ring. This should normally just be used for -a device that has failed. For a device you wish to decommission, it's best -to set its weight to 0, wait for it to drain all its data, then use this -remove command. This will not take effect until after running 'rebalance'. +Removes the device(s) from the ring. This should normally just be used for +a device that has failed. For a device you wish to decommission, it's best +to set its weight to 0, wait for it to drain all its data, then use this +remove command. This will not take effect until after running 'rebalance'. This is so you can make multiple device changes and rebalance them all just once. .RE .IP "\fBset_info\fR :/_" .RS 5 -Resets the device's information. This information isn't used to assign -partitions, so you can use 'write_ring' afterward to rewrite the current -ring with the newer device information. Any of the parts are optional -in the final :/_ parameter; just give what you -want to change. For instance set_info d74 _"snet: 5.6.7.8" would just +Resets the device's information. This information isn't used to assign +partitions, so you can use 'write_ring' afterward to rewrite the current +ring with the newer device information. Any of the parts are optional +in the final :/_ parameter; just give what you +want to change. For instance set_info d74 _"snet: 5.6.7.8" would just update the meta data for device id 74. .RE .IP "\fBset_min_part_hours\fR " .RS 5 -Changes the to the given . This should be set to -however long a full replication/update cycle takes. We're working on a way +Changes the to the given . This should be set to +however long a full replication/update cycle takes. We're working on a way to determine this more easily than scanning logs. .RE .IP "\fBset_weight\fR " .RS 5 -Resets the device's weight. No partitions will be reassigned to or from the -device until after running 'rebalance'. This is so you can make multiple +Resets the device's weight. No partitions will be reassigned to or from the +device until after running 'rebalance'. This is so you can make multiple device changes and rebalance them all just once. .RE @@ -183,8 +199,8 @@ Just runs the validation routines on the ring. .IP "\fBwrite_ring\fR" .RS 5 -Just rewrites the distributable ring file. This is done automatically after -a successful rebalance, so really this is only useful after one or more 'set_info' +Just rewrites the distributable ring file. This is done automatically after +a successful rebalance, so really this is only useful after one or more 'set_info' calls when no rebalance is needed but you want to send out the new device information. 
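Following the set_info example given in the text above, one might push out updated device metadata without a rebalance (the builder file name is assumed):

    swift-ring-builder object.builder set_info d74 _"snet: 5.6.7.8"
    swift-ring-builder object.builder write_ring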
.RE @@ -193,17 +209,16 @@ calls when no rebalance is needed but you want to send out the new device inform set_min_part_hours set_weight validate write_ring \fBExit codes:\fR 0 = ring changed, 1 = ring did not change, 2 = error -.PD +.PD - -.SH DOCUMENTATION -.LP -More in depth documentation about the swift ring and also Openstack-Swift as a -whole can be found at -.BI http://swift.openstack.org/overview_ring.html, -.BI http://swift.openstack.org/admin_guide.html#managing-the-rings -and -.BI http://swift.openstack.org +.SH DOCUMENTATION +.LP +More in depth documentation about the swift ring and also OpenStack Swift as a +whole can be found at +.BI https://docs.openstack.org/swift/latest/overview_ring.html +.BI https://docs.openstack.org/swift/latest/admin_guide.html#managing-the-rings +and +.BI https://docs.openstack.org/swift/latest/ diff --git a/doc/manpages/swift-ring-composer.1 b/doc/manpages/swift-ring-composer.1 new file mode 100644 index 0000000000..8d029ff334 --- /dev/null +++ b/doc/manpages/swift-ring-composer.1 @@ -0,0 +1,40 @@ +.TH swift-ring-composer "1" "June 2018" "Linux" "OpenStack Swift" +.SH NAME +.B swift-ring-composer +\- manual page for swift-ring-composer + +.SH SYNOPSIS +.LP +.B swift-ring-composer +[\-h] {show,compose} ... + +.SH DESCRIPTION +This is a tool for building a composite ring file from other existing ring +builder files. The component ring builders must all have the same partition +power. Each device must only be used in a single component builder. Each +region must only be used in a single component builder. +.PP +.B NOTE: +This tool is for experimental use and may be removed in future versions of Swift. +.PP +.SS "positional arguments:" +.TP + +Name of composite builder file +.SS "optional arguments:" +.TP +\fB\-h\fR, \fB\-\-help\fR +show this help message and exit +.SH "COMMANDS" +.TP +.SS "\fBshow\fR [-h]" +show composite ring builder metadata +.TP +.SS "\fBcompose\fR [-h] [ [ ...] --output [--force]" +compose composite ring +.PP +.SH DOCUMENTATION +.LP +More in depth documentation about the swift ring and also OpenStack Swift as a +whole can be found at +.BI https://swift.openstack.org diff --git a/doc/manpages/swift.1 b/doc/manpages/swift.1 deleted file mode 100644 index 9e86dd1601..0000000000 --- a/doc/manpages/swift.1 +++ /dev/null @@ -1,121 +0,0 @@ -.\" -.\" Author: Joao Marcelo Martins or -.\" Copyright (c) 2010-2011 OpenStack, LLC. -.\" -.\" Licensed under the Apache License, Version 2.0 (the "License"); -.\" you may not use this file except in compliance with the License. -.\" You may obtain a copy of the License at -.\" -.\" http://www.apache.org/licenses/LICENSE-2.0 -.\" -.\" Unless required by applicable law or agreed to in writing, software -.\" distributed under the License is distributed on an "AS IS" BASIS, -.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -.\" implied. -.\" See the License for the specific language governing permissions and -.\" limitations under the License. -.\" -.TH swift 1 "8/26/2011" "Linux" "OpenStack Swift" - -.SH NAME -.LP -.B swift -\- Openstack-swift swift tool - -.SH SYNOPSIS -.LP -.B swift -[options] [args] - -.SH DESCRIPTION -.PP -The swift tool is a command line interface script for communicating with -an openstack-swift object storage environment. It allows one to perform -several types of operations. - -.SH COMMANDS -.PP - -\fBstat\fR [\fIcontainer\fR] [\fIobject\fR] -.RS 4 -Displays information for the account, container, or object depending on the args given (if any). 
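Referring back to the swift-ring-composer compose usage above, a hypothetical composition of two component builders might look like (all file names are made up for illustration):

    swift-ring-composer composite.builder compose region1.builder region2.builder --output object.ring.gz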
-.RE - -\fBlist\fR [\fIcommand-options\fR] [\fIcontainer\fR] -.RS 4 -Lists the containers for the account or the objects for a container. The -p or --prefix is an option that will only list items beginning -with that prefix. The -d or --delimiter is option (for container listings only) -that will roll up items with the given delimiter (see Cloud Files general -documentation for what this means). -.RE - -\fBupload\fR [\fIcommand-options\fR] container file_or_directory [\fIfile_or_directory\fR] [...] -.RS 4 -Uploads to the given container the files and directories specified by the -remaining args. The -c or --changed is an option that will only upload files -that have changed since the last upload. The -S or --segment-size -and --leave-segments are options as well (see --help for more). -.RE - -\fBpost\fR [\fIcommand-options\fR] [\fIcontainer\fR] [\fIobject\fR] -.RS 4 -Updates meta information for the account, container, or object depending -on the args given. If the container is not found, it will be created -automatically; but this is not true for accounts and objects. Containers -also allow the -r (or --read-acl) and -w (or --write-acl) options. The -m -or --meta option is allowed on all and used to define the user meta data -items to set in the form Name:Value. This option can be repeated. -\fBExample\fR: post -m Color:Blue -m Size:Large -.RE - - \fBdownload\fR [\fIcommand-options\fR] [\fIcontainer\fR] [\fIobject\fR] [\fIobject\fR] [...] -.RS 4 -Downloads everything in the account (with --all), or everything in a -container, or a list of objects depending on the args given. For a single -object download, you may use the -o [--output] option to -redirect the output to a specific file or if "-" then just redirect to stdout. -.RE - -\fBdelete\fR [\fIcommand-options\fR] [\fIcontainer\fR] [\fIobject\fR] [\fIobject\fR] [...] -.RS 4 -Deletes everything in the account (with --all), or everything in a container, -or a list of objects depending on the args given. Segments of manifest objects -will be deleted as well, unless you specify the --leave-segments option. -.RE - - .SH OPTIONS -.PD 0 -.IP "--version Show program's version number and exit" -.IP "-h, --help Show this help message and exit" -.IP "-s, --snet Use SERVICENET internal network" -.IP "-v, --verbose Print more info" -.IP "-q, --quiet Suppress status output" -.IP "-A AUTH, --auth=AUTH URL for obtaining an auth token " -.IP "-U USER, --user=USER User name for obtaining an auth token" -.IP "-K KEY, --key=KEY Key for obtaining an auth token" -.PD - - .SH EXAMPLE -.PP -swift -A https://127.0.0.1:443/auth/v1.0 -U swiftops:swiftops -K swiftops stat - -.RS 2 -.PD 0 -.IP "Account: AUTH_43b42dae-dc0b-4a4b-ac55-97de614d6e6e" -.IP "Containers: 1" -.IP "Objects: 1" -.IP "Bytes: 1124" -.IP "Accept-Ranges: bytes" -.IP "X-Trans-Id: txb21186a9eef64ed295a1e95896a0fc72" -.PD -.RE - - -.SH DOCUMENTATION -.LP -More in depth documentation about Openstack-Swift as a whole can be found at -.BI http://swift.openstack.org - - -.LP - diff --git a/doc/manpages/swift.conf.5 b/doc/manpages/swift.conf.5 new file mode 100644 index 0000000000..b750cfdd4f --- /dev/null +++ b/doc/manpages/swift.conf.5 @@ -0,0 +1,217 @@ +.\" +.\" Author: Nandini Tata +.\" Copyright (c) 2016 OpenStack Foundation. +.\" +.\" Licensed under the Apache License, Version 2.0 (the "License"); +.\" you may not use this file except in compliance with the License. 
+.\" You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +.\" implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.TH swift.conf 5 "8/8/2016" "Linux" "OpenStack Swift" + +.SH NAME +.LP +.B swift.conf +\- common configuration file for the OpenStack object storage services + + + +.SH SYNOPSIS +.LP +.B swift.conf + + + +.SH DESCRIPTION +.PP +This is the common configuration file used by all services of OpenStack object +storage services. + +The configuration file follows the python-pastedeploy syntax. The file is +divided into sections, which are enclosed by square brackets. Each section +will contain a certain number of key/value parameters which are described +later. + +Any line that begins with a '#' symbol is ignored. + +You can find more information about python-pastedeploy configuration format at +\fIhttps://docs.pylonsproject.org/projects/pastedeploy/en/latest/#config-format\fR + + + +.SH SWIFT HASH SECTION +.PD 1 +.RS 0 +This is indicated by section named [swift-hash]. Below are the parameters that +are acceptable within this section: + +.PD 0 +.IP "\fBswift_hash_path_suffix\fR" +.IP "\fBswift_hash_path_prefix\fR" +.PD + +swift_hash_path_suffix and swift_hash_path_prefix are used as part of the +hashing algorithm when determining data placement in the cluster. +These values should remain secret and MUST NOT change once a cluster has been +deployed. + +Use only printable chars (python -c "import string; print(string.printable)"). + + + +.SH STORAGE POLICY SECTION +.PD 1 +.RS 0 +This is indicated by section name [storage-policy:#] + +Storage policies are defined here and they determine various characteristics +about how objects are stored and treated. Policies are specified by name on +a per container basis. The policy index is specified in the section header +and is used internally. The policy with index 0 is always used for legacy +containers and can be given a name for use in metadata; however, the ring file +name will always be 'object.ring.gz' for backwards compatibility. If no +policies are defined, a policy with index 0 will be automatically created for +backwards compatibility and given the name Policy-0. A default policy is used +when creating new containers when no policy is specified in the request. If +no other policies are defined, the policy with index 0 will be declared the +default. If multiple policies are defined, you must define a policy with index +0 and you must specify a default. It is recommended you always define a +section for storage-policy:0. Aliases are not mandatory when defining a +storage policy. + +.IP "\fB[storage-policy:index]\fR" +Each storage policy is defined in a separate section with an index specified +in the header. Below are the parameters that are acceptable within this +section: + +.IP "\fBname\fR" +Name of the storage policy. Policy names are case insensitive. +.IP "\fBaliases\fR" +Multiple names can be assigned to one policy using aliases. All names must +follow the Swift naming rules. +.IP "\fBpolicy_type\fR" +Policy type can be replication or erasure_coding. Replication policy +replicates the objects to specified number of replicas. Erasure coding uses +PyECLib API library for encode/decode operations. 
Please refer to Swift +documentation for details on how erasure coding is implemented. +.IP "\fBec_type\fR" +This parameter must be chosen from the list of EC backends supported by +PyECLib. +.IP "\fBec_num_data_fragments\fR" +This parameter is specific to 'erasure coding' policy_type only. It defines +the number of fragments that will be comprised of data. +.IP "\fBec_num_parity_fragments\fR" +This parameter is specific to 'erasure coding' policy_type only. It defines +the number of fragments that will be comprised of parity. +.IP "\fBec_object_segment_size\fR" +This parameter is specific to 'erasure coding' policy_type only. It defines +the amount of data that will be buffered up before feeding a segment into the +encoder/decoder. The default value is 1048576. +.IP "\fIExamples:\fR" + +.PD 0 +.IP "[storage-policy:0]" +.IP "name = Policy-0" +.IP "default = yes" +.IP "policy_type = replication" +.IP "aliases = yellow, orange" + +.IP "[storage-policy:1]" +.IP "name = silver" +.IP "policy_type = replication" + +.IP "[storage-policy:2]" +.IP "name = deepfreeze10-4" +.IP "aliases = df10-4" +.IP "policy_type = erasure_coding" +.IP "ec_type = liberasurecode_rs_vand" +.IP "ec_num_data_fragments = 10" +.IP "ec_num_parity_fragments = 4" +.IP "ec_object_segment_size = 1048576" +.PD +.RE +.PD + + + +.SH SWIFT CONSTRAINTS SECTION +.PD 1 +.RS 0 +This is indicated by section name [swift-constraints]. This section sets the +basic constraints on data saved in the swift cluster. These constraints are +automatically published by the proxy server in responses to /info requests. +Below are the parameters that are acceptable within this section: +.IP "\fBmax_file_size\fR" +max_file_size is the largest "normal" object that can be saved in the cluster. +This is also the limit on the size of each segment of a "large" object when +using the large object manifest support. This value is set in bytes. Setting +it to lower than 1MiB will cause some tests to fail. It is STRONGLY +recommended to leave this value at the default (5 * 2**30 + 2). +.IP "\fBmax_meta_name_length\fR" +max_meta_name_length is the max number of bytes in the utf8 encoding of the +name portion of a metadata header. +.IP "\fBmax_meta_value_length\fR" +max_meta_value_length is the max number of bytes in the utf8 encoding of a +metadata value. +.IP "\fBmax_meta_count\fR" +max_meta_count is the max number of metadata keys that can be stored on a +single account, container, or object. +.IP "\fBmax_meta_overall_size\fR" +max_meta_overall_size is the max number of bytes in the utf8 encoding of the +metadata (keys + values). +.IP "\fBmax_header_size\fR" +max_header_size is the max number of bytes in the utf8 encoding of each +header. Using 8192 as default because eventlet uses 8192 as max size of header +line. This value may need to be increased when using identity v3 API tokens +including more than 7 catalog entries. +.IP "\fBextra_header_count\fR" +By default the maximum number of allowed headers depends on the number of max +allowed metadata settings plus a default value of 36 for swift internally +generated headers and regular http headers. If for some reason this is not +enough (custom middleware for example) it can be increased with the +extra_header_count constraint. +.IP "\fBmax_object_name_length\fR" +max_object_name_length is the max number of bytes in the utf8 encoding of an +object name. +.IP "\fBcontainer_listing_limit\fR" +container_listing_limit is the default (and max) number of items returned for +a container listing request. 
+.IP "\fBaccount_listing_limit\fR" +account_listing_limit is the default (and max) number of items returned for an +account listing request. +.IP "\fBmax_account_name_length\fR" +max_account_name_length is the max number of bytes in the utf8 encoding of an +account name. +.IP "\fBmax_container_name_length\fR" +max_container_name_length is the max number of bytes in the utf8 encoding of a +container name. +.IP "\fBvalid_api_versions\fR" +By default, all REST API calls should use "v1" or "v1.0" as the version string, +for example "/v1/account". This can be manually overridden to make this +backward-compatible, in case a different version string has been used before. +Use a comma-separated list in case of multiple allowed versions, for example +valid_api_versions = v0,v1,v2. +This is only enforced for account, container and object requests. The allowed +api versions are by default excluded from /info. +.IP "\fBauto_create_account_prefix\fR" +auto_create_account_prefix specifies the prefix for system accounts, such as +those used by the object-expirer, and container-sharder. +Default is ".". + + + +.SH DOCUMENTATION +.LP +More in depth documentation about the swift.conf and also OpenStack-Swift as a +whole can be found at +.BI https://docs.openstack.org/swift/latest/admin_guide.html +and +.BI https://docs.openstack.org/swift/latest/ diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000000..32c5aaebd6 --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,10 @@ +# The order of packages is significant, because pip processes them in the order +# of appearance. Changing the order has an impact on the overall integration +# process, which may cause wedges in the gate later. +# this is required for the docs build jobs +sphinx>=2.0.0,!=2.1.0 # BSD +openstackdocstheme>=2.2.1 # Apache-2.0 +reno>=3.1.0 # Apache-2.0 +os-api-ref>=1.0.0 # Apache-2.0 +python-keystoneclient>=3.19.0 # Apache-2.0 +sphinxcontrib-svg2pdfconverter>=0.1.0 # BSD diff --git a/doc/s3api/conf/ceph-known-failures-keystone.yaml b/doc/s3api/conf/ceph-known-failures-keystone.yaml new file mode 100644 index 0000000000..69f0e76ffb --- /dev/null +++ b/doc/s3api/conf/ceph-known-failures-keystone.yaml @@ -0,0 +1,194 @@ +ceph_s3: + :teardown: {status: KNOWN} + :setup: {status: KNOWN} + s3tests.functional.test_headers.test_bucket_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests.functional.test_headers.test_bucket_create_bad_authorization_none: {status: KNOWN} + s3tests.functional.test_headers.test_object_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests.functional.test_headers.test_object_create_bad_authorization_none: {status: KNOWN} + s3tests.functional.test_s3.test_100_continue: {status: KNOWN} + s3tests.functional.test_s3.test_atomic_conditional_write_1mb: {status: KNOWN} + s3tests.functional.test_s3.test_atomic_dual_conditional_write_1mb: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_default: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_email: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_email_notexist: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_nonexist_user: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_userid_fullcontrol: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_userid_read: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_userid_readacp: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_userid_write: {status: 
KNOWN} + s3tests.functional.test_s3.test_bucket_acl_grant_userid_writeacp: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_no_grants: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acls_changes_persistent: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_xml_fullcontrol: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_xml_read: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_xml_readacp: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_xml_write: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_acl_xml_writeacp: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_header_acl_grants: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_list_objects_anonymous: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_list_objects_anonymous_fail: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_recreate_not_overriding: {status: KNOWN} + s3tests.functional.test_s3.test_cors_origin_response: {status: KNOWN} + s3tests.functional.test_s3.test_cors_origin_wildcard: {status: KNOWN} + s3tests.functional.test_s3.test_list_buckets_anonymous: {status: KNOWN} + s3tests.functional.test_s3.test_list_buckets_invalid_auth: {status: KNOWN} + s3tests.functional.test_s3.test_logging_toggle: {status: KNOWN} + s3tests.functional.test_s3.test_multipart_resend_first_finishes_last: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_full_control_verify_owner: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_xml: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_xml_read: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_xml_readacp: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_xml_write: {status: KNOWN} + s3tests.functional.test_s3.test_object_acl_xml_writeacp: {status: KNOWN} + s3tests.functional.test_s3.test_object_copy_canned_acl: {status: KNOWN} + s3tests.functional.test_s3.test_object_copy_not_owned_object_bucket: {status: KNOWN} + s3tests.functional.test_s3.test_object_giveaway: {status: KNOWN} + s3tests.functional.test_s3.test_object_header_acl_grants: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_get: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_get_bucket_acl: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_get_bucket_gone: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_get_object_acl: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_get_object_gone: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_put: {status: KNOWN} + s3tests.functional.test_s3.test_object_raw_put_write_access: {status: KNOWN} + s3tests.functional.test_s3.test_object_set_valid_acl: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_anonymous_request: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_authenticated_request: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_authenticated_request_bad_access_key: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_case_insensitive_condition_fields: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_condition_is_case_sensitive: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_escaped_field_values: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_expired_policy: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_expires_is_case_sensitive: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_ignored_header: {status: KNOWN} + 
s3tests.functional.test_s3.test_post_object_invalid_access_key: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_invalid_content_length_argument: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_invalid_date_format: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_invalid_request_field_value: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_invalid_signature: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_missing_conditions_list: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_missing_content_length_argument: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_missing_expires_condition: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_missing_policy_condition: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_missing_signature: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_no_key_specified: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_request_missing_policy_specified_field: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_set_invalid_success_code: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_set_key_from_filename: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_set_success_code: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_success_redirect_action: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_upload_larger_than_chunk: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_upload_size_below_minimum: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_upload_size_limit_exceeded: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_user_specified_header: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifmatch_failed: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifmatch_good: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifmatch_nonexisted_failed: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifmatch_overwrite_existed_good: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifnonmatch_failed: {status: KNOWN} + s3tests.functional.test_s3.test_put_object_ifnonmatch_good: {status: KNOWN} + s3tests.functional.test_s3.test_set_cors: {status: KNOWN} + s3tests.functional.test_s3.test_stress_bucket_acls_changes: {status: KNOWN} + s3tests.functional.test_s3.test_versioned_concurrent_object_create_concurrent_remove: {status: KNOWN} + s3tests.functional.test_s3.test_versioned_object_acl: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_copy_obj_version: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_multi_object_delete: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_multi_object_delete_with_marker: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_multi_object_delete_with_marker_create: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_create_overwrite_multipart: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_create_read_remove_head: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_create_versions_remove_all: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_create_versions_remove_special_names: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_suspend_versions: {status: KNOWN} + s3tests.functional.test_s3.test_versioning_obj_suspend_versions_simple: {status: KNOWN} + s3tests.functional.test_s3_website.check_can_test_website: {status: KNOWN} + 
s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_base: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_path: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_path_upgrade: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_nonexistant_bucket_rgw: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_nonexistant_bucket_s3: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_public_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_public_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_nonwebsite: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_private_abs: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_private_relative: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_public_abs: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_public_relative: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_list_return_data_versioning: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_acl: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_another_bucket: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_different_tenant: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_set_condition_operator_end_with_IfExists: {status: KNOWN} + s3tests.functional.test_s3.test_delete_tags_obj_public: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_invalid_md5: {status: KNOWN} + 
s3tests.functional.test_s3.test_encryption_sse_c_method_head: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_multipart_bad_download: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_1: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_2: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_no_key: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_no_md5: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_other_key: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_post_object_authenticated_request: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_present: {status: KNOWN} + s3tests.functional.test_s3.test_get_obj_head_tagging: {status: KNOWN} + s3tests.functional.test_s3.test_get_obj_tagging: {status: KNOWN} + s3tests.functional.test_s3.test_get_tags_acl_public: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_deletemarker_expiration: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_expiration: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_expiration_date: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_get: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_get_no_id: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_id_too_long: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_multipart_expiration: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_noncur_expiration: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_rules_conflicted: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_same_id: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_date: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_deletemarker: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_empty_filter: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_filter: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_multipart: {status: KNOWN} + s3tests.functional.test_s3.test_lifecycle_set_noncurrent: {status: KNOWN} + s3tests.functional.test_s3.test_multipart_copy_invalid_range: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_empty_conditions: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_tags_anonymous_request: {status: KNOWN} + s3tests.functional.test_s3.test_post_object_tags_authenticated_request: {status: KNOWN} + s3tests.functional.test_s3.test_put_delete_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_excess_key_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_excess_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_excess_val_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_max_kvsize_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_max_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_modify_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_obj_with_tags: {status: KNOWN} + s3tests.functional.test_s3.test_put_tags_acl_public: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_method_head: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_multipart_invalid_chunks_1: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_multipart_invalid_chunks_2: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_multipart_upload: {status: KNOWN} + 
s3tests.functional.test_s3.test_sse_kms_post_object_authenticated_request: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_present: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_read_declare: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_transfer_13b: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_transfer_1MB: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_transfer_1b: {status: KNOWN} + s3tests.functional.test_s3.test_sse_kms_transfer_1kb: {status: KNOWN} + s3tests.functional.test_s3.test_versioned_object_acl_no_version_specified: {status: KNOWN} diff --git a/doc/s3api/conf/ceph-known-failures-tempauth.yaml b/doc/s3api/conf/ceph-known-failures-tempauth.yaml new file mode 100644 index 0000000000..86dc1dbdb5 --- /dev/null +++ b/doc/s3api/conf/ceph-known-failures-tempauth.yaml @@ -0,0 +1,359 @@ +ceph_s3: + :teardown: {status: KNOWN} + :teardown: {status: KNOWN} + :setup: {status: KNOWN} + s3tests.functional.test_headers.test_bucket_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests.functional.test_headers.test_bucket_create_bad_authorization_none: {status: KNOWN} + s3tests.functional.test_headers.test_object_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests.functional.test_headers.test_object_create_bad_authorization_none: {status: KNOWN} + s3tests.functional.test_s3.test_atomic_dual_conditional_write_1mb: {status: KNOWN} + s3tests.functional.test_s3.test_logging_toggle: {status: KNOWN} + s3tests.functional.test_s3_website.check_can_test_website: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_base: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_path: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_bucket_private_redirectall_path_upgrade: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_nonexistant_bucket_rgw: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_nonexistant_bucket_s3: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_empty_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_private_index_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_private_bucket_list_public_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_empty_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index: {status: KNOWN} + 
s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_blockederrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_gooderrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_private_index_missingerrordoc: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_public_bucket_list_public_index: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_nonwebsite: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_private_abs: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_private_relative: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_public_abs: {status: KNOWN} + s3tests.functional.test_s3_website.test_website_xredirect_public_relative: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_different_tenant: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_set_condition_operator_end_with_IfExists: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_1: {status: KNOWN} + s3tests.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_2: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_put_obj_enc: {status: KNOWN} + s3tests.functional.test_s3.test_bucket_policy_put_obj_request_obj_tag: {status: KNOWN} + s3tests.functional.test_s3.test_append_object_position_wrong: {status: KNOWN} + s3tests.functional.test_s3.test_append_normal_object: {status: KNOWN} + s3tests.functional.test_s3.test_append_object: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_authorization_empty: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_authorization_none: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_date_none_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_authorization_empty: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_authorization_incorrect_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_authorization_invalid_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_authorization_none: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_contentlength_mismatch_above: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_contentlength_mismatch_below_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_contentlength_none: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_object_create_bad_date_none_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_ua_empty_aws2: {status: KNOWN} + s3tests_boto3.functional.test_headers.test_bucket_create_bad_ua_none_aws2: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_100_continue: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_atomic_conditional_write_1mb: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_atomic_dual_conditional_write_1mb: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_acl_grant_email: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_acl_grant_email_notexist: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_bucket_acl_grant_nonexist_user: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_acl_no_grants: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_exists: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_naming_bad_long: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_naming_bad_punctuation: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_naming_bad_short_empty: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_head_extended: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_list_objects_anonymous: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_list_objects_anonymous_fail: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_list_return_data_versioning: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_list_unordered: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_listv2_objects_anonymous: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_listv2_objects_anonymous_fail: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_listv2_unordered: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_another_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_different_tenant: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_get_obj_acl_existing_tag: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_get_obj_existing_tag: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_get_obj_tagging_existing_tag: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_copy_source: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_copy_source_meta: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_enc: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_grant: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_request_obj_tag: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_put_obj_tagging_existing_tag: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_policy_set_condition_operator_end_with_IfExists: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_recreate_not_overriding: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucketv2_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucketv2_policy_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucketv2_policy_another_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucketv2_policy_different_tenant: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_cors_header_option: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_cors_origin_response: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_cors_origin_wildcard: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_delete_tags_obj_public: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_key_no_sse_c: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_invalid_md5: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_method_head: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_encryption_sse_c_multipart_bad_download: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_1: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_multipart_invalid_chunks_2: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_no_key: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_no_md5: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_other_key: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_post_object_authenticated_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_encryption_sse_c_present: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_obj_head_tagging: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_obj_tagging: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_tags_acl_public: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_deletemarker_expiration: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_date: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_days0: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_header_head: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_header_put: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_versioning_enabled: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_get: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_get_no_id: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_id_too_long: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_multipart_expiration: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_noncur_expiration: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_same_id: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_date: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_deletemarker: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_empty_filter: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_filter: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_multipart: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_set_noncurrent: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecyclev2_expiration: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_list_buckets_anonymous: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_list_buckets_invalid_auth: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_logging_toggle: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_multipart_copy_invalid_range: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_multipart_resend_first_finishes_last: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_multipart_upload_empty: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_anon_put: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_anon_put_write_access: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_delete_key_bucket_gone: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_delete_object_with_legal_hold_off: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_object_lock_delete_object_with_legal_hold_on: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_delete_object_with_retention: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_legal_hold_invalid_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_obj_lock: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_obj_metadata: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_obj_retention: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_obj_retention_invalid_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_legal_hold_invalid_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_legal_hold_invalid_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock_invalid_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock_invalid_days: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock_invalid_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock_invalid_years: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_lock_with_days_and_years: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_increase_period: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_invalid_bucket: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_invalid_mode: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_override_default_retention: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_shorten_period: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_shorten_period_bypass: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_put_obj_retention_versionid: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_suspend_versioning: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_uploading_obj: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_bucket_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_bucket_gone: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_object_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_object_gone: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_x_amz_expires_out_max_range: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_get_x_amz_expires_out_positive_range: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_raw_put_authenticated_expired: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_metadata_empty_to_unreadable_prefix: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_metadata_empty_to_unreadable_suffix: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_metadata_overwrite_to_unreadable_prefix: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_metadata_overwrite_to_unreadable_suffix: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_object_set_get_non_utf8_metadata: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_unicode_metadata: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_anonymous_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_authenticated_no_content_type: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_authenticated_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_authenticated_request_bad_access_key: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_case_insensitive_condition_fields: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_condition_is_case_sensitive: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_empty_conditions: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_escaped_field_values: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_expired_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_expires_is_case_sensitive: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_ignored_header: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_invalid_access_key: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_invalid_content_length_argument: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_invalid_date_format: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_invalid_request_field_value: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_invalid_signature: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_missing_conditions_list: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_missing_content_length_argument: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_missing_expires_condition: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_missing_policy_condition: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_missing_signature: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_no_key_specified: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_request_missing_policy_specified_field: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_set_invalid_success_code: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_set_key_from_filename: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_set_success_code: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_success_redirect_action: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_tags_anonymous_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_tags_authenticated_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_upload_larger_than_chunk: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_upload_size_below_minimum: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_upload_size_limit_exceeded: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_post_object_user_specified_header: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_delete_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_excess_key_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_excess_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_excess_val_tags: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_put_max_kvsize_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_max_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_modify_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_obj_with_tags: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifmatch_failed: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifmatch_good: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifmatch_nonexisted_failed: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifmatch_overwrite_existed_good: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifnonmatch_failed: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_object_ifnonmatch_good: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_tags_acl_public: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_set_cors: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_set_tagging: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_method_head: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_multipart_invalid_chunks_1: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_multipart_invalid_chunks_2: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_multipart_upload: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_not_declared: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_post_object_authenticated_request: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_present: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_read_declare: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_transfer_13b: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_transfer_1MB: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_transfer_1b: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_sse_kms_transfer_1kb: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_versioning_bucket_multipart_upload_return_version_id: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_versioning_multi_object_delete_with_marker_create: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_versioning_obj_plain_null_version_overwrite: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_account_usage: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_head_bucket_usage: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_set_get_metadata_empty_to_unreadable_infix: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_recreate_overwrite_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_recreate_new_acl: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_acl_grant_email_not_exist: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_buckets_list_ctime: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_naming_good_starts_alpha: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_bucket_create_naming_good_starts_digit: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_list_multipart_upload_owner: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_set_bucket_tagging: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_atomic_dual_write_1mb: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_atomic_dual_write_4mb: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_atomic_dual_write_8mb: {status: KNOWN} + 
s3tests_boto3.functional.test_s3.test_lifecycle_expiration_tags1: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_tags2: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_versioned_tags2: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_noncur_tags1: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_header_tags_head: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_lifecycle_expiration_header_and_tags_head: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_get_obj_retention_iso8601: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_multi_delete_object_with_retention: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_changing_mode_from_governance_without_bypass: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_lock_changing_mode_from_compliance: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_object_read_unreadable: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_user_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_public_acl_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_authpublic_acl_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_publicpolicy_acl_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_nonpublicpolicy_acl_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_nonpublicpolicy_deny_bucket_policy_status: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_default_public_block: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_public_block: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_block_public_put_bucket_acls: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_block_public_object_canned_acls: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_block_public_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_ignore_public_acls: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_multipart_upload_on_a_bucket_with_policy: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_put_bucket_encryption: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_get_bucket_encryption: {status: KNOWN} + s3tests_boto3.functional.test_s3.test_delete_bucket_encryption: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_generate_where_clause: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_generate_projection: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_count_operation: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_column_sum_min_max: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_nullif_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_nulliftrue_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_is_not_null_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_lowerupper_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_in_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_true_false_in_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_like_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_truefalselike_expressions: {status: KNOWN} + 
s3tests_boto3.functional.test_s3select.test_complex_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_alias: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_alias_cyclic_refernce: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_datetime: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_true_false_datetime: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_csv_parser: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_csv_definition: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_schema_definition: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_when_then_else_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_coalesce_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_cast_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_trim_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_truefalse_trim_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_escape_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_case_value_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_bool_cast_expressions: {status: KNOWN} + s3tests_boto3.functional.test_s3select.test_progress_expressions: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_get_session_token: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_get_session_token_permanent_creds_denied: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_allow: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_deny: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_creds_expiry: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_deny_head_nonexistent: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_allow_head_nonexistent: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_on_different_buckets: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_on_same_bucket: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_put_obj_denial: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_swapping_role_policy_and_session_policy: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_different_op_permissions: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_with_deny_effect: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_check_with_deny_on_same_op: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_bucket_policy_role_arn: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_bucket_policy_session_arn: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_copy_object: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_no_bucket_role_policy: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_session_policy_bucket_policy_deny: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_with_sub: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_with_azp: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_with_request_tag: {status: KNOWN} + 
s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_with_principal_tag: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_for_all_values: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_for_all_values_deny: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_tag_keys_trust_policy: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_tag_keys_role_policy: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_resource_tag: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_resource_tag_deny: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_wrong_resource_tag_deny: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_resource_tag_princ_tag: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_resource_tag_copy_obj: {status: KNOWN} + s3tests_boto3.functional.test_sts.test_assume_role_with_web_identity_role_resource_tag: {status: KNOWN} diff --git a/doc/s3api/rnc/access_control_policy.rnc b/doc/s3api/rnc/access_control_policy.rnc new file mode 100644 index 0000000000..c857359e09 --- /dev/null +++ b/doc/s3api/rnc/access_control_policy.rnc @@ -0,0 +1,7 @@ +include "common.rnc" + +start = + element AccessControlPolicy { + element Owner { CanonicalUser } & + element AccessControlList { AccessControlList } + } diff --git a/doc/s3api/rnc/bucket_logging_status.rnc b/doc/s3api/rnc/bucket_logging_status.rnc new file mode 100644 index 0000000000..a7d9a1effc --- /dev/null +++ b/doc/s3api/rnc/bucket_logging_status.rnc @@ -0,0 +1,10 @@ +include "common.rnc" + +start = + element BucketLoggingStatus { + element LoggingEnabled { + element TargetBucket { xsd:string } & + element TargetPrefix { xsd:string } & + element TargetGrants { AccessControlList }? + }? + } diff --git a/doc/s3api/rnc/common.rnc b/doc/s3api/rnc/common.rnc new file mode 100644 index 0000000000..79dddbb556 --- /dev/null +++ b/doc/s3api/rnc/common.rnc @@ -0,0 +1,26 @@ +namespace xsi = "http://www.w3.org/2001/XMLSchema-instance" + +CanonicalUser = + element ID { xsd:string } & + element DisplayName { xsd:string }? + +StorageClass = "STANDARD" | "REDUCED_REDUNDANCY" | "GLACIER" | "UNKNOWN" + +AccessControlList = + element Grant { + element Grantee { + ( + attribute xsi:type { "AmazonCustomerByEmail" }, + element EmailAddress { xsd:string } + ) | ( + attribute xsi:type { "CanonicalUser" }, + CanonicalUser + ) | ( + attribute xsi:type { "Group" }, + element URI { xsd:string } + ) + } & + element Permission { + "READ" | "WRITE" | "READ_ACP" | "WRITE_ACP" | "FULL_CONTROL" + } + }* diff --git a/doc/s3api/rnc/complete_multipart_upload.rnc b/doc/s3api/rnc/complete_multipart_upload.rnc new file mode 100644 index 0000000000..2a8459ef62 --- /dev/null +++ b/doc/s3api/rnc/complete_multipart_upload.rnc @@ -0,0 +1,12 @@ +start = + element CompleteMultipartUpload { + element Part { + element PartNumber { xsd:int } & + element ETag { xsd:string } & + element ChecksumCRC32 { xsd:string }? & + element ChecksumCRC32C { xsd:string }? & + element ChecksumCRC64NVME { xsd:string }? & + element ChecksumSHA1 { xsd:string }? & + element ChecksumSHA256 { xsd:string }? 
+ }+ + } diff --git a/doc/s3api/rnc/complete_multipart_upload_result.rnc b/doc/s3api/rnc/complete_multipart_upload_result.rnc new file mode 100644 index 0000000000..6dd9cbeb9f --- /dev/null +++ b/doc/s3api/rnc/complete_multipart_upload_result.rnc @@ -0,0 +1,7 @@ +start = + element CompleteMultipartUploadResult { + element Location { xsd:anyURI }, + element Bucket { xsd:string }, + element Key { xsd:string }, + element ETag { xsd:string } + } diff --git a/doc/s3api/rnc/copy_object_result.rnc b/doc/s3api/rnc/copy_object_result.rnc new file mode 100644 index 0000000000..bf96a8a91d --- /dev/null +++ b/doc/s3api/rnc/copy_object_result.rnc @@ -0,0 +1,5 @@ +start = + element CopyObjectResult { + element LastModified { xsd:dateTime }, + element ETag { xsd:string } + } diff --git a/doc/s3api/rnc/copy_part_result.rnc b/doc/s3api/rnc/copy_part_result.rnc new file mode 100644 index 0000000000..a7d7956515 --- /dev/null +++ b/doc/s3api/rnc/copy_part_result.rnc @@ -0,0 +1,5 @@ +start = + element CopyPartResult { + element LastModified { xsd:dateTime }, + element ETag { xsd:string } + } diff --git a/doc/s3api/rnc/create_bucket_configuration.rnc b/doc/s3api/rnc/create_bucket_configuration.rnc new file mode 100644 index 0000000000..e366d72ed5 --- /dev/null +++ b/doc/s3api/rnc/create_bucket_configuration.rnc @@ -0,0 +1,4 @@ +start = + element * { + element LocationConstraint { xsd:string } + } diff --git a/doc/s3api/rnc/delete.rnc b/doc/s3api/rnc/delete.rnc new file mode 100644 index 0000000000..a0cac2d74f --- /dev/null +++ b/doc/s3api/rnc/delete.rnc @@ -0,0 +1,8 @@ +start = + element Delete { + element Quiet { xsd:string }? & + element Object { + element Key { xsd:string } & + element VersionId { xsd:string }? + }+ + } diff --git a/doc/s3api/rnc/delete_result.rnc b/doc/s3api/rnc/delete_result.rnc new file mode 100644 index 0000000000..3a63bf78a8 --- /dev/null +++ b/doc/s3api/rnc/delete_result.rnc @@ -0,0 +1,17 @@ +start = + element DeleteResult { + ( + element Deleted { + element Key { xsd:string }, + element VersionId { xsd:string }?, + element DeleteMarker { xsd:boolean }?, + element DeleteMarkerVersionId { xsd:string }? + } | + element Error { + element Key { xsd:string }, + element VersionId { xsd:string }?, + element Code { xsd:string }, + element Message { xsd:string } + } + )* + } diff --git a/doc/s3api/rnc/error.rnc b/doc/s3api/rnc/error.rnc new file mode 100644 index 0000000000..0e352c71a6 --- /dev/null +++ b/doc/s3api/rnc/error.rnc @@ -0,0 +1,11 @@ +start = + element Error { + element Code { xsd:string }, + element Message { xsd:string }, + DebugInfo* + } + +DebugInfo = + element * { + (attribute * { text } | text | DebugInfo)* + } diff --git a/doc/s3api/rnc/initiate_multipart_upload_result.rnc b/doc/s3api/rnc/initiate_multipart_upload_result.rnc new file mode 100644 index 0000000000..8830121f95 --- /dev/null +++ b/doc/s3api/rnc/initiate_multipart_upload_result.rnc @@ -0,0 +1,6 @@ +start = + element InitiateMultipartUploadResult { + element Bucket { xsd:string }, + element Key { xsd:string }, + element UploadId { xsd:string } + } diff --git a/doc/s3api/rnc/lifecycle_configuration.rnc b/doc/s3api/rnc/lifecycle_configuration.rnc new file mode 100644 index 0000000000..b21fc07b67 --- /dev/null +++ b/doc/s3api/rnc/lifecycle_configuration.rnc @@ -0,0 +1,20 @@ +include "common.rnc" + +start = + element LifecycleConfiguration { + element Rule { + element ID { xsd:string }? & + element Prefix { xsd:string } & + element Status { "Enabled" | "Disabled" } & + element Transition { Transition }? 
& + element Expiration { Expiration }? + }+ + } + +Expiration = + element Days { xsd:int } | + element Date { xsd:dateTime } + +Transition = + Expiration & + element StorageClass { StorageClass } diff --git a/doc/s3api/rnc/list_all_my_buckets_result.rnc b/doc/s3api/rnc/list_all_my_buckets_result.rnc new file mode 100644 index 0000000000..220a34aa99 --- /dev/null +++ b/doc/s3api/rnc/list_all_my_buckets_result.rnc @@ -0,0 +1,12 @@ +include "common.rnc" + +start = + element ListAllMyBucketsResult { + element Owner { CanonicalUser }, + element Buckets { + element Bucket { + element Name { xsd:string }, + element CreationDate { xsd:dateTime } + }* + } + } diff --git a/doc/s3api/rnc/list_bucket_result.rnc b/doc/s3api/rnc/list_bucket_result.rnc new file mode 100644 index 0000000000..eb86c08042 --- /dev/null +++ b/doc/s3api/rnc/list_bucket_result.rnc @@ -0,0 +1,33 @@ +include "common.rnc" + +start = + element ListBucketResult { + element Name { xsd:string }, + element Prefix { xsd:string }, + ( + ( + element Marker { xsd:string }, + element NextMarker { xsd:string }? + ) | ( + element NextContinuationToken { xsd:string }?, + element ContinuationToken { xsd:string }?, + element StartAfter { xsd:string }?, + element KeyCount { xsd:int } + ) + ), + element MaxKeys { xsd:int }, + element Delimiter { xsd:string }?, + element EncodingType { xsd:string }?, + element IsTruncated { xsd:boolean }, + element Contents { + element Key { xsd:string }, + element LastModified { xsd:dateTime }, + element ETag { xsd:string }, + element Size { xsd:long }, + element Owner { CanonicalUser }?, + element StorageClass { StorageClass } + }*, + element CommonPrefixes { + element Prefix { xsd:string } + }* + } diff --git a/doc/s3api/rnc/list_multipart_uploads_result.rnc b/doc/s3api/rnc/list_multipart_uploads_result.rnc new file mode 100644 index 0000000000..6ac1e1237c --- /dev/null +++ b/doc/s3api/rnc/list_multipart_uploads_result.rnc @@ -0,0 +1,26 @@ +include "common.rnc" + +start = + element ListMultipartUploadsResult { + element Bucket { xsd:string }, + element KeyMarker { xsd:string }, + element UploadIdMarker { xsd:string }, + element NextKeyMarker { xsd:string }, + element NextUploadIdMarker { xsd:string }, + element Delimiter { xsd:string }?, + element Prefix { xsd:string }?, + element MaxUploads { xsd:int }, + element EncodingType { xsd:string }?, + element IsTruncated { xsd:boolean }, + element Upload { + element Key { xsd:string }, + element UploadId { xsd:string }, + element Initiator { CanonicalUser }, + element Owner { CanonicalUser }, + element StorageClass { StorageClass }, + element Initiated { xsd:dateTime } + }*, + element CommonPrefixes { + element Prefix { xsd:string } + }* + } diff --git a/doc/s3api/rnc/list_parts_result.rnc b/doc/s3api/rnc/list_parts_result.rnc new file mode 100644 index 0000000000..2143315427 --- /dev/null +++ b/doc/s3api/rnc/list_parts_result.rnc @@ -0,0 +1,22 @@ +include "common.rnc" + +start = + element ListPartsResult { + element Bucket { xsd:string }, + element Key { xsd:string }, + element UploadId { xsd:string }, + element Initiator { CanonicalUser }, + element Owner { CanonicalUser }, + element StorageClass { StorageClass }, + element PartNumberMarker { xsd:int }, + element NextPartNumberMarker { xsd:int }, + element MaxParts { xsd:int }, + element EncodingType { xsd:string }?, + element IsTruncated { xsd:boolean }, + element Part { + element PartNumber { xsd:int }, + element LastModified { xsd:dateTime }, + element ETag { xsd:string }, + element Size { xsd:long } + }* + } 
diff --git a/doc/s3api/rnc/list_versions_result.rnc b/doc/s3api/rnc/list_versions_result.rnc new file mode 100644 index 0000000000..969073f3b0 --- /dev/null +++ b/doc/s3api/rnc/list_versions_result.rnc @@ -0,0 +1,37 @@ +include "common.rnc" + +start = + element ListVersionsResult { + element Name { xsd:string }, + element Prefix { xsd:string }, + element KeyMarker { xsd:string }, + element VersionIdMarker { xsd:string }, + element NextKeyMarker { xsd:string }?, + element NextVersionIdMarker { xsd:string }?, + element MaxKeys { xsd:int }, + element EncodingType { xsd:string }?, + element Delimiter { xsd:string }?, + element IsTruncated { xsd:boolean }, + ( + element Version { + element Key { xsd:string }, + element VersionId { xsd:string }, + element IsLatest { xsd:boolean }, + element LastModified { xsd:dateTime }, + element ETag { xsd:string }, + element Size { xsd:long }, + element Owner { CanonicalUser }?, + element StorageClass { StorageClass } + } | + element DeleteMarker { + element Key { xsd:string }, + element VersionId { xsd:string }, + element IsLatest { xsd:boolean }, + element LastModified { xsd:dateTime }, + element Owner { CanonicalUser }? + } + )*, + element CommonPrefixes { + element Prefix { xsd:string } + }* + } diff --git a/doc/s3api/rnc/location_constraint.rnc b/doc/s3api/rnc/location_constraint.rnc new file mode 100644 index 0000000000..829176ff95 --- /dev/null +++ b/doc/s3api/rnc/location_constraint.rnc @@ -0,0 +1 @@ +start = element LocationConstraint { xsd:string } diff --git a/doc/s3api/rnc/versioning_configuration.rnc b/doc/s3api/rnc/versioning_configuration.rnc new file mode 100644 index 0000000000..87e5d15a97 --- /dev/null +++ b/doc/s3api/rnc/versioning_configuration.rnc @@ -0,0 +1,5 @@ +start = + element VersioningConfiguration { + element Status { "Enabled" | "Suspended" }? & + element MfaDelete { "Enabled" | "Disabled" }? 
+ } diff --git a/doc/saio/bin/remakerings b/doc/saio/bin/remakerings new file mode 100755 index 0000000000..0d67b6b4ce --- /dev/null +++ b/doc/saio/bin/remakerings @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +cd /etc/swift + +rm -f *.builder *.ring.gz backups/*.builder backups/*.ring.gz + +swift-ring-builder object.builder create 10 3 1 +swift-ring-builder object.builder add r1z1-127.0.0.1:6210/sdb1 1 +swift-ring-builder object.builder add r1z2-127.0.0.2:6220/sdb2 1 +swift-ring-builder object.builder add r1z3-127.0.0.3:6230/sdb3 1 +swift-ring-builder object.builder add r1z4-127.0.0.4:6240/sdb4 1 +swift-ring-builder object.builder rebalance +swift-ring-builder object-1.builder create 10 2 1 +swift-ring-builder object-1.builder add r1z1-127.0.0.1:6210/sdb1 1 +swift-ring-builder object-1.builder add r1z2-127.0.0.2:6220/sdb2 1 +swift-ring-builder object-1.builder add r1z3-127.0.0.3:6230/sdb3 1 +swift-ring-builder object-1.builder add r1z4-127.0.0.4:6240/sdb4 1 +swift-ring-builder object-1.builder rebalance +swift-ring-builder object-2.builder create 10 6 1 +swift-ring-builder object-2.builder add r1z1-127.0.0.1:6210/sdb1 1 +swift-ring-builder object-2.builder add r1z1-127.0.0.1:6210/sdb5 1 +swift-ring-builder object-2.builder add r1z2-127.0.0.2:6220/sdb2 1 +swift-ring-builder object-2.builder add r1z2-127.0.0.2:6220/sdb6 1 +swift-ring-builder object-2.builder add r1z3-127.0.0.3:6230/sdb3 1 +swift-ring-builder object-2.builder add r1z3-127.0.0.3:6230/sdb7 1 +swift-ring-builder object-2.builder add r1z4-127.0.0.4:6240/sdb4 1 +swift-ring-builder object-2.builder add r1z4-127.0.0.4:6240/sdb8 1 +swift-ring-builder object-2.builder rebalance +swift-ring-builder container.builder create 10 3 1 +swift-ring-builder container.builder add r1z1-127.0.0.1:6211/sdb1 1 +swift-ring-builder container.builder add r1z2-127.0.0.2:6221/sdb2 1 +swift-ring-builder container.builder add r1z3-127.0.0.3:6231/sdb3 1 +swift-ring-builder container.builder add r1z4-127.0.0.4:6241/sdb4 1 +swift-ring-builder container.builder rebalance +swift-ring-builder account.builder create 10 3 1 +swift-ring-builder account.builder add r1z1-127.0.0.1:6212/sdb1 1 +swift-ring-builder account.builder add r1z2-127.0.0.2:6222/sdb2 1 +swift-ring-builder account.builder add r1z3-127.0.0.3:6232/sdb3 1 +swift-ring-builder account.builder add r1z4-127.0.0.4:6242/sdb4 1 +swift-ring-builder account.builder rebalance diff --git a/doc/saio/bin/resetswift b/doc/saio/bin/resetswift new file mode 100755 index 0000000000..65c7a306cd --- /dev/null +++ b/doc/saio/bin/resetswift @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +swift-init all kill +swift-orphans -a 0 -k KILL + +# Remove the following line if you did not set up rsyslog for individual logging: +sudo find /var/log/swift -type f -exec rm -f {} \; +if cut -d' ' -f2 /proc/mounts | grep -q /mnt/sdb1 ; then + sudo umount /mnt/sdb1 +fi +# If you are using a loopback device set SAIO_BLOCK_DEVICE to "/srv/swift-disk" +sudo mkfs.xfs -f ${SAIO_BLOCK_DEVICE:-/dev/sdb1} +sudo mount /mnt/sdb1 +sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4 +sudo chown ${USER}:${USER} /mnt/sdb1/* +mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \ + /srv/2/node/sdb2 /srv/2/node/sdb6 \ + /srv/3/node/sdb3 /srv/3/node/sdb7 \ + /srv/4/node/sdb4 /srv/4/node/sdb8 +sudo rm -f /var/log/debug /var/log/messages /var/log/rsyncd.log /var/log/syslog +find /var/cache/swift* -type f -name *.recon -exec rm -f {} \; +if [ "`type -t systemctl`" == "file" ]; then + sudo systemctl restart rsyslog + sudo systemctl restart memcached +else + sudo 
service rsyslog restart + sudo service memcached restart +fi diff --git a/doc/saio/bin/startmain b/doc/saio/bin/startmain new file mode 100755 index 0000000000..f099424db2 --- /dev/null +++ b/doc/saio/bin/startmain @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +swift-init main start diff --git a/doc/saio/bin/startrest b/doc/saio/bin/startrest new file mode 100755 index 0000000000..bb1996b25b --- /dev/null +++ b/doc/saio/bin/startrest @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +swift-init rest start diff --git a/doc/saio/rsyncd.conf b/doc/saio/rsyncd.conf new file mode 100644 index 0000000000..28aca5a595 --- /dev/null +++ b/doc/saio/rsyncd.conf @@ -0,0 +1,77 @@ +uid = +gid = +log file = /var/log/rsyncd.log +pid file = /var/run/rsyncd.pid +address = 0.0.0.0 + +[account6212] +max connections = 25 +path = /srv/1/node/ +read only = false +lock file = /var/lock/account6212.lock + +[account6222] +max connections = 25 +path = /srv/2/node/ +read only = false +lock file = /var/lock/account6222.lock + +[account6232] +max connections = 25 +path = /srv/3/node/ +read only = false +lock file = /var/lock/account6232.lock + +[account6242] +max connections = 25 +path = /srv/4/node/ +read only = false +lock file = /var/lock/account6242.lock + +[container6211] +max connections = 25 +path = /srv/1/node/ +read only = false +lock file = /var/lock/container6211.lock + +[container6221] +max connections = 25 +path = /srv/2/node/ +read only = false +lock file = /var/lock/container6221.lock + +[container6231] +max connections = 25 +path = /srv/3/node/ +read only = false +lock file = /var/lock/container6231.lock + +[container6241] +max connections = 25 +path = /srv/4/node/ +read only = false +lock file = /var/lock/container6241.lock + +[object6210] +max connections = 25 +path = /srv/1/node/ +read only = false +lock file = /var/lock/object6210.lock + +[object6220] +max connections = 25 +path = /srv/2/node/ +read only = false +lock file = /var/lock/object6220.lock + +[object6230] +max connections = 25 +path = /srv/3/node/ +read only = false +lock file = /var/lock/object6230.lock + +[object6240] +max connections = 25 +path = /srv/4/node/ +read only = false +lock file = /var/lock/object6240.lock diff --git a/doc/saio/rsyslog.d/10-swift.conf b/doc/saio/rsyslog.d/10-swift.conf new file mode 100644 index 0000000000..e0a5624340 --- /dev/null +++ b/doc/saio/rsyslog.d/10-swift.conf @@ -0,0 +1,30 @@ +# Uncomment the following to have a log containing all logs together +#local1,local2,local3,local4,local5.* /var/log/swift/all.log + +# Uncomment the following to have hourly proxy logs for stats processing +#$template HourlyProxyLog,"/var/log/swift/hourly/%$YEAR%%$MONTH%%$DAY%%$HOUR%" +#local1.*;local1.!notice ?HourlyProxyLog + +local1.*;local1.!notice /var/log/swift/proxy.log +local1.notice /var/log/swift/proxy.error +local1.* ~ + +local2.*;local2.!notice /var/log/swift/storage1.log +local2.notice /var/log/swift/storage1.error +local2.* ~ + +local3.*;local3.!notice /var/log/swift/storage2.log +local3.notice /var/log/swift/storage2.error +local3.* ~ + +local4.*;local4.!notice /var/log/swift/storage3.log +local4.notice /var/log/swift/storage3.error +local4.* ~ + +local5.*;local5.!notice /var/log/swift/storage4.log +local5.notice /var/log/swift/storage4.error +local5.* ~ + +local6.*;local6.!notice /var/log/swift/expirer.log +local6.notice /var/log/swift/expirer.error +local6.* ~ diff --git a/doc/saio/swift/account-server/1.conf b/doc/saio/swift/account-server/1.conf new file mode 100644 index 0000000000..e405f2de41 --- /dev/null +++ 
b/doc/saio/swift/account-server/1.conf @@ -0,0 +1,30 @@ +[DEFAULT] +devices = /srv/1/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.1 +bind_port = 6212 +workers = 1 +user = +log_facility = LOG_LOCAL2 +recon_cache_path = /var/cache/swift +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon account-server + +[app:account-server] +use = egg:swift#account + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[account-replicator] +rsync_module = {replication_ip}::account{replication_port} + +[account-auditor] + +[account-reaper] diff --git a/doc/saio/swift/account-server/2.conf b/doc/saio/swift/account-server/2.conf new file mode 100644 index 0000000000..9b580566ea --- /dev/null +++ b/doc/saio/swift/account-server/2.conf @@ -0,0 +1,30 @@ +[DEFAULT] +devices = /srv/2/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.2 +bind_port = 6222 +workers = 1 +user = +log_facility = LOG_LOCAL3 +recon_cache_path = /var/cache/swift2 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon account-server + +[app:account-server] +use = egg:swift#account + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[account-replicator] +rsync_module = {replication_ip}::account{replication_port} + +[account-auditor] + +[account-reaper] diff --git a/doc/saio/swift/account-server/3.conf b/doc/saio/swift/account-server/3.conf new file mode 100644 index 0000000000..c0cbe6e33e --- /dev/null +++ b/doc/saio/swift/account-server/3.conf @@ -0,0 +1,30 @@ +[DEFAULT] +devices = /srv/3/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.3 +bind_port = 6232 +workers = 1 +user = +log_facility = LOG_LOCAL4 +recon_cache_path = /var/cache/swift3 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon account-server + +[app:account-server] +use = egg:swift#account + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[account-replicator] +rsync_module = {replication_ip}::account{replication_port} + +[account-auditor] + +[account-reaper] diff --git a/doc/saio/swift/account-server/4.conf b/doc/saio/swift/account-server/4.conf new file mode 100644 index 0000000000..105d0eec16 --- /dev/null +++ b/doc/saio/swift/account-server/4.conf @@ -0,0 +1,30 @@ +[DEFAULT] +devices = /srv/4/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.4 +bind_port = 6242 +workers = 1 +user = +log_facility = LOG_LOCAL5 +recon_cache_path = /var/cache/swift4 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon account-server + +[app:account-server] +use = egg:swift#account + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[account-replicator] +rsync_module = {replication_ip}::account{replication_port} + +[account-auditor] + +[account-reaper] diff --git a/doc/saio/swift/container-reconciler/1.conf b/doc/saio/swift/container-reconciler/1.conf new file mode 100644 index 0000000000..9350405e98 --- /dev/null +++ b/doc/saio/swift/container-reconciler/1.conf @@ -0,0 +1,49 @@ +[DEFAULT] +# swift_dir = /etc/swift +user = +# You can specify default log routing here if you want: +# log_name = swift +log_facility = LOG_LOCAL2 +# log_level = INFO +# log_address = /dev/log +# +# comma separated list of functions to call to setup custom log handlers. 
+# functions get passed: conf, name, log_to_console, log_route, fmt, logger, +# adapted_logger +# log_custom_handlers = +# +# If set, log_udp_host will override log_address +# log_udp_host = +# log_udp_port = 514 +# +# You can enable StatsD logging here: +# log_statsd_host = +# log_statsd_port = 8125 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 +# log_statsd_metric_prefix = + +[container-reconciler] +# reclaim_age = 604800 +# interval = 300 +# request_tries = 3 +processes = 4 +process = 0 + +[pipeline:main] +pipeline = catch_errors proxy-logging cache proxy-server + +[app:proxy-server] +use = egg:swift#proxy +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/container-reconciler/2.conf b/doc/saio/swift/container-reconciler/2.conf new file mode 100644 index 0000000000..6ffcf84d7d --- /dev/null +++ b/doc/saio/swift/container-reconciler/2.conf @@ -0,0 +1,49 @@ +[DEFAULT] +# swift_dir = /etc/swift +user = +# You can specify default log routing here if you want: +# log_name = swift +log_facility = LOG_LOCAL3 +# log_level = INFO +# log_address = /dev/log +# +# comma separated list of functions to call to setup custom log handlers. +# functions get passed: conf, name, log_to_console, log_route, fmt, logger, +# adapted_logger +# log_custom_handlers = +# +# If set, log_udp_host will override log_address +# log_udp_host = +# log_udp_port = 514 +# +# You can enable StatsD logging here: +# log_statsd_host = +# log_statsd_port = 8125 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 +# log_statsd_metric_prefix = + +[container-reconciler] +# reclaim_age = 604800 +# interval = 300 +# request_tries = 3 +processes = 4 +process = 1 + +[pipeline:main] +pipeline = catch_errors proxy-logging cache proxy-server + +[app:proxy-server] +use = egg:swift#proxy +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/container-reconciler/3.conf b/doc/saio/swift/container-reconciler/3.conf new file mode 100644 index 0000000000..843b7bb79e --- /dev/null +++ b/doc/saio/swift/container-reconciler/3.conf @@ -0,0 +1,49 @@ +[DEFAULT] +# swift_dir = /etc/swift +user = +# You can specify default log routing here if you want: +# log_name = swift +log_facility = LOG_LOCAL4 +# log_level = INFO +# log_address = /dev/log +# +# comma separated list of functions to call to setup custom log handlers. 
+# functions get passed: conf, name, log_to_console, log_route, fmt, logger, +# adapted_logger +# log_custom_handlers = +# +# If set, log_udp_host will override log_address +# log_udp_host = +# log_udp_port = 514 +# +# You can enable StatsD logging here: +# log_statsd_host = +# log_statsd_port = 8125 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 +# log_statsd_metric_prefix = + +[container-reconciler] +# reclaim_age = 604800 +# interval = 300 +# request_tries = 3 +processes = 4 +process = 2 + +[pipeline:main] +pipeline = catch_errors proxy-logging cache proxy-server + +[app:proxy-server] +use = egg:swift#proxy +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/container-reconciler/4.conf b/doc/saio/swift/container-reconciler/4.conf new file mode 100644 index 0000000000..b7d6dc9978 --- /dev/null +++ b/doc/saio/swift/container-reconciler/4.conf @@ -0,0 +1,49 @@ +[DEFAULT] +# swift_dir = /etc/swift +user = +# You can specify default log routing here if you want: +# log_name = swift +log_facility = LOG_LOCAL5 +# log_level = INFO +# log_address = /dev/log +# +# comma separated list of functions to call to setup custom log handlers. +# functions get passed: conf, name, log_to_console, log_route, fmt, logger, +# adapted_logger +# log_custom_handlers = +# +# If set, log_udp_host will override log_address +# log_udp_host = +# log_udp_port = 514 +# +# You can enable StatsD logging here: +# log_statsd_host = +# log_statsd_port = 8125 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 +# log_statsd_metric_prefix = + +[container-reconciler] +# reclaim_age = 604800 +# interval = 300 +# request_tries = 3 +processes = 4 +process = 3 + +[pipeline:main] +pipeline = catch_errors proxy-logging cache proxy-server + +[app:proxy-server] +use = egg:swift#proxy +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/container-server/1.conf b/doc/saio/swift/container-server/1.conf new file mode 100644 index 0000000000..af03ac8e3f --- /dev/null +++ b/doc/saio/swift/container-server/1.conf @@ -0,0 +1,42 @@ +[DEFAULT] +devices = /srv/1/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.1 +bind_port = 6211 +workers = 1 +user = +log_facility = LOG_LOCAL2 +recon_cache_path = /var/cache/swift +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon container-server + +[app:container-server] +use = egg:swift#container + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[container-replicator] +rsync_module = {replication_ip}::container{replication_port} + +[container-updater] + +[container-auditor] + +[container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 
+cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/2.conf b/doc/saio/swift/container-server/2.conf new file mode 100644 index 0000000000..fe20c6d2c4 --- /dev/null +++ b/doc/saio/swift/container-server/2.conf @@ -0,0 +1,42 @@ +[DEFAULT] +devices = /srv/2/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.2 +bind_port = 6221 +workers = 1 +user = +log_facility = LOG_LOCAL3 +recon_cache_path = /var/cache/swift2 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon container-server + +[app:container-server] +use = egg:swift#container + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[container-replicator] +rsync_module = {replication_ip}::container{replication_port} + +[container-updater] + +[container-auditor] + +[container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/3.conf b/doc/saio/swift/container-server/3.conf new file mode 100644 index 0000000000..a6960ba065 --- /dev/null +++ b/doc/saio/swift/container-server/3.conf @@ -0,0 +1,42 @@ +[DEFAULT] +devices = /srv/3/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.3 +bind_port = 6231 +workers = 1 +user = +log_facility = LOG_LOCAL4 +recon_cache_path = /var/cache/swift3 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon container-server + +[app:container-server] +use = egg:swift#container + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[container-replicator] +rsync_module = {replication_ip}::container{replication_port} + +[container-updater] + +[container-auditor] + +[container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/4.conf b/doc/saio/swift/container-server/4.conf new file mode 100644 index 0000000000..35c0a528f6 --- /dev/null +++ b/doc/saio/swift/container-server/4.conf @@ -0,0 +1,42 @@ +[DEFAULT] +devices = /srv/4/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.4 +bind_port = 6241 +workers = 1 +user = +log_facility = LOG_LOCAL5 +recon_cache_path = /var/cache/swift4 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon container-server + +[app:container-server] +use = egg:swift#container + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[container-replicator] +rsync_module = {replication_ip}::container{replication_port} + +[container-updater] + +[container-auditor] + +[container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes 
+shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-sync-realms.conf b/doc/saio/swift/container-sync-realms.conf new file mode 100644 index 0000000000..503a71c4f1 --- /dev/null +++ b/doc/saio/swift/container-sync-realms.conf @@ -0,0 +1,5 @@ +[saio] +key = changeme +key2 = changeme +cluster_saio_endpoint = http://127.0.0.1:8080/v1/ + diff --git a/doc/saio/swift/internal-client.conf b/doc/saio/swift/internal-client.conf new file mode 100644 index 0000000000..052d1e7549 --- /dev/null +++ b/doc/saio/swift/internal-client.conf @@ -0,0 +1,24 @@ +[DEFAULT] + +[pipeline:main] +pipeline = catch_errors proxy-logging cache symlink proxy-server + +[app:proxy-server] +use = egg:swift#proxy +account_autocreate = true +# See proxy-server.conf-sample for options + +[filter:symlink] +use = egg:swift#symlink +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/object-expirer.conf b/doc/saio/swift/object-expirer.conf new file mode 100644 index 0000000000..58c85d2843 --- /dev/null +++ b/doc/saio/swift/object-expirer.conf @@ -0,0 +1,58 @@ +[DEFAULT] +# swift_dir = /etc/swift +user = +# You can specify default log routing here if you want: +log_name = object-expirer +log_facility = LOG_LOCAL6 +log_level = INFO +#log_address = /dev/log +# +# comma separated list of functions to call to setup custom log handlers. +# functions get passed: conf, name, log_to_console, log_route, fmt, logger, +# adapted_logger +# log_custom_handlers = +# +# If set, log_udp_host will override log_address +# log_udp_host = +# log_udp_port = 514 +# +# You can enable StatsD logging here: +# log_statsd_host = +# log_statsd_port = 8125 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 +# log_statsd_metric_prefix = + +[object-expirer] +interval = 300 +# report_interval = 300 +# concurrency is the level of concurrency to use to do the work, this value +# must be set to at least 1 +# concurrency = 1 +# processes is how many parts to divide the work into, one part per process +# that will be doing the work +# processes set 0 means that a single process will be doing all the work +# processes can also be specified on the command line and will override the +# config value +# processes = 0 +# process is which of the parts a particular process will work on +# process can also be specified on the command line and will override the config +# value +# process is "zero based", if you want to use 3 processes, you should run +# processes with process set to 0, 1, and 2 +# process = 0 + +[pipeline:main] +pipeline = catch_errors cache proxy-server + +[app:proxy-server] +use = egg:swift#proxy +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/doc/saio/swift/object-server/1.conf b/doc/saio/swift/object-server/1.conf new file mode 100644 index 0000000000..ecd5ff01c9 --- /dev/null +++ b/doc/saio/swift/object-server/1.conf @@ -0,0 +1,34 @@ +[DEFAULT] +devices = /srv/1/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.1 +bind_port = 6210 +workers = 1 +user = +log_facility = LOG_LOCAL2 +recon_cache_path = /var/cache/swift 
+eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon object-server + +[app:object-server] +use = egg:swift#object + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[object-replicator] +rsync_module = {replication_ip}::object{replication_port} + +[object-reconstructor] + +[object-updater] + +[object-auditor] + +[object-relinker] diff --git a/doc/saio/swift/object-server/2.conf b/doc/saio/swift/object-server/2.conf new file mode 100644 index 0000000000..456f7d5586 --- /dev/null +++ b/doc/saio/swift/object-server/2.conf @@ -0,0 +1,34 @@ +[DEFAULT] +devices = /srv/2/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.2 +bind_port = 6220 +workers = 1 +user = +log_facility = LOG_LOCAL3 +recon_cache_path = /var/cache/swift2 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon object-server + +[app:object-server] +use = egg:swift#object + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[object-replicator] +rsync_module = {replication_ip}::object{replication_port} + +[object-reconstructor] + +[object-updater] + +[object-auditor] + +[object-relinker] diff --git a/doc/saio/swift/object-server/3.conf b/doc/saio/swift/object-server/3.conf new file mode 100644 index 0000000000..9a0ebbdca0 --- /dev/null +++ b/doc/saio/swift/object-server/3.conf @@ -0,0 +1,34 @@ +[DEFAULT] +devices = /srv/3/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.3 +bind_port = 6230 +workers = 1 +user = +log_facility = LOG_LOCAL4 +recon_cache_path = /var/cache/swift3 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon object-server + +[app:object-server] +use = egg:swift#object + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[object-replicator] +rsync_module = {replication_ip}::object{replication_port} + +[object-reconstructor] + +[object-updater] + +[object-auditor] + +[object-relinker] diff --git a/doc/saio/swift/object-server/4.conf b/doc/saio/swift/object-server/4.conf new file mode 100644 index 0000000000..1c0db1ff51 --- /dev/null +++ b/doc/saio/swift/object-server/4.conf @@ -0,0 +1,34 @@ +[DEFAULT] +devices = /srv/4/node +mount_check = false +disable_fallocate = true +bind_ip = 127.0.0.4 +bind_port = 6240 +workers = 1 +user = +log_facility = LOG_LOCAL5 +recon_cache_path = /var/cache/swift4 +eventlet_debug = true + +[pipeline:main] +pipeline = healthcheck recon object-server + +[app:object-server] +use = egg:swift#object + +[filter:recon] +use = egg:swift#recon + +[filter:healthcheck] +use = egg:swift#healthcheck + +[object-replicator] +rsync_module = {replication_ip}::object{replication_port} + +[object-reconstructor] + +[object-updater] + +[object-auditor] + +[object-relinker] diff --git a/doc/saio/swift/proxy-server.conf b/doc/saio/swift/proxy-server.conf new file mode 100644 index 0000000000..c43c036072 --- /dev/null +++ b/doc/saio/swift/proxy-server.conf @@ -0,0 +1,109 @@ +[DEFAULT] +bind_ip = 127.0.0.1 +bind_port = 8080 +workers = 1 +user = +log_facility = LOG_LOCAL1 +eventlet_debug = true + +[pipeline:main] +# Yes, proxy-logging appears twice. This is so that +# middleware-originated requests get logged too. 
+pipeline = catch_errors gatekeeper healthcheck proxy-logging cache etag-quoter listing_formats bulk tempurl ratelimit crossdomain container_sync tempauth staticweb copy container-quotas account-quotas slo dlo versioned_writes symlink proxy-logging proxy-server + +[filter:catch_errors] +use = egg:swift#catch_errors + +[filter:healthcheck] +use = egg:swift#healthcheck + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:bulk] +use = egg:swift#bulk + +[filter:ratelimit] +use = egg:swift#ratelimit + +[filter:crossdomain] +use = egg:swift#crossdomain + +[filter:dlo] +use = egg:swift#dlo + +[filter:slo] +use = egg:swift#slo + +[filter:container_sync] +use = egg:swift#container_sync +current = //saio/saio_endpoint + +[filter:tempurl] +use = egg:swift#tempurl + +[filter:tempauth] +use = egg:swift#tempauth +user_admin_admin = admin .admin .reseller_admin +user_test_tester = testing .admin +user_test_tester2 = testing2 .admin +user_test_tester3 = testing3 +user_test2_tester2 = testing2 .admin + +[filter:staticweb] +use = egg:swift#staticweb + +[filter:account-quotas] +use = egg:swift#account_quotas + +[filter:container-quotas] +use = egg:swift#container_quotas + +[filter:cache] +use = egg:swift#memcache + +[filter:etag-quoter] +use = egg:swift#etag_quoter +enable_by_default = false + +[filter:gatekeeper] +use = egg:swift#gatekeeper + +[filter:versioned_writes] +use = egg:swift#versioned_writes +allow_versioned_writes = true +allow_object_versioning = true + +[filter:copy] +use = egg:swift#copy + +[filter:listing_formats] +use = egg:swift#listing_formats + +[filter:domain_remap] +use = egg:swift#domain_remap + +[filter:symlink] +use = egg:swift#symlink + +# To enable, add the s3api middleware to the pipeline before tempauth +[filter:s3api] +use = egg:swift#s3api +s3_acl = yes +check_bucket_owner = yes +cors_preflight_allow_origin = * + +# Example to create root secret: `openssl rand -base64 32` +[filter:keymaster] +use = egg:swift#keymaster +encryption_root_secret = changeme/changeme/changeme/changeme/change/= + +# To enable use of encryption add both middlewares to pipeline, example: +# keymaster encryption proxy-logging proxy-server +[filter:encryption] +use = egg:swift#encryption + +[app:proxy-server] +use = egg:swift#proxy +allow_account_management = true +account_autocreate = true diff --git a/doc/saio/swift/swift.conf b/doc/saio/swift/swift.conf new file mode 100644 index 0000000000..e01a0ac881 --- /dev/null +++ b/doc/saio/swift/swift.conf @@ -0,0 +1,21 @@ +[swift-hash] +# random unique strings that can never change (DO NOT LOSE) +# Use only printable chars (python -c "import string; print(string.printable)") +swift_hash_path_prefix = changeme +swift_hash_path_suffix = changeme + +[storage-policy:0] +name = gold +policy_type = replication +default = yes + +[storage-policy:1] +name = silver +policy_type = replication + +[storage-policy:2] +name = ec42 +policy_type = erasure_coding +ec_type = liberasurecode_rs_vand +ec_num_data_fragments = 4 +ec_num_parity_fragments = 2 diff --git a/doc/source/_extra/.htaccess b/doc/source/_extra/.htaccess new file mode 100644 index 0000000000..b45cdfeb54 --- /dev/null +++ b/doc/source/_extra/.htaccess @@ -0,0 +1,2 @@ +# docs redirects are defined here +redirectmatch 301 ^/swift/([^/]+)/team.html$ https://github.com/openstack/swift/blob/master/AUTHORS diff --git a/doc/source/_ga/layout.html b/doc/source/_ga/layout.html deleted file mode 100644 index 876fe009d8..0000000000 --- a/doc/source/_ga/layout.html +++ /dev/null @@ -1,17 +0,0 @@ -{% extends 
"!layout.html" %} - -{% block footer %} -{{ super() }} - - -{% endblock %} - diff --git a/doc/source/_static/basic.css b/doc/source/_static/basic.css deleted file mode 100644 index d909ce37c7..0000000000 --- a/doc/source/_static/basic.css +++ /dev/null @@ -1,416 +0,0 @@ -/** - * Sphinx stylesheet -- basic theme - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -/* -- main layout ----------------------------------------------------------- */ - -div.clearer { - clear: both; -} - -/* -- relbar ---------------------------------------------------------------- */ - -div.related { - width: 100%; - font-size: 90%; -} - -div.related h3 { - display: none; -} - -div.related ul { - margin: 0; - padding: 0 0 0 10px; - list-style: none; -} - -div.related li { - display: inline; -} - -div.related li.right { - float: right; - margin-right: 5px; -} - -/* -- sidebar --------------------------------------------------------------- */ - -div.sphinxsidebarwrapper { - padding: 10px 5px 0 10px; -} - -div.sphinxsidebar { - float: left; - width: 230px; - margin-left: -100%; - font-size: 90%; -} - -div.sphinxsidebar ul { - list-style: none; -} - -div.sphinxsidebar ul ul, -div.sphinxsidebar ul.want-points { - margin-left: 20px; - list-style: square; -} - -div.sphinxsidebar ul ul { - margin-top: 0; - margin-bottom: 0; -} - -div.sphinxsidebar form { - margin-top: 10px; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - -img { - border: 0; -} - -/* -- search page ----------------------------------------------------------- */ - -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li div.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* -- index page ------------------------------------------------------------ */ - -table.contentstable { - width: 90%; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - -span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* -- general index --------------------------------------------------------- */ - -table.indextable td { - text-align: left; - vertical-align: top; -} - -table.indextable dl, table.indextable dd { - margin-top: 0; - margin-bottom: 0; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -/* -- general body styles --------------------------------------------------- */ - -a.headerlink { - visibility: hidden; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink { - visibility: visible; -} - -div.body p.caption { - text-align: inherit; -} - -div.body td { - text-align: left; -} - -.field-list ul { - padding-left: 1em; -} - -.first { -} - -p.rubric { - margin-top: 30px; - font-weight: bold; -} - -/* -- sidebars -------------------------------------------------------------- */ - -div.sidebar { - margin: 0 0 0.5em 1em; - border: 1px solid #ddb; - padding: 7px 7px 0 7px; - background-color: #ffe; - width: 40%; - float: right; -} - -p.sidebar-title { - font-weight: bold; 
-} - -/* -- topics ---------------------------------------------------------------- */ - -div.topic { - border: 1px solid #ccc; - padding: 7px 7px 0 7px; - margin: 10px 0 10px 0; -} - -p.topic-title { - font-size: 1.1em; - font-weight: bold; - margin-top: 10px; -} - -/* -- admonitions ----------------------------------------------------------- */ - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 7px; -} - -div.admonition dt { - font-weight: bold; -} - -div.admonition dl { - margin-bottom: 0; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; -} - -div.body p.centered { - text-align: center; - margin-top: 25px; -} - -/* -- tables ---------------------------------------------------------------- */ - -table.docutils { - border: 0; - border-collapse: collapse; -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 0; - border-top: 0; - border-left: 0; - border-right: 0; - border-bottom: 1px solid #aaa; -} - -table.field-list td, table.field-list th { - border: 0 !important; -} - -table.footnote td, table.footnote th { - border: 0 !important; -} - -th { - text-align: left; - padding-right: 5px; -} - -/* -- other body styles ----------------------------------------------------- */ - -dl { - margin-bottom: 15px; -} - -dd p { - margin-top: 0px; -} - -dd ul, dd table { - margin-bottom: 10px; -} - -dd { - margin-top: 3px; - margin-bottom: 10px; - margin-left: 30px; -} - -dt:target, .highlight { - background-color: #fbe54e; -} - -dl.glossary dt { - font-weight: bold; - font-size: 1.1em; -} - -.field-list ul { - margin: 0; - padding-left: 1em; -} - -.field-list p { - margin: 0; -} - -.refcount { - color: #060; -} - -.optional { - font-size: 1.3em; -} - -.versionmodified { - font-style: italic; -} - -.system-message { - background-color: #fda; - padding: 5px; - border: 3px solid red; -} - -.footnote:target { - background-color: #ffa -} - -.line-block { - display: block; - margin-top: 1em; - margin-bottom: 1em; -} - -.line-block .line-block { - margin-top: 0; - margin-bottom: 0; - margin-left: 1.5em; -} - -/* -- code displays --------------------------------------------------------- */ - -pre { - overflow: auto; -} - -td.linenos pre { - padding: 5px 0px; - border: 0; - background-color: transparent; - color: #aaa; -} - -table.highlighttable { - margin-left: 0.5em; -} - -table.highlighttable td { - padding: 0 0.5em 0 0.5em; -} - -tt.descname { - background-color: transparent; - font-weight: bold; - font-size: 1.2em; -} - -tt.descclassname { - background-color: transparent; -} - -tt.xref, a tt { - background-color: transparent; - font-weight: bold; -} - -h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt { - background-color: transparent; -} - -/* -- math display ---------------------------------------------------------- */ - -img.math { - vertical-align: middle; -} - -div.body div.math p { - text-align: center; -} - -span.eqno { - float: right; -} - -/* -- printout stylesheet --------------------------------------------------- */ - -@media print { - div.document, - div.documentwrapper, - div.bodywrapper { - margin: 0 !important; - width: 100%; - } - - div.sphinxsidebar, - div.related, - div.footer, - #top-link { - display: none; - } -} diff --git a/doc/source/_static/default.css b/doc/source/_static/default.css deleted file mode 100644 index c8091ecb4d..0000000000 --- a/doc/source/_static/default.css +++ /dev/null @@ -1,230 +0,0 @@ -/** - * Sphinx stylesheet -- default theme - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -@import url("basic.css"); - -/* 
-- page layout ----------------------------------------------------------- */ - -body { - font-family: sans-serif; - font-size: 100%; - background-color: #11303d; - color: #000; - margin: 0; - padding: 0; -} - -div.document { - background-color: #1c4e63; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 230px; -} - -div.body { - background-color: #ffffff; - color: #000000; - padding: 0 20px 30px 20px; -} - -div.footer { - color: #ffffff; - width: 100%; - padding: 9px 0 9px 0; - text-align: center; - font-size: 75%; -} - -div.footer a { - color: #ffffff; - text-decoration: underline; -} - -div.related { - background-color: #133f52; - line-height: 30px; - color: #ffffff; -} - -div.related a { - color: #ffffff; -} - -div.sphinxsidebar { -} - -div.sphinxsidebar h3 { - font-family: 'Trebuchet MS', sans-serif; - color: #ffffff; - font-size: 1.4em; - font-weight: normal; - margin: 0; - padding: 0; -} - -div.sphinxsidebar h3 a { - color: #ffffff; -} - -div.sphinxsidebar h4 { - font-family: 'Trebuchet MS', sans-serif; - color: #ffffff; - font-size: 1.3em; - font-weight: normal; - margin: 5px 0 0 0; - padding: 0; -} - -div.sphinxsidebar p { - color: #ffffff; -} - -div.sphinxsidebar p.topless { - margin: 5px 10px 10px 10px; -} - -div.sphinxsidebar ul { - margin: 10px; - padding: 0; - color: #ffffff; -} - -div.sphinxsidebar a { - color: #98dbcc; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - -/* -- body styles ----------------------------------------------------------- */ - -a { - color: #355f7c; - text-decoration: none; -} - -a:hover { - text-decoration: underline; -} - -div.body p, div.body dd, div.body li { - text-align: left; - line-height: 130%; -} - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: 'Trebuchet MS', sans-serif; - background-color: #f2f2f2; - font-weight: normal; - color: #20435c; - border-bottom: 1px solid #ccc; - margin: 20px -20px 10px -20px; - padding: 3px 0 3px 10px; -} - -div.body h1 { margin-top: 0; font-size: 200%; } -div.body h2 { font-size: 160%; } -div.body h3 { font-size: 140%; } -div.body h4 { font-size: 120%; } -div.body h5 { font-size: 110%; } -div.body h6 { font-size: 100%; } - -a.headerlink { - color: #c60f0f; - font-size: 0.8em; - padding: 0 4px 0 4px; - text-decoration: none; -} - -a.headerlink:hover { - background-color: #c60f0f; - color: white; -} - -div.body p, div.body dd, div.body li { - text-align: left; - line-height: 130%; -} - -div.admonition p.admonition-title + p { - display: inline; -} - -div.admonition p { - margin-bottom: 5px; -} - -div.admonition pre { - margin-bottom: 5px; -} - -div.admonition ul, div.admonition ol { - margin-bottom: 5px; -} - -div.note { - background-color: #eee; - border: 1px solid #ccc; -} - -div.seealso { - background-color: #ffc; - border: 1px solid #ff6; -} - -div.topic { - background-color: #eee; -} - -div.warning { - background-color: #ffe4e4; - border: 1px solid #f66; -} - -p.admonition-title { - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -pre { - padding: 5px; - background-color: #eeffcc; - color: #333333; - line-height: 120%; - border: 1px solid #ac9; - border-left: none; - border-right: none; -} - -tt { - background-color: #ecf0f3; - padding: 0 1px 0 1px; - font-size: 0.95em; -} - -.warning tt { - background: #efc2c2; -} - -.note tt { - background: #d6d6d6; -} diff --git a/doc/source/_static/tweaks.css 
b/doc/source/_static/tweaks.css deleted file mode 100644 index 1b6fdaa186..0000000000 --- a/doc/source/_static/tweaks.css +++ /dev/null @@ -1,212 +0,0 @@ -ul.todo_list { - list-style-type: none; - margin: 0; - padding: 0; -} - -ul.todo_list li { - display: block; - margin: 0; - padding: 7px 0; - border-top: 1px solid #eee; -} - -ul.todo_list li p { - display: inline; -} - -ul.todo_list li p.link { - font-weight: bold; -} - -ul.todo_list li p.details { - font-style: italic; -} - -ul.todo_list li { -} - -div.admonition { - border: 1px solid #8F1000; -} - -div.admonition p.admonition-title { - background-color: #8F1000; - border-bottom: 1px solid #8E8E8E; -} - -a { - color: #CF2F19; -} - -div.related ul li a { - color: #CF2F19; -} - -div.sphinxsidebar h4 { - background-color:#8E8E8E; - border:1px solid #255E6E; - color:white; - font-size:1em; - margin:1em 0 0.5em; - padding:0.1em 0 0.1em 0.5em; -} - -em { - font-style: normal; -} - -table.docutils { - font-size: 11px; -} - -a tt { - color:#CF2F19; -} - -/* ------------------------------------------ -PURE CSS SPEECH BUBBLES -by Nicolas Gallagher -- http://nicolasgallagher.com/pure-css-speech-bubbles/ - -http://nicolasgallagher.com -http://twitter.com/necolas - -Created: 02 March 2010 -Version: 1.1 (21 October 2010) - -Dual licensed under MIT and GNU GPLv2 © Nicolas Gallagher ------------------------------------------- */ -/* THE SPEECH BUBBLE -------------------------------------------------------------------------------------------------------------------------------- */ - -/* THE SPEECH BUBBLE -------------------------------------------------------------------------------------------------------------------------------- */ - -.triangle-border { - position:relative; - padding:15px; - margin:1em 0 3em; - border:5px solid #BC1518; - color:#333; - background:#fff; - - /* css3 */ - -moz-border-radius:10px; - -webkit-border-radius:10px; - border-radius:10px; -} - -/* Variant : for left positioned triangle ------------------------------------------- */ - -.triangle-border.left { - margin-left:30px; -} - -/* Variant : for right positioned triangle ------------------------------------------- */ - -.triangle-border.right { - margin-right:30px; -} - -/* THE TRIANGLE -------------------------------------------------------------------------------------------------------------------------------- */ - -.triangle-border:before { - content:""; - display:block; /* reduce the damage in FF3.0 */ - position:absolute; - bottom:-40px; /* value = - border-top-width - border-bottom-width */ - left:40px; /* controls horizontal position */ - width:0; - height:0; - border:20px solid transparent; - border-top-color:#BC1518; -} - -/* creates the smaller triangle */ -.triangle-border:after { - content:""; - display:block; /* reduce the damage in FF3.0 */ - position:absolute; - bottom:-26px; /* value = - border-top-width - border-bottom-width */ - left:47px; /* value = (:before left) + (:before border-left) - (:after border-left) */ - width:0; - height:0; - border:13px solid transparent; - border-top-color:#fff; -} - -/* Variant : top ------------------------------------------- */ - -/* creates the larger triangle */ -.triangle-border.top:before { - top:-40px; /* value = - border-top-width - border-bottom-width */ - right:40px; /* controls horizontal position */ - bottom:auto; - left:auto; - border:20px solid transparent; - border-bottom-color:#BC1518; -} - -/* creates the smaller triangle */ -.triangle-border.top:after { - top:-26px; /* value = - border-top-width - 
border-bottom-width */ - right:47px; /* value = (:before right) + (:before border-right) - (:after border-right) */ - bottom:auto; - left:auto; - border:13px solid transparent; - border-bottom-color:#fff; -} - -/* Variant : left ------------------------------------------- */ - -/* creates the larger triangle */ -.triangle-border.left:before { - top:10px; /* controls vertical position */ - left:-30px; /* value = - border-left-width - border-right-width */ - bottom:auto; - border-width:15px 30px 15px 0; - border-style:solid; - border-color:transparent #BC1518; -} - -/* creates the smaller triangle */ -.triangle-border.left:after { - top:16px; /* value = (:before top) + (:before border-top) - (:after border-top) */ - left:-21px; /* value = - border-left-width - border-right-width */ - bottom:auto; - border-width:9px 21px 9px 0; - border-style:solid; - border-color:transparent #fff; -} - -/* Variant : right ------------------------------------------- */ - -/* creates the larger triangle */ -.triangle-border.right:before { - top:10px; /* controls vertical position */ - right:-30px; /* value = - border-left-width - border-right-width */ - bottom:auto; - left:auto; - border-width:15px 0 15px 30px; - border-style:solid; - border-color:transparent #BC1518; -} - -/* creates the smaller triangle */ -.triangle-border.right:after { - top:16px; /* value = (:before top) + (:before border-top) - (:after border-top) */ - right:-21px; /* value = - border-left-width - border-right-width */ - bottom:auto; - left:auto; - border-width:9px 0 9px 21px; - border-style:solid; - border-color:transparent #fff; -} - diff --git a/doc/source/_theme/layout.html b/doc/source/_theme/layout.html deleted file mode 100644 index fae4d6764d..0000000000 --- a/doc/source/_theme/layout.html +++ /dev/null @@ -1,69 +0,0 @@ -{% extends "sphinxdoc/layout.html" %} -{% set css_files = css_files + ['_static/tweaks.css'] %} - -{%- macro sidebar() %} -{%- if not embedded %}{% if not theme_nosidebar|tobool %} -
[deleted template markup elided during extraction: the removed sidebar macro rendered the logo, the table of contents, previous/next topic links, a "This Page" source link, and a search box with a note pointing readers to docs.openstack.org and the older Swift 1.2 and 1.3 documentation]
- {%- endif %}{% endif %} -{%- endmacro %} \ No newline at end of file diff --git a/doc/source/_theme/theme.conf b/doc/source/_theme/theme.conf deleted file mode 100644 index e039fe01f9..0000000000 --- a/doc/source/_theme/theme.conf +++ /dev/null @@ -1,5 +0,0 @@ -[theme] -inherit = sphinxdoc -stylesheet = sphinxdoc.css -pygments_style = friendly - diff --git a/doc/source/account.rst b/doc/source/account.rst index 2ddb1f7d33..e7f0722d52 100644 --- a/doc/source/account.rst +++ b/doc/source/account.rst @@ -4,22 +4,22 @@ Account ******* -.. _account-server: +.. _account-auditor: -Account Server -============== +Account Auditor +=============== -.. automodule:: swift.account.server +.. automodule:: swift.account.auditor :members: :undoc-members: :show-inheritance: -.. _account-auditor: +.. _account-backend: -Account Auditor +Account Backend =============== -.. automodule:: swift.account.auditor +.. automodule:: swift.account.backend :members: :undoc-members: :show-inheritance: @@ -34,3 +34,12 @@ Account Reaper :undoc-members: :show-inheritance: +.. _account-server: + +Account Server +============== + +.. automodule:: swift.account.server + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/admin/figures/objectstorage-accountscontainers.png b/doc/source/admin/figures/objectstorage-accountscontainers.png new file mode 100644 index 0000000000..4df7326a80 Binary files /dev/null and b/doc/source/admin/figures/objectstorage-accountscontainers.png differ diff --git a/doc/source/admin/figures/objectstorage-arch.png b/doc/source/admin/figures/objectstorage-arch.png new file mode 100644 index 0000000000..3b7978b673 Binary files /dev/null and b/doc/source/admin/figures/objectstorage-arch.png differ diff --git a/doc/source/admin/figures/objectstorage-buildingblocks.png b/doc/source/admin/figures/objectstorage-buildingblocks.png new file mode 100644 index 0000000000..8499ca1ead Binary files /dev/null and b/doc/source/admin/figures/objectstorage-buildingblocks.png differ diff --git a/doc/source/admin/figures/objectstorage-nodes.png b/doc/source/admin/figures/objectstorage-nodes.png new file mode 100644 index 0000000000..e7a0396f5f Binary files /dev/null and b/doc/source/admin/figures/objectstorage-nodes.png differ diff --git a/doc/source/admin/figures/objectstorage-partitions.png b/doc/source/admin/figures/objectstorage-partitions.png new file mode 100644 index 0000000000..7e319ca0b7 Binary files /dev/null and b/doc/source/admin/figures/objectstorage-partitions.png differ diff --git a/doc/source/admin/figures/objectstorage-replication.png b/doc/source/admin/figures/objectstorage-replication.png new file mode 100644 index 0000000000..8ce1309131 Binary files /dev/null and b/doc/source/admin/figures/objectstorage-replication.png differ diff --git a/doc/source/admin/figures/objectstorage-ring.png b/doc/source/admin/figures/objectstorage-ring.png new file mode 100644 index 0000000000..22ef31201a Binary files /dev/null and b/doc/source/admin/figures/objectstorage-ring.png differ diff --git a/doc/source/admin/figures/objectstorage-usecase.png b/doc/source/admin/figures/objectstorage-usecase.png new file mode 100644 index 0000000000..5d7c8f421e Binary files /dev/null and b/doc/source/admin/figures/objectstorage-usecase.png differ diff --git a/doc/source/admin/figures/objectstorage-zones.png b/doc/source/admin/figures/objectstorage-zones.png new file mode 100644 index 0000000000..ee5ffbf72c Binary files /dev/null and b/doc/source/admin/figures/objectstorage-zones.png differ diff --git 
a/doc/source/admin/figures/objectstorage.png b/doc/source/admin/figures/objectstorage.png new file mode 100644 index 0000000000..9454065c28 Binary files /dev/null and b/doc/source/admin/figures/objectstorage.png differ diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst new file mode 100644 index 0000000000..1145a82570 --- /dev/null +++ b/doc/source/admin/index.rst @@ -0,0 +1,21 @@ +=================================== +OpenStack Swift Administrator Guide +=================================== + +.. toctree:: + :maxdepth: 2 + + objectstorage-intro.rst + objectstorage-features.rst + objectstorage-characteristics.rst + objectstorage-components.rst + objectstorage-ringbuilder.rst + objectstorage-arch.rst + objectstorage-replication.rst + objectstorage-large-objects.rst + objectstorage-auditors.rst + objectstorage-EC.rst + objectstorage-account-reaper.rst + objectstorage-tenant-specific-image-storage.rst + objectstorage-monitoring.rst + objectstorage-troubleshoot.rst diff --git a/doc/source/admin/objectstorage-EC.rst b/doc/source/admin/objectstorage-EC.rst new file mode 100644 index 0000000000..2e324079d0 --- /dev/null +++ b/doc/source/admin/objectstorage-EC.rst @@ -0,0 +1,19 @@ +============== +Erasure coding +============== + +Erasure coding is a set of algorithms that allows the reconstruction of +missing data from a set of original data. In theory, erasure coding uses +less capacity with similar durability characteristics as replicas. +From an application perspective, erasure coding support is transparent. +Object Storage (swift) implements erasure coding as a Storage Policy. +See :doc:`/overview_policies` for more details. + +There is no external API related to erasure coding. Create a container using a +Storage Policy; the interaction with the cluster is the same as any +other durability policy. Because support implements as a Storage Policy, +you can isolate all storage devices that associate with your cluster's +erasure coding capability. It is entirely possible to share devices between +storage policies, but for erasure coding it may make more sense to use +not only separate devices but possibly even entire nodes dedicated for erasure +coding. diff --git a/doc/source/admin/objectstorage-account-reaper.rst b/doc/source/admin/objectstorage-account-reaper.rst new file mode 100644 index 0000000000..0acdc20578 --- /dev/null +++ b/doc/source/admin/objectstorage-account-reaper.rst @@ -0,0 +1,51 @@ +============== +Account reaper +============== + +The purpose of the account reaper is to remove data from the deleted accounts. + +A reseller marks an account for deletion by issuing a ``DELETE`` request +on the account's storage URL. This action sets the ``status`` column of +the account_stat table in the account database and replicas to +``DELETED``, marking the account's data for deletion. + +Typically, a specific retention time or undelete are not provided. +However, you can set a ``delay_reaping`` value in the +``[account-reaper]`` section of the ``account-server.conf`` file to +delay the actual deletion of data. At this time, to undelete you have to update +the account database replicas directly, set the status column to an +empty string and update the put_timestamp to be greater than the +delete_timestamp. + +.. note:: + + It is on the development to-do list to write a utility that performs + this task, preferably through a REST call. + +The account reaper runs on each account server and scans the server +occasionally for account databases marked for deletion. 
It only fires up +on the accounts for which the server is the primary node, so that +multiple account servers aren't trying to do it simultaneously. Using +multiple servers to delete one account might improve the deletion speed +but requires coordination to avoid duplication. Speed really is not a +big concern with data deletion, and large accounts aren't deleted often. + +Deleting an account is simple. For each account container, all objects +are deleted and then the container is deleted. Deletion requests that +fail will not stop the overall process but will cause the overall +process to fail eventually (for example, if an object delete times out, +you will not be able to delete the container or the account). The +account reaper keeps trying to delete an account until it is empty, at +which point the database reclaim process within the db\_replicator will +remove the database files. + +A persistent error state may prevent the deletion of an object or +container. If this happens, you will see a message in the log, for example: + +.. code-block:: console + + Account has not been reaped since + +You can control when this is logged with the ``reap_warn_after`` value in the +``[account-reaper]`` section of the ``account-server.conf`` file. +The default value is 30 days. diff --git a/doc/source/admin/objectstorage-arch.rst b/doc/source/admin/objectstorage-arch.rst new file mode 100644 index 0000000000..69b7fd96a7 --- /dev/null +++ b/doc/source/admin/objectstorage-arch.rst @@ -0,0 +1,89 @@ +==================== +Cluster architecture +==================== + +Access tier +~~~~~~~~~~~ +Large-scale deployments segment off an access tier, which is considered +the Object Storage system's central hub. The access tier fields the +incoming API requests from clients and moves data in and out of the +system. This tier consists of front-end load balancers, ssl-terminators, +and authentication services. It runs the (distributed) brain of the +Object Storage system: the proxy server processes. + +.. note:: + + If you want to use OpenStack Identity API v3 for authentication, you + have the following options available in ``/etc/swift/dispersion.conf``: + ``auth_version``, ``user_domain_name``, ``project_domain_name``, + and ``project_name``. + +**Object Storage architecture** + + +.. figure:: figures/objectstorage-arch.png + + +Because access servers are collocated in their own tier, you can scale +out read/write access regardless of the storage capacity. For example, +if a cluster is on the public Internet, requires SSL termination, and +has a high demand for data access, you can provision many access +servers. However, if the cluster is on a private network and used +primarily for archival purposes, you need fewer access servers. + +Since this is an HTTP addressable storage service, you may incorporate a +load balancer into the access tier. + +Typically, the tier consists of a collection of 1U servers. These +machines use a moderate amount of RAM and are network I/O intensive. +Since these systems field each incoming API request, you should +provision them with two high-throughput (10GbE) interfaces - one for the +incoming front-end requests and the other for the back-end access to +the object storage nodes to put and fetch data. + +Factors to consider +------------------- + +For most publicly facing deployments as well as private deployments +available across a wide-reaching corporate network, you use SSL to +encrypt traffic to the client. 
SSL adds significant processing load to +establish sessions between clients, which is why you have to provision +more capacity in the access layer. SSL may not be required for private +deployments on trusted networks. + +Storage nodes +~~~~~~~~~~~~~ + +In most configurations, each of the five zones should have an equal +amount of storage capacity. Storage nodes use a reasonable amount of +memory and CPU. Metadata needs to be readily available to return objects +quickly. The object stores run services not only to field incoming +requests from the access tier, but to also run replicators, auditors, +and reapers. You can provision storage nodes with single gigabit or +10 gigabit network interface depending on the expected workload and +desired performance, although it may be desirable to isolate replication +traffic with a second interface. + +**Object Storage (swift)** + + +.. figure:: figures/objectstorage-nodes.png + + + +Currently, a 2 TB or 3 TB SATA disk delivers good performance for the +price. You can use desktop-grade drives if you have responsive remote +hands in the datacenter and enterprise-grade drives if you don't. + +Factors to consider +------------------- + +You should keep in mind the desired I/O performance for single-threaded +requests. This system does not use RAID, so a single disk handles each +request for an object. Disk performance impacts single-threaded response +rates. + +To achieve apparent higher throughput, the object storage system is +designed to handle concurrent uploads/downloads. The network I/O +capacity (1GbE, bonded 1GbE pair, or 10GbE) should match your desired +concurrent throughput needs for reads and writes. diff --git a/doc/source/admin/objectstorage-auditors.rst b/doc/source/admin/objectstorage-auditors.rst new file mode 100644 index 0000000000..1a3a5783cf --- /dev/null +++ b/doc/source/admin/objectstorage-auditors.rst @@ -0,0 +1,30 @@ +============== +Object Auditor +============== + +On system failures, the XFS file system can sometimes truncate files it is +trying to write and produce zero-byte files. The object-auditor will catch +these problems but in the case of a system crash it is advisable to run +an extra, less rate limited sweep, to check for these specific files. +You can run this command as follows: + +.. code-block:: console + + $ swift-object-auditor /path/to/object-server/config/file.conf once -z 1000 + +.. note:: + + "-z" means to only check for zero-byte files at 1000 files per second. + +It is useful to run the object auditor on a specific device or set of devices. +You can run the object-auditor once as follows: + +.. code-block:: console + + $ swift-object-auditor /path/to/object-server/config/file.conf once \ + --devices=sda,sdb + +.. note:: + + This will run the object auditor on only the ``sda`` and ``sdb`` devices. + This parameter accepts a comma-separated list of values. diff --git a/doc/source/admin/objectstorage-characteristics.rst b/doc/source/admin/objectstorage-characteristics.rst new file mode 100644 index 0000000000..3846bccb25 --- /dev/null +++ b/doc/source/admin/objectstorage-characteristics.rst @@ -0,0 +1,47 @@ +============================== +Object Storage characteristics +============================== + +The key characteristics of Object Storage are that: + +- All objects stored in Object Storage have a URL. + +- "Storage Policies" may be used to define different levels of durability + for objects stored in the cluster. These policies support not only + complete replicas but also erasure-coded fragments. 
+ +- All replicas or fragments for an object are stored in as-unique-as-possible + zones to increase durability and availability. + +- All objects have their own metadata. + +- Developers interact with the object storage system through a RESTful + HTTP API. + +- Object data can be located anywhere in the cluster. + +- The cluster scales by adding additional nodes without sacrificing + performance, which allows a more cost-effective linear storage + expansion than fork-lift upgrades. + +- Data does not have to be migrated to an entirely new storage system. + +- New nodes can be added to the cluster without downtime. + +- Failed nodes and disks can be swapped out without downtime. + +- It runs on industry-standard hardware, such as Dell, HP, and + Supermicro. + +.. _objectstorage-figure: + +Object Storage (swift) + +.. figure:: figures/objectstorage.png + +Developers can either write directly to the Swift API or use one of the +many client libraries that exist for all of the popular programming +languages, such as Java, Python, Ruby, and C#. Amazon S3 and RackSpace +Cloud Files users should be very familiar with Object Storage. Users new +to object storage systems will have to adjust to a different approach +and mindset than those required for a traditional filesystem. diff --git a/doc/source/admin/objectstorage-components.rst b/doc/source/admin/objectstorage-components.rst new file mode 100644 index 0000000000..d784e07599 --- /dev/null +++ b/doc/source/admin/objectstorage-components.rst @@ -0,0 +1,255 @@ +========== +Components +========== + +Object Storage uses the following components to deliver high +availability, high durability, and high concurrency: + +- **Proxy servers** - Handle all of the incoming API requests. + +- **Rings** - Map logical names of data to locations on particular + disks. + +- **Zones** - Isolate data from other zones. A failure in one zone + does not impact the rest of the cluster as data replicates + across zones. + +- **Accounts and containers** - Each account and container are + individual databases that are distributed across the cluster. An + account database contains the list of containers in that account. A + container database contains the list of objects in that container. + +- **Objects** - The data itself. + +- **Partitions** - A partition stores objects, account databases, and + container databases and helps manage locations where data lives in + the cluster. + + +.. _objectstorage-building-blocks-figure: + +**Object Storage building blocks** + +.. figure:: figures/objectstorage-buildingblocks.png + + +Proxy servers +------------- + +Proxy servers are the public face of Object Storage and handle all of +the incoming API requests. Once a proxy server receives a request, it +determines the storage node based on the object's URL, for example: +``https://swift.example.com/v1/account/container/object``. Proxy servers +also coordinate responses, handle failures, and coordinate timestamps. + +Proxy servers use a shared-nothing architecture and can be scaled as +needed based on projected workloads. A minimum of two proxy servers +should be deployed behind a separately-managed load balancer. If one +proxy server fails, the others take over. + +Rings +----- + +A ring represents a mapping between the names of entities stored in the +cluster and their physical locations on disks. There are separate rings +for accounts, containers, and objects. 
When components of the system need +to perform an operation on an object, container, or account, they need to +interact with the corresponding ring to determine the appropriate location +in the cluster. + +The ring maintains this mapping using zones, devices, partitions, and +replicas. Each partition in the ring is replicated, by default, three +times across the cluster, and partition locations are stored in the +mapping maintained by the ring. The ring is also responsible for +determining which devices are used as handoffs in failure scenarios. + +Data can be isolated into zones in the ring. Each partition replica +will try to reside in a different zone. A zone could represent a +drive, a server, a cabinet, a switch, or even a data center. + +The partitions of the ring are distributed among all of the devices +in the Object Storage installation. When partitions need to be moved +around (for example, if a device is added to the cluster), the ring +ensures that a minimum number of partitions are moved at a time, and +only one replica of a partition is moved at a time. + +You can use weights to balance the distribution of partitions on drives +across the cluster. This can be useful, for example, when differently +sized drives are used in a cluster. + +The ring is used by the proxy server and several background processes +(like replication). + + +.. _objectstorage-ring-figure: + +**The ring** + +.. figure:: figures/objectstorage-ring.png + +These rings are externally managed. The server processes themselves +do not modify the rings, they are instead given new rings modified by +other tools. + +The ring uses a configurable number of bits from an ``MD5`` hash for a path +as a partition index that designates a device. The number of bits kept +from the hash is known as the partition power, and 2 to the partition +power indicates the partition count. Partitioning the full ``MD5`` hash ring +allows other parts of the cluster to work in batches of items at once +which ends up either more efficient or at least less complex than +working with each item separately or the entire cluster all at once. + +Another configurable value is the replica count, which indicates how +many of the partition-device assignments make up a single ring. For a +given partition index, each replica's device will not be in the same +zone as any other replica's device. Zones can be used to group devices +based on physical locations, power separations, network separations, or +any other attribute that would improve the availability of multiple +replicas at the same time. + +Zones +----- + +Object Storage allows configuring zones in order to isolate failure +boundaries. If possible, each data replica resides in a separate zone. +At the smallest level, a zone could be a single drive or a grouping of a +few drives. If there were five object storage servers, then each server +would represent its own zone. Larger deployments would have an entire +rack (or multiple racks) of object servers, each representing a zone. +The goal of zones is to allow the cluster to tolerate significant +outages of storage servers without losing all replicas of the data. + + +.. _objectstorage-zones-figure: + +**Zones** + +.. figure:: figures/objectstorage-zones.png + + +Accounts and containers +----------------------- + +Each account and container is an individual SQLite database that is +distributed across the cluster. An account database contains the list of +containers in that account. 
A container database contains the list of +objects in that container. + + +.. _objectstorage-accountscontainers-figure: + +**Accounts and containers** + +.. figure:: figures/objectstorage-accountscontainers.png + + +To keep track of object data locations, each account in the system has a +database that references all of its containers, and each container +database references each object. + +Partitions +---------- + +A partition is a collection of stored data. This includes account databases, +container databases, and objects. Partitions are core to the replication +system. + +Think of a partition as a bin moving throughout a fulfillment center +warehouse. Individual orders get thrown into the bin. The system treats +that bin as a cohesive entity as it moves throughout the system. A bin +is easier to deal with than many little things. It makes for fewer +moving parts throughout the system. + +System replicators and object uploads/downloads operate on partitions. +As the system scales up, its behavior continues to be predictable +because the number of partitions is a fixed number. + +Implementing a partition is conceptually simple: a partition is just a +directory sitting on a disk with a corresponding hash table of what it +contains. + + +.. _objectstorage-partitions-figure: + +**Partitions** + +.. figure:: figures/objectstorage-partitions.png + + +Replicators +----------- + +In order to ensure that there are three copies of the data everywhere, +replicators continuously examine each partition. For each local +partition, the replicator compares it against the replicated copies in +the other zones to see if there are any differences. + +The replicator knows if replication needs to take place by examining +hashes. A hash file is created for each partition, which contains hashes +of each directory in the partition. For a given partition, the hash files +for each of the partition's copies are compared. If the hashes are +different, then it is time to replicate, and the directory that needs to +be replicated is copied over. + +This is where partitions come in handy. With fewer things in the system, +larger chunks of data are transferred around (rather than lots of little +TCP connections, which is inefficient) and there is a consistent number +of hashes to compare. + +The cluster has an eventually-consistent behavior where old data may be +served from partitions that missed updates, but replication will cause +all partitions to converge toward the newest data. + + +.. _objectstorage-replication-figure: + +**Replication** + +.. figure:: figures/objectstorage-replication.png + + +If a zone goes down, one of the nodes containing a replica notices and +proactively copies data to a handoff location. + +Use cases +--------- + +The following sections show use cases for object uploads and downloads +and introduce the components. + + +Upload +~~~~~~ + +A client uses the REST API to make a HTTP request to PUT an object into +an existing container. The cluster receives the request. First, the +system must figure out where the data is going to go. To do this, the +account name, container name, and object name are all used to determine +the partition where this object should live. + +Then a lookup in the ring figures out which storage nodes contain the +partitions in question. + +The data is then sent to each storage node where it is placed in the +appropriate partition. At least two of the three writes must be +successful before the client is notified that the upload was successful. 
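The partition lookup described above, where the account, container, and object names determine the partition, can be sketched in a few lines of Python. This is a simplified illustration of the hashing scheme from the Rings section, not the actual ``swift.common.ring`` code; the hash prefix and suffix are the placeholder values from the sample ``swift.conf`` in this change, and the partition power of 10 is an assumed value.

.. code-block:: python

    import hashlib
    import struct

    # Placeholder salts from the sample swift.conf; real clusters keep
    # these secret and never change them.
    HASH_PATH_PREFIX = b'changeme'
    HASH_PATH_SUFFIX = b'changeme'

    def get_partition(account, container, obj, part_power=10):
        """Map an object path to a partition index.

        MD5 the salted path and keep the top ``part_power`` bits, giving
        2 ** part_power partitions in total.
        """
        path = '/%s/%s/%s' % (account, container, obj)
        digest = hashlib.md5(
            HASH_PATH_PREFIX + path.encode('utf-8') + HASH_PATH_SUFFIX
        ).digest()
        # First four bytes as a big-endian integer, shifted so only the
        # most significant part_power bits remain.
        return struct.unpack('>I', digest[:4])[0] >> (32 - part_power)

    print(get_partition('AUTH_test', 'photos', 'cat.jpg'))

The ring then maps that partition index to the set of devices (and therefore storage nodes) holding its replicas.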
+ +Next, the container database is updated asynchronously to reflect that +there is a new object in it. + + +.. _objectstorage-usecase-figure: + +**Object Storage in use** + +.. figure:: figures/objectstorage-usecase.png + + +Download +~~~~~~~~ + +A request comes in for an account/container/object. Using the same +consistent hashing, the partition index is determined. A lookup in the +ring reveals which storage nodes contain that partition. A request is +made to one of the storage nodes to fetch the object and, if that fails, +requests are made to the other nodes. diff --git a/doc/source/admin/objectstorage-features.rst b/doc/source/admin/objectstorage-features.rst new file mode 100644 index 0000000000..b0cb0ed0af --- /dev/null +++ b/doc/source/admin/objectstorage-features.rst @@ -0,0 +1,52 @@ +===================== +Features and benefits +===================== + +.. list-table:: + :header-rows: 1 + :widths: 10 40 + + * - Features + - Benefits + * - Leverages commodity hardware + - No lock-in, lower price/GB. + * - HDD/node failure agnostic + - Self-healing, reliable, data redundancy protects from failures. + * - Unlimited storage + - Large and flat namespace, highly scalable read/write access, + able to serve content directly from storage system. + * - Multi-dimensional scalability + - Scale-out architecture: Scale vertically and + horizontally-distributed storage. Backs up and archives large + amounts of data with linear performance. + * - Account/container/object structure + - No nesting, not a traditional file system: Optimized for scale, + it scales to multiple petabytes and billions of objects. + * - Built-in replication 3✕ + data redundancy (compared with 2✕ on + RAID) + - A configurable number of accounts, containers and object copies + for high availability. + * - Easily add capacity (unlike RAID resize) + - Elastic data scaling with ease. + * - No central database + - Higher performance, no bottlenecks. + * - RAID not required + - Handle many small, random reads and writes efficiently. + * - Built-in management utilities + - Account management: Create, add, verify, and delete users; + Container management: Upload, download, and verify; Monitoring: + Capacity, host, network, log trawling, and cluster health. + * - Drive auditing + - Detect drive failures preempting data corruption. + * - Expiring objects + - Users can set an expiration time or a TTL on an object to + control access. + * - Direct object access + - Enable direct browser access to content, such as for a control + panel. + * - Realtime visibility into client requests + - Know what users are requesting. + * - Supports S3 API + - Utilize tools that were designed for the popular S3 API. + * - Restrict containers per account + - Limit access to control usage by user. diff --git a/doc/source/admin/objectstorage-intro.rst b/doc/source/admin/objectstorage-intro.rst new file mode 100644 index 0000000000..c5061e8a14 --- /dev/null +++ b/doc/source/admin/objectstorage-intro.rst @@ -0,0 +1,23 @@ +============================== +Introduction to Object Storage +============================== + +OpenStack Object Storage (swift) is used for redundant, scalable data +storage using clusters of standardized servers to store petabytes of +accessible data. It is a long-term storage system for large amounts of +static data which can be retrieved and updated. Object Storage uses a +distributed architecture +with no central point of control, providing greater scalability, +redundancy, and permanence. 
Objects are written to multiple hardware +devices, with the OpenStack software responsible for ensuring data +replication and integrity across the cluster. Storage clusters scale +horizontally by adding new nodes. Should a node fail, OpenStack works to +replicate its content from other active nodes. Because OpenStack uses +software logic to ensure data replication and distribution across +different devices, inexpensive commodity hard drives and servers can be +used in lieu of more expensive equipment. + +Object Storage is ideal for cost effective, scale-out storage. It +provides a fully distributed, API-accessible storage platform that can +be integrated directly into applications or used for backup, archiving, +and data retention. diff --git a/doc/source/admin/objectstorage-large-objects.rst b/doc/source/admin/objectstorage-large-objects.rst new file mode 100644 index 0000000000..e4b0490314 --- /dev/null +++ b/doc/source/admin/objectstorage-large-objects.rst @@ -0,0 +1,32 @@ +==================== +Large object support +==================== + +Object Storage (swift) uses segmentation to support the upload of large +objects. By default, Object Storage limits the download size of a single +object to 5GB. Using segmentation, uploading a single object is virtually +unlimited. The segmentation process works by fragmenting the object, +and automatically creating a file that sends the segments together as +a single object. This option offers greater upload speed with the possibility +of parallel uploads. + +Large objects +~~~~~~~~~~~~~ +The large object is comprised of two types of objects: + +- **Segment objects** store the object content. You can divide your + content into segments, and upload each segment into its own segment + object. Segment objects do not have any special features. You create, + update, download, and delete segment objects just as you would normal + objects. + +- A **manifest object** links the segment objects into one logical + large object. When you download a manifest object, Object Storage + concatenates and returns the contents of the segment objects in the + response body of the request. The manifest object types are: + + - **Static large objects** + - **Dynamic large objects** + +To find out more information on large object support, +see :doc:`/overview_large_objects` in the developer documentation. diff --git a/doc/source/admin/objectstorage-monitoring.rst b/doc/source/admin/objectstorage-monitoring.rst new file mode 100644 index 0000000000..86e81711b8 --- /dev/null +++ b/doc/source/admin/objectstorage-monitoring.rst @@ -0,0 +1,216 @@ +========================= +Object Storage monitoring +========================= + +.. note:: + + This section was excerpted from a `blog post by Darrell + Bishop `_ and + has since been edited. + +An OpenStack Object Storage cluster is a collection of many daemons that +work together across many nodes. With so many different components, you +must be able to tell what is going on inside the cluster. Tracking +server-level meters like CPU utilization, load, memory consumption, disk +usage and utilization, and so on is necessary, but not sufficient. + +Swift Recon +~~~~~~~~~~~ + +The Swift Recon middleware (see :ref:`cluster_telemetry_and_monitoring`) +provides general machine statistics, such as load average, socket +statistics, ``/proc/meminfo`` contents, as well as Swift-specific meters: + +- The ``MD5`` sum of each ring file. + +- The most recent object replication time. 
+ +- Count of each type of quarantined file: Account, container, or + object. + +- Count of "async_pendings" (deferred container updates) on disk. + +Swift Recon is middleware that is installed in the object servers +pipeline and takes one required option: A local cache directory. To +track ``async_pendings``, you must set up an additional cron job for +each object server. You access data by either sending HTTP requests +directly to the object server or using the ``swift-recon`` command-line +client. + +There are Object Storage cluster statistics but the typical +server meters overlap with existing server monitoring systems. To get +the Swift-specific meters into a monitoring system, they must be polled. +Swift Recon acts as a middleware meters collector. The +process that feeds meters to your statistics system, such as +``collectd`` and ``gmond``, should already run on the storage node. +You can choose to either talk to Swift Recon or collect the meters +directly. + +Swift-Informant +~~~~~~~~~~~~~~~ + +Swift-Informant middleware (see +`swift-informant `_) has +real-time visibility into Object Storage client requests. It sits in the +pipeline for the proxy server, and after each request to the proxy server it +sends three meters to a ``StatsD`` server: + +- A counter increment for a meter like ``obj.GET.200`` or + ``cont.PUT.404``. + +- Timing data for a meter like ``acct.GET.200`` or ``obj.GET.200``. + [The README says the meters look like ``duration.acct.GET.200``, but + I do not see the ``duration`` in the code. I am not sure what the + Etsy server does but our StatsD server turns timing meters into five + derivative meters with new segments appended, so it probably works as + coded. The first meter turns into ``acct.GET.200.lower``, + ``acct.GET.200.upper``, ``acct.GET.200.mean``, + ``acct.GET.200.upper_90``, and ``acct.GET.200.count``]. + +- A counter increase by the bytes transferred for a meter like + ``tfer.obj.PUT.201``. + +This is used for receiving information on the quality of service clients +experience with the timing meters, as well as sensing the volume of the +various modifications of a request server type, command, and response +code. Swift-Informant requires no change to core Object +Storage code because it is implemented as middleware. However, it gives +no insight into the workings of the cluster past the proxy server. +If the responsiveness of one storage node degrades, you can only see +that some of the requests are bad, either as high latency or error +status codes. + +Statsdlog +~~~~~~~~~ + +The `Statsdlog `_ +project increments StatsD counters based on logged events. Like +Swift-Informant, it is also non-intrusive, however statsdlog can track +events from all Object Storage daemons, not just proxy-server. The +daemon listens to a UDP stream of syslog messages, and StatsD counters +are incremented when a log line matches a regular expression. Meter +names are mapped to regex match patterns in a JSON file, allowing +flexible configuration of what meters are extracted from the log stream. + +Currently, only the first matching regex triggers a StatsD counter +increment, and the counter is always incremented by one. There is no way +to increment a counter by more than one or send timing data to StatsD +based on the log line content. The tool could be extended to handle more +meters for each line and data extraction, including timing data. 
But a +coupling would still exist between the log textual format and the log +parsing regexes, which would themselves be more complex to support +multiple matches for each line and data extraction. Also, log processing +introduces a delay between the triggering event and sending the data to +StatsD. It would be preferable to increment error counters where they +occur and send timing data as soon as it is known to avoid coupling +between a log string and a parsing regex and prevent a time delay +between events and sending data to StatsD. + +The next section describes another method for gathering Object Storage +operational meters. + +Swift StatsD logging +~~~~~~~~~~~~~~~~~~~~ + +StatsD (see `Measure Anything, Measure Everything +`_) +was designed for application code to be deeply instrumented. Meters are +sent in real-time by the code that just noticed or did something. The +overhead of sending a meter is extremely low: a ``sendto`` of one UDP +packet. If that overhead is still too high, the StatsD client library +can send only a random portion of samples and StatsD approximates the +actual number when flushing meters upstream. + +To avoid the problems inherent with middleware-based monitoring and +after-the-fact log processing, the sending of StatsD meters is +integrated into Object Storage itself. Details of the meters tracked +are in the :doc:`/admin_guide`. + +The sending of meters is integrated with the logging framework. To +enable, configure ``log_statsd_host`` in the relevant config file. You +can also specify the port and a default sample rate. The specified +default sample rate is used unless a specific call to a statsd logging +method (see the list below) overrides it. Currently, no logging calls +override the sample rate, but it is conceivable that some meters may +require accuracy (``sample_rate=1``) while others may not. + +.. code-block:: ini + + [DEFAULT] + # ... + log_statsd_host = 127.0.0.1 + log_statsd_port = 8125 + log_statsd_default_sample_rate = 1 + +Then the LogAdapter object returned by ``get_logger()``, usually stored +in ``self.logger``, has these new methods: + +- ``update_stats(self, metric, amount, sample_rate=1)`` Increments + the supplied meter by the given amount. This is used when you need + to add or subtract more that one from a counter, like incrementing + ``suffix.hashes`` by the number of computed hashes in the object + replicator. + +- ``increment(self, metric, sample_rate=1)`` Increments the given counter + meter by one. + +- ``decrement(self, metric, sample_rate=1)`` Lowers the given counter + meter by one. + +- ``timing(self, metric, timing_ms, sample_rate=1)`` Record that the + given meter took the supplied number of milliseconds. + +- ``timing_since(self, metric, orig_time, sample_rate=1)`` + Convenience method to record a timing meter whose value is "now" + minus an existing timestamp. + +.. note:: + + These logging methods may safely be called anywhere you have a + logger object. If StatsD logging has not been configured, the methods + are no-ops. This avoids messy conditional logic each place a meter is + recorded. These example usages show the new logging methods: + + .. code-block:: python + + # swift/obj/replicator.py + def update(self, job): + # ... + begin = time.time() + try: + hashed, local_hash = tpool.execute(tpooled_get_hashes, job['path'], + do_listdir=(self.replication_count % 10) == 0, + reclaim_age=self.reclaim_age) + # See tpooled_get_hashes "Hack". 
+ if isinstance(hashed, BaseException): + raise hashed + self.suffix_hash += hashed + self.logger.update_stats('suffix.hashes', hashed) + # ... + finally: + self.partition_times.append(time.time() - begin) + self.logger.timing_since('partition.update.timing', begin) + + .. code-block:: python + + # swift/container/updater.py + def process_container(self, dbfile): + # ... + start_time = time.time() + # ... + for event in events: + if 200 <= event.wait() < 300: + successes += 1 + else: + failures += 1 + if successes > failures: + self.logger.increment('successes') + # ... + else: + self.logger.increment('failures') + # ... + # Only track timing data for attempted updates: + self.logger.timing_since('timing', start_time) + else: + self.logger.increment('no_changes') + self.no_changes += 1 diff --git a/doc/source/admin/objectstorage-replication.rst b/doc/source/admin/objectstorage-replication.rst new file mode 100644 index 0000000000..32cd33ad60 --- /dev/null +++ b/doc/source/admin/objectstorage-replication.rst @@ -0,0 +1,98 @@ +=========== +Replication +=========== + +Because each replica in Object Storage functions independently and +clients generally require only a simple majority of nodes to respond to +consider an operation successful, transient failures like network +partitions can quickly cause replicas to diverge. These differences are +eventually reconciled by asynchronous, peer-to-peer replicator +processes. The replicator processes traverse their local file systems +and concurrently perform operations in a manner that balances load +across physical disks. + +Replication uses a push model, with records and files generally only +being copied from local to remote replicas. This is important because +data on the node might not belong there (as in the case of hand offs and +ring changes), and a replicator cannot know which data it should pull in +from elsewhere in the cluster. Any node that contains data must ensure +that data gets to where it belongs. The ring handles replica placement. + +To replicate deletions in addition to creations, every deleted record or +file in the system is marked by a tombstone. The replication process +cleans up tombstones after a time period known as the ``consistency +window``. This window defines the duration of the replication and how +long transient failure can remove a node from the cluster. Tombstone +cleanup must be tied to replication to reach replica convergence. + +If a replicator detects that a remote drive has failed, the replicator +uses the ``get_more_nodes`` interface for the ring to choose an +alternate node with which to synchronize. The replicator can maintain +desired levels of replication during disk failures, though some replicas +might not be in an immediately usable location. + +.. note:: + + The replicator does not maintain desired levels of replication when + failures such as entire node failures occur; most failures are + transient. + +The main replication types are: + +- Database replication + Replicates containers and objects. + +- Object replication + Replicates object data. + +Database replication +~~~~~~~~~~~~~~~~~~~~ + +Database replication completes a low-cost hash comparison to determine +whether two replicas already match. Normally, this check can quickly +verify that most databases in the system are already synchronized. If +the hashes differ, the replicator synchronizes the databases by sharing +records added since the last synchronization point. 
+ +This synchronization point is a high water mark that notes the last +record at which two databases were known to be synchronized, and is +stored in each database as a tuple of the remote database ID and record +ID. Database IDs are unique across all replicas of the database, and +record IDs are monotonically increasing integers. After all new records +are pushed to the remote database, the entire synchronization table of +the local database is pushed, so the remote database can guarantee that +it is synchronized with everything with which the local database was +previously synchronized. + +If a replica is missing, the whole local database file is transmitted to +the peer by using rsync(1) and is assigned a new unique ID. + +In practice, database replication can process hundreds of databases per +concurrency setting per second (up to the number of available CPUs or +disks) and is bound by the number of database transactions that must be +performed. + +Object replication +~~~~~~~~~~~~~~~~~~ + +The initial implementation of object replication performed an rsync to +push data from a local partition to all remote servers where it was +expected to reside. While this worked at small scale, replication times +skyrocketed once directory structures could no longer be held in RAM. +This scheme was modified to save a hash of the contents for each suffix +directory to a per-partition hashes file. The hash for a suffix +directory is no longer valid when the contents of that suffix directory +is modified. + +The object replication process reads in hash files and calculates any +invalidated hashes. Then, it transmits the hashes to each remote server +that should hold the partition, and only suffix directories with +differing hashes on the remote server are rsynced. After pushing files +to the remote server, the replication process notifies it to recalculate +hashes for the rsynced suffix directories. + +The number of uncached directories that object replication must +traverse, usually as a result of invalidated suffix directory hashes, +impedes performance. To provide acceptable replication speeds, object +replication is designed to invalidate around 2 percent of the hash space +on a normal node each day. diff --git a/doc/source/admin/objectstorage-ringbuilder.rst b/doc/source/admin/objectstorage-ringbuilder.rst new file mode 100644 index 0000000000..ddd6f6063c --- /dev/null +++ b/doc/source/admin/objectstorage-ringbuilder.rst @@ -0,0 +1,228 @@ +============ +Ring-builder +============ + +Use the swift-ring-builder utility to build and manage rings. This +utility assigns partitions to devices and writes an optimized Python +structure to a gzipped, serialized file on disk for transmission to the +servers. The server processes occasionally check the modification time +of the file and reload in-memory copies of the ring structure as needed. +If you use a slightly older version of the ring, one of the three +replicas for a partition subset will be incorrect because of the way the +ring-builder manages changes to the ring. You can work around this +issue. + +The ring-builder also keeps its own builder file with the ring +information and additional data required to build future rings. It is +very important to keep multiple backup copies of these builder files. +One option is to copy the builder files out to every server while +copying the ring files themselves. Another is to upload the builder +files into the cluster itself. If you lose the builder file, you have to +create a new ring from scratch. 
Nearly all partitions would be assigned +to different devices and, therefore, nearly all of the stored data would +have to be replicated to new locations. So, recovery from a builder file +loss is possible, but data would be unreachable for an extended time. + +Ring data structure +~~~~~~~~~~~~~~~~~~~ + +The ring data structure consists of three top level fields: a list of +devices in the cluster, a list of lists of device ids indicating +partition to device assignments, and an integer indicating the number of +bits to shift an MD5 hash to calculate the partition for the hash. + +Partition assignment list +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a list of ``array('H')`` of devices ids. The outermost list +contains an ``array('H')`` for each replica. Each ``array('H')`` has a +length equal to the partition count for the ring. Each integer in the +``array('H')`` is an index into the above list of devices. The partition +list is known internally to the Ring class as ``_replica2part2dev_id``. + +So, to create a list of device dictionaries assigned to a partition, the +Python code would look like: + +.. code-block:: python + + devices = [self.devs[part2dev_id[partition]] for + part2dev_id in self._replica2part2dev_id] + +That code is a little simplistic because it does not account for the +removal of duplicate devices. If a ring has more replicas than devices, +a partition will have more than one replica on a device. + +``array('H')`` is used for memory conservation as there may be millions +of partitions. + +Overload +~~~~~~~~ + +The ring builder tries to keep replicas as far apart as possible while +still respecting device weights. When it can not do both, the overload +factor determines what happens. Each device takes an extra +fraction of its desired partitions to allow for replica dispersion; +after that extra fraction is exhausted, replicas are placed closer +together than optimal. + +The overload factor lets the operator trade off replica +dispersion (durability) against data dispersion (uniform disk usage). + +The default overload factor is 0, so device weights are strictly +followed. + +With an overload factor of 0.1, each device accepts 10% more +partitions than it otherwise would, but only if it needs to maintain +partition dispersion. + +For example, consider a 3-node cluster of machines with equal-size disks; +node A has 12 disks, node B has 12 disks, and node C has +11 disks. The ring has an overload factor of 0.1 (10%). + +Without the overload, some partitions would end up with replicas only +on nodes A and B. However, with the overload, every device can accept +up to 10% more partitions for the sake of dispersion. The +missing disk in C means there is one disk's worth of partitions +to spread across the remaining 11 disks, which gives each +disk in C an extra 9.09% load. Since this is less than the 10% +overload, there is one replica of each partition on each node. + +However, this does mean that the disks in node C have more data +than the disks in nodes A and B. If 80% full is the warning +threshold for the cluster, node C's disks reach 80% full while A +and B's disks are only 72.7% full. + + +Replica counts +~~~~~~~~~~~~~~ + +To support the gradual change in replica counts, a ring can have a real +number of replicas and is not restricted to an integer number of +replicas. + +A fractional replica count is for the whole ring and not for individual +partitions. It indicates the average number of replicas for each +partition. 
For example, a replica count of 3.2 means that 20 percent of +partitions have four replicas and 80 percent have three replicas. + +The replica count is adjustable. For example: + +.. code-block:: console + + $ swift-ring-builder account.builder set_replicas 4 + $ swift-ring-builder account.builder rebalance + +You must rebalance the replica ring in globally distributed clusters. +Operators of these clusters generally want an equal number of replicas +and regions. Therefore, when an operator adds or removes a region, the +operator adds or removes a replica. Removing unneeded replicas saves on +the cost of disks. + +You can gradually increase the replica count at a rate that does not +adversely affect cluster performance. For example: + +.. code-block:: console + + $ swift-ring-builder object.builder set_replicas 3.01 + $ swift-ring-builder object.builder rebalance + ... + + $ swift-ring-builder object.builder set_replicas 3.02 + $ swift-ring-builder object.builder rebalance + ... + +Changes take effect after the ring is rebalanced. Therefore, if you +intend to change from 3 replicas to 3.01 but you accidentally type +2.01, no data is lost. + +Additionally, the :command:`swift-ring-builder X.builder create` command can +now take a decimal argument for the number of replicas. + +Partition shift value +~~~~~~~~~~~~~~~~~~~~~ + +The partition shift value is known internally to the Ring class as +``_part_shift``. This value is used to shift an MD5 hash to calculate +the partition where the data for that hash should reside. Only the top +four bytes of the hash is used in this process. For example, to compute +the partition for the ``/account/container/object`` path using Python: + +.. code-block:: python + + partition = unpack_from('>I', + md5('/account/container/object').digest())[0] >> + self._part_shift + +For a ring generated with part\_power P, the partition shift value is +``32 - P``. + +Build the ring +~~~~~~~~~~~~~~ + +The ring builder process includes these high-level steps: + +#. The utility calculates the number of partitions to assign to each + device based on the weight of the device. For example, for a + partition at the power of 20, the ring has 1,048,576 partitions. One + thousand devices of equal weight each want 1,048.576 partitions. The + devices are sorted by the number of partitions they desire and kept + in order throughout the initialization process. + + .. note:: + + Each device is also assigned a random tiebreaker value that is + used when two devices desire the same number of partitions. This + tiebreaker is not stored on disk anywhere, and so two different + rings created with the same parameters will have different + partition assignments. For repeatable partition assignments, + ``RingBuilder.rebalance()`` takes an optional seed value that + seeds the Python pseudo-random number generator. + +#. The ring builder assigns each partition replica to the device that + requires most partitions at that point while keeping it as far away + as possible from other replicas. The ring builder prefers to assign a + replica to a device in a region that does not already have a replica. + If no such region is available, the ring builder searches for a + device in a different zone, or on a different server. If it does not + find one, it looks for a device with no replicas. Finally, if all + options are exhausted, the ring builder assigns the replica to the + device that has the fewest replicas already assigned. + + .. 
note:: + + The ring builder assigns multiple replicas to one device only if + the ring has fewer devices than it has replicas. + +#. When building a new ring from an old ring, the ring builder + recalculates the desired number of partitions that each device wants. + +#. The ring builder unassigns partitions and gathers these partitions + for reassignment, as follows: + + - The ring builder unassigns any assigned partitions from any + removed devices and adds these partitions to the gathered list. + - The ring builder unassigns any partition replicas that can be + spread out for better durability and adds these partitions to the + gathered list. + - The ring builder unassigns random partitions from any devices that + have more partitions than they need and adds these partitions to + the gathered list. + +#. The ring builder reassigns the gathered partitions to devices by + using a similar method to the one described previously. + +#. When the ring builder reassigns a replica to a partition, the ring + builder records the time of the reassignment. The ring builder uses + this value when it gathers partitions for reassignment so that no + partition is moved twice in a configurable amount of time. The + RingBuilder class knows this configurable amount of time as + ``min_part_hours``. The ring builder ignores this restriction for + replicas of partitions on removed devices because removal of a device + happens on device failure only, and reassignment is the only choice. + +These steps do not always perfectly rebalance a ring due to the random +nature of gathering partitions for reassignment. To help reach a more +balanced ring, the rebalance process is repeated until near perfect +(less than 1 percent off) or when the balance does not improve by at +least 1 percent (indicating we probably cannot get perfect balance due +to wildly imbalanced zones or too many partitions recently moved). diff --git a/doc/source/admin/objectstorage-tenant-specific-image-storage.rst b/doc/source/admin/objectstorage-tenant-specific-image-storage.rst new file mode 100644 index 0000000000..69855d8ef1 --- /dev/null +++ b/doc/source/admin/objectstorage-tenant-specific-image-storage.rst @@ -0,0 +1,32 @@ +============================================================== +Configure project-specific image locations with Object Storage +============================================================== + +For some deployers, it is not ideal to store all images in one place to +enable all projects and users to access them. You can configure the Image +service to store image data in project-specific image locations. Then, +only the following projects can use the Image service to access the +created image: + +- The project who owns the image +- Projects that are defined in ``swift_store_admin_tenants`` and that + have admin-level accounts + +**To configure project-specific image locations** + +#. Configure swift as your ``default_store`` in the + ``glance-api.conf`` file. + +#. Set these configuration options in the ``glance-api.conf`` file: + + - swift_store_multi_tenant + Set to ``True`` to enable tenant-specific storage locations. + Default is ``False``. + + - swift_store_admin_tenants + Specify a list of tenant IDs that can grant read and write access to all + Object Storage containers that are created by the Image service. + +With this configuration, images are stored in an Object Storage service +(swift) endpoint that is pulled from the service catalog for the +authenticated user. 
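The corresponding ``glance-api.conf`` settings look roughly like the
sketch below. The section name and the project ID are illustrative only;
older Image service releases read these options from ``[DEFAULT]``
rather than ``[glance_store]``:

.. code-block:: ini

    [glance_store]
    default_store = swift
    swift_store_multi_tenant = True
    # Placeholder project ID; list the projects that get admin-level
    # access to the image containers.
    swift_store_admin_tenants = 3a531a7e0c3a43e78fcba6cf297bccd2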
diff --git a/doc/source/admin/objectstorage-troubleshoot.rst b/doc/source/admin/objectstorage-troubleshoot.rst
new file mode 100644
index 0000000000..29adaba07a
--- /dev/null
+++ b/doc/source/admin/objectstorage-troubleshoot.rst
@@ -0,0 +1,208 @@
+===========================
+Troubleshoot Object Storage
+===========================
+
+For Object Storage, everything is logged in ``/var/log/syslog`` (or
+``messages`` on some distros). Several settings enable further
+customization of logging, such as ``log_name``, ``log_facility``, and
+``log_level``, within the object server configuration files.
+
+Drive failure
+~~~~~~~~~~~~~
+
+Problem
+-------
+
+A failed drive can prevent Object Storage from performing replication.
+
+Solution
+--------
+
+In the event that a drive has failed, the first step is to make sure the
+drive is unmounted. This will make it easier for Object Storage to work
+around the failure until it has been resolved. If the drive is going to
+be replaced immediately, then it is just best to replace the drive,
+format it, remount it, and let replication fill it up.
+
+If you cannot replace the drive immediately, then it is best to leave it
+unmounted, and remove the drive from the ring. This will allow all the
+replicas that were on that drive to be replicated elsewhere until the
+drive is replaced. Once the drive is replaced, it can be re-added to the
+ring.
+
+You can look at error messages in the ``/var/log/kern.log`` file for
+hints of drive failure.
+
+Server failure
+~~~~~~~~~~~~~~
+
+Problem
+-------
+
+The server may be offline, may have failed, or may require a
+reboot.
+
+Solution
+--------
+
+If a server is having hardware issues, it is a good idea to make sure
+the Object Storage services are not running. This will allow Object
+Storage to work around the failure while you troubleshoot.
+
+If the server just needs a reboot, or a small amount of work that should
+only last a couple of hours, then it is probably best to let Object
+Storage work around the failure and get the machine fixed and back
+online. When the machine comes back online, replication will make sure
+that anything that was missed during the downtime gets updated.
+
+If the server has more serious issues, then it is probably best to
+remove all of the server's devices from the ring. Once the server has
+been repaired and is back online, the server's devices can be added back
+into the ring. It is important that the devices are reformatted before
+putting them back into the ring because they are likely to be responsible
+for a different set of partitions than before.
+
+Detect failed drives
+~~~~~~~~~~~~~~~~~~~~
+
+Problem
+-------
+
+When a drive fails, it can be difficult to detect the failure and to
+determine its details.
+
+Solution
+--------
+
+It has been our experience that when a drive is about to fail, error
+messages appear in the ``/var/log/kern.log`` file. There is a script called
+``swift-drive-audit`` that can be run via cron to watch for bad drives. If
+errors are detected, it will unmount the bad drive, so that Object
+Storage can work around it. The script takes a configuration file with
+the following settings:
+
+..
list-table:: **Description of configuration options for [drive-audit] in drive-audit.conf** + :header-rows: 1 + + * - Configuration option = Default value + - Description + * - ``device_dir = /srv/node`` + - Directory devices are mounted under + * - ``error_limit = 1`` + - Number of errors to find before a device is unmounted + * - ``log_address = /dev/log`` + - Location where syslog sends the logs to + * - ``log_facility = LOG_LOCAL0`` + - Syslog log facility + * - ``log_file_pattern = /var/log/kern.*[!.][!g][!z]`` + - Location of the log file with globbing pattern to check against device + errors locate device blocks with errors in the log file + * - ``log_level = INFO`` + - Logging level + * - ``log_max_line_length = 0`` + - Caps the length of log lines to the value given; no limit if set to 0, + the default. + * - ``log_to_console = False`` + - No help text available for this option. + * - ``minutes = 60`` + - Number of minutes to look back in ``/var/log/kern.log`` + * - ``recon_cache_path = /var/cache/swift`` + - Directory where stats for a few items will be stored + * - ``regex_pattern_1 = \berror\b.*\b(dm-[0-9]{1,2}\d?)\b`` + - No help text available for this option. + * - ``unmount_failed_device = True`` + - No help text available for this option. + +.. warning:: + + This script has only been tested on Ubuntu 10.04; use with caution on + other operating systems in production. + +Emergency recovery of ring builder files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Problem +------- + +An emergency might prevent a successful backup from restoring the +cluster to operational status. + +Solution +-------- + +You should always keep a backup of swift ring builder files. However, if +an emergency occurs, this procedure may assist in returning your cluster +to an operational state. + +Using existing swift tools, there is no way to recover a builder file +from a ``ring.gz`` file. However, if you have a knowledge of Python, it +is possible to construct a builder file that is pretty close to the one +you have lost. + +.. warning:: + + This procedure is a last-resort for emergency circumstances. It + requires knowledge of the swift python code and may not succeed. + +#. Load the ring and a new ringbuilder object in a Python REPL: + + .. code-block:: python + + >>> from swift.common.ring import RingData, RingBuilder + >>> ring = RingData.load('/path/to/account.ring.gz') + +#. Start copying the data we have in the ring into the builder: + + .. code-block:: python + + >>> import math + >>> partitions = len(ring._replica2part2dev_id[0]) + >>> replicas = len(ring._replica2part2dev_id) + + >>> builder = RingBuilder(int(math.log(partitions, 2)), replicas, 1) + >>> builder.devs = ring.devs + >>> builder._replica2part2dev = ring._replica2part2dev_id + >>> builder._last_part_moves_epoch = 0 + >>> from array import array + >>> builder._last_part_moves = array('B', (0 for _ in range(partitions))) + >>> builder._set_parts_wanted() + >>> for d in builder._iter_devs(): + d['parts'] = 0 + >>> for p2d in builder._replica2part2dev: + for dev_id in p2d: + builder.devs[dev_id]['parts'] += 1 + + This is the extent of the recoverable fields. + +#. For ``min_part_hours`` you either have to remember what the value you + used was, or just make up a new one: + + .. code-block:: python + + >>> builder.change_min_part_hours(24) # or whatever you want it to be + +#. Validate the builder. If this raises an exception, check your + previous code: + + .. code-block:: python + + >>> builder.validate() + +#. 
After it validates, save the builder and create a new ``account.builder``: + + .. code-block:: python + + >>> import pickle + >>> pickle.dump(builder.to_dict(), open('account.builder', 'wb'), protocol=2) + >>> exit () + +#. You should now have a file called ``account.builder`` in the current + working directory. Run + :command:`swift-ring-builder account.builder write_ring` and compare the new + ``account.ring.gz`` to the ``account.ring.gz`` that you started + from. They probably are not byte-for-byte identical, but if you load them + in a REPL and their ``_replica2part2dev_id`` and ``devs`` attributes are + the same (or nearly so), then you are in good shape. + +#. Repeat the procedure for ``container.ring.gz`` and + ``object.ring.gz``, and you might get usable builder files. diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index b1d00fa364..8754fd34d7 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -2,6 +2,33 @@ Administrator's Guide ===================== +------------------------- +Defining Storage Policies +------------------------- + +Defining your Storage Policies is very easy to do with Swift. It is important +that the administrator understand the concepts behind Storage Policies +before actually creating and using them in order to get the most benefit out +of the feature and, more importantly, to avoid having to make unnecessary changes +once a set of policies have been deployed to a cluster. + +It is highly recommended that the reader fully read and comprehend +:doc:`overview_policies` before proceeding with administration of +policies. Plan carefully and it is suggested that experimentation be +done first on a non-production cluster to be certain that the desired +configuration meets the needs of the users. See :ref:`upgrade-policy` +before planning the upgrade of your existing deployment. + +Following is a high level view of the very few steps it takes to configure +policies once you have decided what you want to do: + +#. Define your policies in ``/etc/swift/swift.conf`` +#. Create the corresponding object rings +#. Communicate the names of the Storage Policies to cluster users + +For a specific example that takes you through these steps, please see +:doc:`policies_saio` + ------------------ Managing the Rings ------------------ @@ -22,25 +49,27 @@ ring building server **last** after all Swift nodes have been successfully upgraded, or refrain from generating rings until all Swift nodes have been successfully upgraded. -If you need to downgrade from a version of swift greater than 1.6.0 to +If you need to downgrade from a version of Swift greater than 1.6.0 to a version less than or equal to 1.6.0, first downgrade your ring-building server, generate new rings, push them out, then continue with the rest of the downgrade. For more information see :doc:`overview_ring`. +.. highlight:: none + Removing a device from the ring:: swift-ring-builder remove / - + Removing a server from the ring:: swift-ring-builder remove - + Adding devices to the ring: See :ref:`ring-preparing` - + See what devices for a server are in the ring:: swift-ring-builder search @@ -49,10 +78,79 @@ Once you are done with all changes to the ring, the changes need to be "committed":: swift-ring-builder rebalance - + Once the new rings are built, they should be pushed out to all the servers in the cluster. 
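How the ring files are pushed out is deployment specific; any mechanism
that copies the ``.ring.gz`` files to ``/etc/swift`` on every node will
do. A minimal illustration using ``scp`` (the hostnames are
placeholders)::

    for node in proxy01 storage01 storage02 storage03; do
        scp /etc/swift/*.ring.gz ${node}:/etc/swift/
    done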
+Optionally, if invoked as 'swift-ring-builder-safe' the directory containing +the specified builder file will be locked (via a .lock file in the parent +directory). This provides a basic safe guard against multiple instances +of the swift-ring-builder (or other utilities that observe this lock) from +attempting to write to or read the builder/ring files while operations are in +progress. This can be useful in environments where ring management has been +automated but the operator still needs to interact with the rings manually. + +If the ring builder is not producing the balances that you are +expecting, you can gain visibility into what it's doing with the +``--debug`` flag.:: + + swift-ring-builder rebalance --debug + +This produces a great deal of output that is mostly useful if you are +either (a) attempting to fix the ring builder, or (b) filing a bug +against the ring builder. + +You may notice in the rebalance output a 'dispersion' number. What this +number means is explained in :ref:`ring_dispersion` but in essence +is the percentage of partitions in the ring that have too many replicas +within a particular failure domain. You can ask 'swift-ring-builder' what +the dispersion is with:: + + swift-ring-builder dispersion + +This will give you the percentage again, if you want a detailed view of +the dispersion simply add a ``--verbose``:: + + swift-ring-builder dispersion --verbose + +This will not only display the percentage but will also display a dispersion +table that lists partition dispersion by tier. You can use this table to figure +out were you need to add capacity or to help tune an :ref:`ring_overload` value. + +Now let's take an example with 1 region, 3 zones and 4 devices. Each device has +the same weight, and the ``dispersion --verbose`` might show the following:: + + Dispersion is 16.666667, Balance is 0.000000, Overload is 0.00% + Required overload is 33.333333% + Worst tier is 33.333333 (r1z3) + -------------------------------------------------------------------------- + Tier Parts % Max 0 1 2 3 + -------------------------------------------------------------------------- + r1 768 0.00 3 0 0 0 256 + r1z1 192 0.00 1 64 192 0 0 + r1z1-127.0.0.1 192 0.00 1 64 192 0 0 + r1z1-127.0.0.1/sda 192 0.00 1 64 192 0 0 + r1z2 192 0.00 1 64 192 0 0 + r1z2-127.0.0.2 192 0.00 1 64 192 0 0 + r1z2-127.0.0.2/sda 192 0.00 1 64 192 0 0 + r1z3 384 33.33 1 0 128 128 0 + r1z3-127.0.0.3 384 33.33 1 0 128 128 0 + r1z3-127.0.0.3/sda 192 0.00 1 64 192 0 0 + r1z3-127.0.0.3/sdb 192 0.00 1 64 192 0 0 + +The first line reports that there are 256 partitions with 3 copies in region 1; +and this is an expected output in this case (single region with 3 replicas) as +reported by the "Max" value. + +However, there is some imbalance in the cluster, more precisely in zone 3. The +"Max" reports a maximum of 1 copy in this zone; however 50.00% of the partitions +are storing 2 replicas in this zone (which is somewhat expected, because there +are more disks in this zone). + +You can now either add more capacity to the other zones, decrease the total +weight in zone 3 or set the overload to a value `greater than` 33.333333% - +only as much overload as needed will be used. + ----------------------- Scripting Ring Creation ----------------------- @@ -65,18 +163,19 @@ You can create scripts to create the account and container rings and rebalance. 
cd /etc/swift rm -f account.builder account.ring.gz backups/account.builder backups/account.ring.gz swift-ring-builder account.builder create 18 3 1 - swift-ring-builder account.builder add z1-:6002/sdb1 1 - swift-ring-builder account.builder add z2-:6002/sdb1 1 + swift-ring-builder account.builder add r1z1-:6202/sdb1 1 + swift-ring-builder account.builder add r1z2-:6202/sdb1 1 swift-ring-builder account.builder rebalance You need to replace the values of , , etc. with the IP addresses of the account servers used in your setup. You can have as many account servers as you need. All account servers are assumed to be listening on port - 6002, and have a storage device called "sdb1" (this is a directory + 6202, and have a storage device called "sdb1" (this is a directory name created under /drives when we setup the account server). The "z1", "z2", etc. designate zones, and you can choose whether you - put devices in the same or different zones. + put devices in the same or different zones. The "r1" designates + the region, with different regions specified as "r1", "r2", etc. 2. Make the script file executable and run it to create the account ring file:: @@ -104,22 +203,37 @@ Handling Drive Failure ---------------------- In the event that a drive has failed, the first step is to make sure the drive -is unmounted. This will make it easier for swift to work around the failure +is unmounted. This will make it easier for Swift to work around the failure until it has been resolved. If the drive is going to be replaced immediately, then it is just best to replace the drive, format it, remount it, and let replication fill it up. +After the drive is unmounted, make sure the mount point is owned by root +(root:root 755). This ensures that rsync will not try to replicate into the +root drive once the failed drive is unmounted. + If the drive can't be replaced immediately, then it is best to leave it -unmounted, and remove the drive from the ring. This will allow all the +unmounted, and set the device weight to 0. This will allow all the replicas that were on that drive to be replicated elsewhere until the drive -is replaced. Once the drive is replaced, it can be re-added to the ring. +is replaced. Once the drive is replaced, the device weight can be increased +again. Setting the device weight to 0 instead of removing the drive from the +ring gives Swift the chance to replicate data from the failing disk too (in case +it is still possible to read some of the data). + +Setting the device weight to 0 (or removing a failed drive from the ring) has +another benefit: all partitions that were stored on the failed drive are +distributed over the remaining disks in the cluster, and each disk only needs to +store a few new partitions. This is much faster compared to replicating all +partitions to a single, new disk. It decreases the time to recover from a +degraded number of replicas significantly, and becomes more and more important +with bigger disks. ----------------------- Handling Server Failure ----------------------- -If a server is having hardware issues, it is a good idea to make sure the -swift services are not running. This will allow Swift to work around the +If a server is having hardware issues, it is a good idea to make sure the +Swift services are not running. This will allow Swift to work around the failure while you troubleshoot. 
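For example, on most installations the daemons on the affected node can
be stopped with ``swift-init`` while the hardware is being examined (a
brief illustration; your init system or service manager may differ)::

    swift-init all stop     # stop every Swift daemon on this node
    swift-init all start    # once the hardware is healthy again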
If the server just needs a reboot, or a small amount of work that should @@ -141,31 +255,156 @@ Detecting Failed Drives It has been our experience that when a drive is about to fail, error messages will spew into `/var/log/kern.log`. There is a script called -`swift-drive-audit` that can be run via cron to watch for bad drives. If +`swift-drive-audit` that can be run via cron to watch for bad drives. If errors are detected, it will unmount the bad drive, so that Swift can work around it. The script takes a configuration file with the following settings: -[drive-audit] +``[drive-audit]`` + +================== ============== =========================================== +Option Default Description +------------------ -------------- ------------------------------------------- +user swift Drop privileges to this user for non-root + tasks +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Log level +device_dir /srv/node Directory devices are mounted under +minutes 60 Number of minutes to look back in + `/var/log/kern.log` +error_limit 1 Number of errors to find before a device + is unmounted +log_file_pattern /var/log/kern* Location of the log file with globbing + pattern to check against device errors +regex_pattern_X (see below) Regular expression patterns to be used to + locate device blocks with errors in the + log file +================== ============== =========================================== + +The default regex pattern used to locate device blocks with errors are +`\berror\b.*\b(sd[a-z]{1,2}\d?)\b` and `\b(sd[a-z]{1,2}\d?)\b.*\berror\b`. +One is able to overwrite the default above by providing new expressions +using the format `regex_pattern_X = regex_expression`, where `X` is a number. + +This script has been tested on Ubuntu 10.04 and Ubuntu 12.04, so if you are +using a different distro or OS, some care should be taken before using in production. + +------------------------------ +Preventing Disk Full Scenarios +------------------------------ + +.. highlight:: cfg + +Prevent disk full scenarios by ensuring that the ``proxy-server`` blocks PUT +requests and rsync prevents replication to the specific drives. + +You can prevent `proxy-server` PUT requests to low space disks by +ensuring ``fallocate_reserve`` is set in ``account-server.conf``, +``container-server.conf``, and ``object-server.conf``. By default, +``fallocate_reserve`` is set to 1%. In the object server, this blocks +PUT requests that would leave the free disk space below 1% of the +disk. In the account and container servers, this blocks operations +that will increase account or container database size once the free +disk space falls below 1%. + +Setting ``fallocate_reserve`` is highly recommended to avoid filling +disks to 100%. When Swift's disks are completely full, all requests +involving those disks will fail, including DELETE requests that would +otherwise free up space. This is because object deletion includes the +creation of a zero-byte tombstone (.ts) to record the time of the +deletion for replication purposes; this happens prior to deletion of +the object's data. On a completely-full filesystem, that zero-byte .ts +file cannot be created, so the DELETE request will fail and the disk +will remain completely full. If ``fallocate_reserve`` is set, then the +filesystem will have enough space to create the zero-byte .ts file, +and thus the deletion of the object will succeed and free up some +space. 
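For example, to keep a little more headroom than the default, raise
``fallocate_reserve`` in the ``[DEFAULT]`` section of the object,
container, and account server configs (the 2% value below is only an
illustration):

.. code:: cfg

    [DEFAULT]
    # Refuse writes once free space on a disk falls below 2%.
    fallocate_reserve = 2%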
+ +In order to prevent rsync replication to specific drives, firstly +setup ``rsync_module`` per disk in your ``object-replicator``. +Set this in ``object-server.conf``: + +.. code:: cfg + + [object-replicator] + rsync_module = {replication_ip}::object_{device} + +Set the individual drives in ``rsync.conf``. For example: + +.. code:: cfg + + [object_sda] + max connections = 4 + lock file = /var/lock/object_sda.lock + + [object_sdb] + max connections = 4 + lock file = /var/lock/object_sdb.lock + +Finally, monitor the disk space of each disk and adjust the rsync +``max connections`` per drive to ``-1``. We recommend utilising your existing +monitoring solution to achieve this. The following is an example script: -================== ========== =========================================== -Option Default Description ------------------- ---------- ------------------------------------------- -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Log level -device_dir /srv/node Directory devices are mounted under -minutes 60 Number of minutes to look back in - `/var/log/kern.log` -error_limit 1 Number of errors to find before a device - is unmounted -================== ========== =========================================== +.. code-block:: python -This script has only been tested on Ubuntu 10.04, so if you are using a -different distro or OS, some care should be taken before using in production. + #!/usr/bin/env python + import os + import errno --------------- -Cluster Health --------------- + RESERVE = 500 * 2 ** 20 # 500 MiB + + DEVICES = '/srv/node1' + + path_template = '/etc/rsync.d/disable_%s.conf' + config_template = ''' + [object_%s] + max connections = -1 + ''' + + def disable_rsync(device): + with open(path_template % device, 'w') as f: + f.write(config_template.lstrip() % device) + + + def enable_rsync(device): + try: + os.unlink(path_template % device) + except OSError as e: + # ignore file does not exist + if e.errno != errno.ENOENT: + raise + + + for device in os.listdir(DEVICES): + path = os.path.join(DEVICES, device) + st = os.statvfs(path) + free = st.f_bavail * st.f_frsize + if free < RESERVE: + disable_rsync(device) + else: + enable_rsync(device) + +For the above script to work, ensure ``/etc/rsync.d/`` conf files are +included, by specifying ``&include`` in your ``rsync.conf`` file: + +.. code:: cfg + + &include /etc/rsync.d + +Use this in conjunction with a cron job to periodically run the script, for example: + +.. highlight:: none + +.. code:: cfg + + # /etc/cron.d/devicecheck + * * * * * root /some/path/to/disable_rsync.py + +.. _dispersion_report: + +----------------- +Dispersion Report +----------------- There is a swift-dispersion-report tool for measuring overall cluster health. This is accomplished by checking if a set of deliberately distributed @@ -191,6 +430,8 @@ object names until they fall on distinct partitions. Last, and repeatedly for the life of the cluster, we need to run the swift-dispersion-report tool to check the health of each of these containers and objects. +.. highlight:: cfg + These tools need direct access to the entire cluster and to the ring files (installing them on a proxy server will probably do). Both swift-dispersion-populate and swift-dispersion-report use the same @@ -200,10 +441,14 @@ configuration file, /etc/swift/dispersion.conf. Example conf file:: auth_url = http://localhost:8080/auth/v1.0 auth_user = test:tester auth_key = testing + endpoint_type = internalURL + +.. 
highlight:: none There are also options for the conf file for specifying the dispersion coverage (defaults to 1%), retries, concurrency, etc. though usually the defaults are -fine. +fine. If you want to use keystone v3 for authentication there are options like +auth_version, user_domain_name, project_domain_name and project_name. Once the configuration is in place, run `swift-dispersion-populate` to populate the containers and objects throughout the cluster. @@ -216,7 +461,7 @@ the cluster. Here is an example of a cluster in perfect health:: Queried 2621 containers for dispersion reporting, 19s, 0 retries 100.00% of container copies found (7863 of 7863) Sample represents 1.00% of the container partition space - + Queried 2619 objects for dispersion reporting, 7s, 0 retries 100.00% of object copies found (7857 of 7857) Sample represents 1.00% of the object partition space @@ -232,7 +477,7 @@ that has:: Queried 2621 containers for dispersion reporting, 8s, 0 retries 100.00% of container copies found (7863 of 7863) Sample represents 1.00% of the container partition space - + Queried 2619 objects for dispersion reporting, 7s, 0 retries There were 1763 partitions missing one copy. 77.56% of object copies found (6094 of 7857) @@ -266,12 +511,157 @@ You can also run the report for only containers or objects:: 100.00% of object copies found (7857 of 7857) Sample represents 1.00% of the object partition space -Alternatively, the dispersion report can also be output in json format. This +Alternatively, the dispersion report can also be output in JSON format. This allows it to be more easily consumed by third party utilities:: $ swift-dispersion-report -j {"object": {"retries:": 0, "missing_two": 0, "copies_found": 7863, "missing_one": 0, "copies_expected": 7863, "pct_found": 100.0, "overlapping": 0, "missing_all": 0}, "container": {"retries:": 0, "missing_two": 0, "copies_found": 12534, "missing_one": 0, "copies_expected": 12534, "pct_found": 100.0, "overlapping": 15, "missing_all": 0}} +Note that you may select which storage policy to use by setting the option +'--policy-name silver' or '-P silver' (silver is the example policy name here). +If no policy is specified, the default will be used per the swift.conf file. +When you specify a policy the containers created also include the policy index, +thus even when running a container_only report, you will need to specify the +policy not using the default. + +----------------------------------------------- +Geographically Distributed Swift Considerations +----------------------------------------------- + +Swift provides two features that may be used to distribute replicas of objects +across multiple geographically distributed data-centers: with +:doc:`overview_global_cluster` object replicas may be dispersed across devices +from different data-centers by using `regions` in ring device descriptors; with +:doc:`overview_container_sync` objects may be copied between independent Swift +clusters in each data-center. The operation and configuration of each are +described in their respective documentation. The following points should be +considered when selecting the feature that is most appropriate for a particular +use case: + +#. Global Clusters allows the distribution of object replicas across + data-centers to be controlled by the cluster operator on per-policy basis, + since the distribution is determined by the assignment of devices from + each data-center in each policy's ring file. 
With Container Sync the end + user controls the distribution of objects across clusters on a + per-container basis. + +#. Global Clusters requires an operator to coordinate ring deployments across + multiple data-centers. Container Sync allows for independent management of + separate Swift clusters in each data-center, and for existing Swift + clusters to be used as peers in Container Sync relationships without + deploying new policies/rings. + +#. Global Clusters seamlessly supports features that may rely on + cross-container operations such as large objects and versioned writes. + Container Sync requires the end user to ensure that all required + containers are sync'd for these features to work in all data-centers. + +#. Global Clusters makes objects available for GET or HEAD requests in both + data-centers even if a replica of the object has not yet been + asynchronously migrated between data-centers, by forwarding requests + between data-centers. Container Sync is unable to serve requests for an + object in a particular data-center until the asynchronous sync process has + copied the object to that data-center. + +#. Global Clusters may require less storage capacity than Container Sync to + achieve equivalent durability of objects in each data-center. Global + Clusters can restore replicas that are lost or corrupted in one + data-center using replicas from other data-centers. Container Sync + requires each data-center to independently manage the durability of + objects, which may result in each data-center storing more replicas than + with Global Clusters. + +#. Global Clusters execute all account/container metadata updates + synchronously to account/container replicas in all data-centers, which may + incur delays when making updates across WANs. Container Sync only copies + objects between data-centers and all Swift internal traffic is + confined to each data-center. + +#. Global Clusters does not yet guarantee the availability of objects stored + in Erasure Coded policies when one data-center is offline. With Container + Sync the availability of objects in each data-center is independent of the + state of other data-centers once objects have been synced. Container Sync + also allows objects to be stored using different policy types in different + data-centers. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Checking handoff partition distribution +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check if handoff partitions are piling up on a server by +comparing the expected number of partitions with the actual number on +your disks. 
First get the number of partitions that are currently +assigned to a server using the ``dispersion`` command from +``swift-ring-builder``:: + + swift-ring-builder sample.builder dispersion --verbose + Dispersion is 0.000000, Balance is 0.000000, Overload is 0.00% + Required overload is 0.000000% + -------------------------------------------------------------------------- + Tier Parts % Max 0 1 2 3 + -------------------------------------------------------------------------- + r1 8192 0.00 2 0 0 8192 0 + r1z1 4096 0.00 1 4096 4096 0 0 + r1z1-172.16.10.1 4096 0.00 1 4096 4096 0 0 + r1z1-172.16.10.1/sda1 4096 0.00 1 4096 4096 0 0 + r1z2 4096 0.00 1 4096 4096 0 0 + r1z2-172.16.10.2 4096 0.00 1 4096 4096 0 0 + r1z2-172.16.10.2/sda1 4096 0.00 1 4096 4096 0 0 + r1z3 4096 0.00 1 4096 4096 0 0 + r1z3-172.16.10.3 4096 0.00 1 4096 4096 0 0 + r1z3-172.16.10.3/sda1 4096 0.00 1 4096 4096 0 0 + r1z4 4096 0.00 1 4096 4096 0 0 + r1z4-172.16.20.4 4096 0.00 1 4096 4096 0 0 + r1z4-172.16.20.4/sda1 4096 0.00 1 4096 4096 0 0 + r2 8192 0.00 2 0 8192 0 0 + r2z1 4096 0.00 1 4096 4096 0 0 + r2z1-172.16.20.1 4096 0.00 1 4096 4096 0 0 + r2z1-172.16.20.1/sda1 4096 0.00 1 4096 4096 0 0 + r2z2 4096 0.00 1 4096 4096 0 0 + r2z2-172.16.20.2 4096 0.00 1 4096 4096 0 0 + r2z2-172.16.20.2/sda1 4096 0.00 1 4096 4096 0 0 + +As you can see from the output, each server should store 4096 partitions, and +each region should store 8192 partitions. This example used a partition power +of 13 and 3 replicas. + +With write_affinity enabled it is expected to have a higher number of +partitions on disk compared to the value reported by the +swift-ring-builder dispersion command. The number of additional (handoff) +partitions in region r1 depends on your cluster size, the amount +of incoming data as well as the replication speed. + +Let's use the example from above with 6 nodes in 2 regions, and write_affinity +configured to write to region r1 first. `swift-ring-builder` reported that +each node should store 4096 partitions:: + + Expected partitions for region r2: 8192 + Handoffs stored across 4 nodes in region r1: 8192 / 4 = 2048 + Maximum number of partitions on each server in region r1: 2048 + 4096 = 6144 + +Worst case is that handoff partitions in region 1 are populated with new +object replicas faster than replication is able to move them to region 2. +In that case you will see ~ 6144 partitions per +server in region r1. Your actual number should be lower and +between 4096 and 6144 partitions (preferably on the lower side). + +Now count the number of object partitions on a given server in region 1, +for example on 172.16.10.1. Note that the pathnames might be +different; `/srv/node/` is the default mount location, and `objects` +applies only to storage policy 0 (storage policy 1 would use +`objects-1` and so on):: + + find -L /srv/node/ -maxdepth 3 -type d -wholename "*objects/*" | wc -l + +If this number is always on the upper end of the expected partition +number range (4096 to 6144) or increasing you should check your +replication speed and maybe even disable write_affinity. +Please refer to the next section how to collect metrics from Swift, and +especially :ref:`swift-recon -r ` how to check replication +stats. + + +.. _cluster_telemetry_and_monitoring: -------------------------------- Cluster Telemetry and Monitoring @@ -282,6 +672,8 @@ object servers using the recon server middleware and the swift-recon cli. To do so update your account, container, or object servers pipelines to include recon and add the associated filter config. 
+.. highlight:: cfg + object-server.conf sample:: [pipeline:main] @@ -309,9 +701,11 @@ account-server.conf sample:: use = egg:swift#recon recon_cache_path = /var/cache/swift +.. highlight:: none + The recon_cache_path simply sets the directory where stats for a few items will be stored. Depending on the method of deployment you may need to create this -directory manually and ensure that swift has read/write access. +directory manually and ensure that Swift has read/write access. Finally, if you also wish to track asynchronous pending on your object servers you will need to setup a cronjob to run the swift-recon-cron script @@ -319,10 +713,11 @@ periodically on your object servers:: */5 * * * * swift /usr/bin/swift-recon-cron /etc/swift/object-server.conf -Once the recon middleware is enabled a GET request for "/recon/" to -the server will return a json formatted response:: +Once the recon middleware is enabled, a GET request for +"/recon/" to the backend object server will return a +JSON-formatted response:: - fhines@ubuntu:~$ curl -i http://localhost:6030/recon/async + fhines@ubuntu:~$ curl -i http://localhost:6230/recon/async HTTP/1.1 200 OK Content-Type: application/json Content-Length: 20 @@ -330,6 +725,10 @@ the server will return a json formatted response:: {"async_pending": 0} + +Note that the default port for the object server is 6200, except on a +Swift All-In-One installation, which uses 6210, 6220, 6230, and 6240. + The following metrics and telemetry are currently exposed: ========================= ======================================================================================== @@ -340,23 +739,32 @@ Request URI Description /recon/mounted returns *ALL* currently mounted filesystems /recon/unmounted returns all unmounted drives if mount_check = True /recon/diskusage returns disk utilization for storage devices +/recon/driveaudit returns # of drive audit errors /recon/ringmd5 returns object/container/account ring md5sums +/recon/swiftconfmd5 returns swift.conf md5sum /recon/quarantined returns # of quarantined objects/accounts/containers /recon/sockstat returns consumable info from /proc/net/sockstat|6 /recon/devices returns list of devices and devices dir i.e. /srv/node /recon/async returns count of async pending -/recon/replication returns object replication times (for backward compatability) +/recon/replication returns object replication info (for backward compatibility) /recon/replication/ returns replication info for given type (account, container, object) /recon/auditor/ returns auditor stats on last reported scan for given type (account, container, object) /recon/updater/ returns last updater sweep times for given type (container, object) +/recon/expirer/object returns time elapsed and number of objects deleted during last object expirer sweep +/recon/version returns Swift version +/recon/time returns node time ========================= ======================================================================================== +Note that 'object_replication_last' and 'object_replication_time' in object +replication info are considered to be transitional and will be removed in +the subsequent releases. Use 'replication_last' and 'replication_time' instead. 
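+
+If you want to consume these endpoints directly rather than through the
+command line utility described below, a short script is usually sufficient.
+The following is a minimal sketch (not part of Swift itself); the host and
+port are assumptions for an example deployment, and it assumes the
+``diskusage`` payload is a list of per-device dictionaries as returned by
+the recon middleware:
+
+.. code-block:: python
+
+    import json
+    import urllib.request
+
+    # Assumed backend object server address; adjust to your deployment.
+    RECON_URL = 'http://localhost:6200/recon/diskusage'
+
+    with urllib.request.urlopen(RECON_URL) as resp:
+        devices = json.load(resp)
+
+    for dev in devices:
+        if not dev.get('mounted'):
+            print('%s is not mounted' % dev['device'])
+            continue
+        # 'size' and 'used' are reported in bytes.
+        pct_used = 100.0 * dev['used'] / dev['size']
+        print('%s: %.1f%% used' % (dev['device'], pct_used))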
+ This information can also be queried via the swift-recon command line utility:: fhines@ubuntu:~$ swift-recon -h - Usage: + Usage: usage: swift-recon [-v] [--suppress] [-a] [-r] [-u] [-d] - [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + [-R] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] account|container|object Defaults to object server. @@ -370,6 +778,7 @@ This information can also be queried via the swift-recon command line utility:: --suppress Suppress most connection related errors -a, --async Get async stats -r, --replication Get replication stats + -R, --reconstruction Get reconstruction stats --auditor Get auditor stats --updater Get updater stats --expirer Get expirer stats @@ -379,12 +788,17 @@ This information can also be queried via the swift-recon command line utility:: -q, --quarantined Get cluster quarantine stats --md5 Get md5sum of servers ring and compare to local copy --sockstat Get cluster socket usage stats - --all Perform all checks. Equal to -arudlq --md5 --sockstat + -T, --time Check time synchronization + --all Perform all checks. Equal to + -arudlqT --md5 --sockstat --auditor --updater + --expirer --driveaudit --validate-servers -z ZONE, --zone=ZONE Only query servers in specified zone -t SECONDS, --timeout=SECONDS Time to wait for a response from a server --swiftdir=SWIFTDIR Default = /etc/swift +.. _recon-replication: + For example, to obtain container replication info from all hosts in zone "3":: fhines@ubuntu:~$ swift-recon container -r --zone 3 @@ -401,20 +815,31 @@ For example, to obtain container replication info from all hosts in zone "3":: Reporting Metrics to StatsD --------------------------- +.. highlight:: cfg + +.. note:: + The legacy statsd metrics described in this section are being supplemented + with :doc:`metrics/labels`. + If you have a StatsD_ server running, Swift may be configured to send it real-time operational metrics. To enable this, set the following configuration entries (see the sample configuration files):: log_statsd_host = localhost log_statsd_port = 8125 - log_statsd_default_sample_rate = 1 + log_statsd_default_sample_rate = 1.0 + log_statsd_sample_rate_factor = 1.0 log_statsd_metric_prefix = [empty-string] If `log_statsd_host` is not set, this feature is disabled. The default values -for the other settings are given above. - -.. _StatsD: http://codeascraft.etsy.com/2011/02/15/measure-anything-measure-everything/ -.. _Graphite: http://graphite.wikidot.com/ +for the other settings are given above. The `log_statsd_host` can be a +hostname, an IPv4 address, or an IPv6 address (not surrounded with brackets, as +this is unnecessary since the port is specified separately). If a hostname +resolves to an IPv4 address, an IPv4 socket will be used to send StatsD UDP +packets, even if the hostname would also resolve to an IPv6 address. + +.. _StatsD: https://codeascraft.com/2011/02/15/measure-anything-measure-everything/ +.. _Graphite: http://graphiteapp.org/ .. _Ganglia: http://ganglia.sourceforge.net/ The sample rate is a real number between 0 and 1 which defines the @@ -422,9 +847,24 @@ probability of sending a sample for any given event or timing measurement. This sample rate is sent with each sample to StatsD and used to multiply the value. For example, with a sample rate of 0.5, StatsD will multiply that counter's value by 2 when flushing the metric to an upstream -monitoring system (Graphite_, Ganglia_, etc.). 
To get the best data, start -with the default `log_statsd_default_sample_rate` value of 1 and only lower -it as needed. +monitoring system (Graphite_, Ganglia_, etc.). + +Some relatively high-frequency metrics have a default sample rate less than +one. If you want to override the default sample rate for all metrics whose +default sample rate is not specified in the Swift source, you may set +`log_statsd_default_sample_rate` to a value less than one. This is NOT +recommended (see next paragraph). A better way to reduce StatsD load is to +adjust `log_statsd_sample_rate_factor` to a value less than one. The +`log_statsd_sample_rate_factor` is multiplied to any sample rate (either the +global default or one specified by the actual metric logging call in the Swift +source) prior to handling. In other words, this one tunable can lower the +frequency of all StatsD logging by a proportional amount. + +To get the best data, start with the default `log_statsd_default_sample_rate` +and `log_statsd_sample_rate_factor` values of 1 and only lower +`log_statsd_sample_rate_factor` if needed. The +`log_statsd_default_sample_rate` should not be used and remains for backward +compatibility only. The metric prefix will be prepended to every metric sent to the StatsD server For example, with:: @@ -437,411 +877,38 @@ servers when sending statistics to a central StatsD server. If you run a local StatsD server per node, you could configure a per-node metrics prefix there and leave `log_statsd_metric_prefix` blank. -Note that metrics reported to StatsD are counters or timing data (which -StatsD usually expands out to min, max, avg, count, and 90th percentile -per timing metric). Some important "gauge" metrics will still need to -be collected using another method. For example, the -`object-server.async_pendings` StatsD metric counts the generation of -async_pendings in real-time, but will not tell you the current number -of async_pending container updates on disk at any point in time. +Note that metrics reported to StatsD are counters or timing data (which are +sent in units of milliseconds). StatsD usually expands timing data out to min, +max, avg, count, and 90th percentile per timing metric, but the details of +this behavior will depend on the configuration of your StatsD server. Some +important "gauge" metrics may still need to be collected using another method. +For example, the `object-server.async_pendings` StatsD metric counts the generation +of async_pendings in real-time, but will not tell you the current number of +async_pending container updates on disk at any point in time. Note also that the set of metrics collected, their names, and their semantics -are not locked down and will change over time. StatsD logging is currently in -a "beta" stage and will continue to evolve. - -Metrics for `account-auditor`: - -========================== ========================================================= -Metric Name Description --------------------------- --------------------------------------------------------- -`account-auditor.errors` Count of audit runs (across all account databases) which - caught an Exception. -`account-auditor.passes` Count of individual account databases which passed audit. -`account-auditor.failures` Count of individual account databases which failed audit. -`account-auditor.timing` Timing data for individual account database audits. 
-========================== ========================================================= - -Metrics for `account-reaper`: - -============================================== ==================================================== -Metric Name Description ----------------------------------------------- ---------------------------------------------------- -`account-reaper.errors` Count of devices failing the mount check. -`account-reaper.timing` Timing data for each reap_account() call. -`account-reaper.return_codes.X` Count of HTTP return codes from various operations - (eg. object listing, container deletion, etc.). The - value for X is the first digit of the return code - (2 for 201, 4 for 404, etc.). -`account-reaper.containers_failures` Count of failures to delete a container. -`account-reaper.containers_deleted` Count of containers successfully deleted. -`account-reaper.containers_remaining` Count of containers which failed to delete with - zero successes. -`account-reaper.containers_possibly_remaining` Count of containers which failed to delete with - at least one success. -`account-reaper.objects_failures` Count of failures to delete an object. -`account-reaper.objects_deleted` Count of objects successfully deleted. -`account-reaper.objects_remaining` Count of objects which failed to delete with zero - successes. -`account-reaper.objects_possibly_remaining` Count of objects which failed to delete with at - least one success. -============================================== ==================================================== - -Metrics for `account-server` ("Not Found" is not considered an error and requests -which increment `errors` are not included in the timing data): - -======================================== ======================================================= -Metric Name Description ----------------------------------------- ------------------------------------------------------- -`account-server.DELETE.errors.timing` Timing data for each DELETE request resulting in an - error: bad request, not mounted, missing timestamp. -`account-server.DELETE.timing` Timing data for each DELETE request not resulting in - an error. -`account-server.PUT.errors.timing` Timing data for each PUT request resulting in an error: - bad request, not mounted, conflict, recently-deleted. -`account-server.PUT.timing` Timing data for each PUT request not resulting in an - error. -`account-server.HEAD.errors.timing` Timing data for each HEAD request resulting in an - error: bad request, not mounted. -`account-server.HEAD.timing` Timing data for each HEAD request not resulting in - an error. -`account-server.GET.errors.timing` Timing data for each GET request resulting in an - error: bad request, not mounted, bad delimiter, - account listing limit too high, bad accept header. -`account-server.GET.timing` Timing data for each GET request not resulting in - an error. -`account-server.REPLICATE.errors.timing` Timing data for each REPLICATE request resulting in an - error: bad request, not mounted. -`account-server.REPLICATE.timing` Timing data for each REPLICATE request not resulting - in an error. -`account-server.POST.errors.timing` Timing data for each POST request resulting in an - error: bad request, bad or missing timestamp, not - mounted. -`account-server.POST.timing` Timing data for each POST request not resulting in - an error. 
-======================================== ======================================================= - -Metrics for `account-replicator`: - -===================================== ==================================================== -Metric Name Description -------------------------------------- ---------------------------------------------------- -`account-replicator.diffs` Count of syncs handled by sending differing rows. -`account-replicator.diff_caps` Count of "diffs" operations which failed because - "max_diffs" was hit. -`account-replicator.no_changes` Count of accounts found to be in sync. -`account-replicator.hashmatches` Count of accounts found to be in sync via hash - comparison (`broker.merge_syncs` was called). -`account-replicator.rsyncs` Count of completely missing accounts which were sent - via rsync. -`account-replicator.remote_merges` Count of syncs handled by sending entire database - via rsync. -`account-replicator.attempts` Count of database replication attempts. -`account-replicator.failures` Count of database replication attempts which failed - due to corruption (quarantined) or inability to read - as well as attempts to individual nodes which - failed. -`account-replicator.removes.` Count of databases on deleted because the - delete_timestamp was greater than the put_timestamp - and the database had no rows or because it was - successfully sync'ed to other locations and doesn't - belong here anymore. -`account-replicator.successes` Count of replication attempts to an individual node - which were successful. -`account-replicator.timing` Timing data for each database replication attempt - not resulting in a failure. -===================================== ==================================================== - -Metrics for `container-auditor`: - -============================ ==================================================== -Metric Name Description ----------------------------- ---------------------------------------------------- -`container-auditor.errors` Incremented when an Exception is caught in an audit - pass (only once per pass, max). -`container-auditor.passes` Count of individual containers passing an audit. -`container-auditor.failures` Count of individual containers failing an audit. -`container-auditor.timing` Timing data for each container audit. -============================ ==================================================== - -Metrics for `container-replicator`: - -======================================= ==================================================== -Metric Name Description ---------------------------------------- ---------------------------------------------------- -`container-replicator.diffs` Count of syncs handled by sending differing rows. -`container-replicator.diff_caps` Count of "diffs" operations which failed because - "max_diffs" was hit. -`container-replicator.no_changes` Count of containers found to be in sync. -`container-replicator.hashmatches` Count of containers found to be in sync via hash - comparison (`broker.merge_syncs` was called). -`container-replicator.rsyncs` Count of completely missing containers where were sent - via rsync. -`container-replicator.remote_merges` Count of syncs handled by sending entire database - via rsync. -`container-replicator.attempts` Count of database replication attempts. -`container-replicator.failures` Count of database replication attempts which failed - due to corruption (quarantined) or inability to read - as well as attempts to individual nodes which - failed. 
-`container-replicator.removes.` Count of databases deleted on because the - delete_timestamp was greater than the put_timestamp - and the database had no rows or because it was - successfully sync'ed to other locations and doesn't - belong here anymore. -`container-replicator.successes` Count of replication attempts to an individual node - which were successful. -`container-replicator.timing` Timing data for each database replication attempt - not resulting in a failure. -======================================= ==================================================== - -Metrics for `container-server` ("Not Found" is not considered an error and requests -which increment `errors` are not included in the timing data): - -========================================== ==================================================== -Metric Name Description ------------------------------------------- ---------------------------------------------------- -`container-server.DELETE.errors.timing` Timing data for DELETE request errors: bad request, - not mounted, missing timestamp, conflict. -`container-server.DELETE.timing` Timing data for each DELETE request not resulting in - an error. -`container-server.PUT.errors.timing` Timing data for PUT request errors: bad request, - missing timestamp, not mounted, conflict. -`container-server.PUT.timing` Timing data for each PUT request not resulting in an - error. -`container-server.HEAD.errors.timing` Timing data for HEAD request errors: bad request, - not mounted. -`container-server.HEAD.timing` Timing data for each HEAD request not resulting in - an error. -`container-server.GET.errors.timing` Timing data for GET request errors: bad request, - not mounted, parameters not utf8, bad accept header. -`container-server.GET.timing` Timing data for each GET request not resulting in - an error. -`container-server.REPLICATE.errors.timing` Timing data for REPLICATE request errors: bad - request, not mounted. -`container-server.REPLICATE.timing` Timing data for each REPLICATE request not resulting - in an error. -`container-server.POST.errors.timing` Timing data for POST request errors: bad request, - bad x-container-sync-to, not mounted. -`container-server.POST.timing` Timing data for each POST request not resulting in - an error. -========================================== ==================================================== - -Metrics for `container-sync`: - -=============================== ==================================================== -Metric Name Description -------------------------------- ---------------------------------------------------- -`container-sync.skips` Count of containers skipped because they don't have - sync'ing enabled. -`container-sync.failures` Count of failures sync'ing of individual containers. -`container-sync.syncs` Count of individual containers sync'ed successfully. -`container-sync.deletes` Count of container database rows sync'ed by - deletion. -`container-sync.deletes.timing` Timing data for each container database row - sychronization via deletion. -`container-sync.puts` Count of container database rows sync'ed by PUTing. -`container-sync.puts.timing` Timing data for each container database row - synchronization via PUTing. 
-=============================== ==================================================== - -Metrics for `container-updater`: - -============================== ==================================================== -Metric Name Description ------------------------------- ---------------------------------------------------- -`container-updater.successes` Count of containers which successfully updated their - account. -`container-updater.failures` Count of containers which failed to update their - account. -`container-updater.no_changes` Count of containers which didn't need to update - their account. -`container-updater.timing` Timing data for processing a container; only - includes timing for containers which needed to - update their accounts (i.e. "successes" and - "failures" but not "no_changes"). -============================== ==================================================== - -Metrics for `object-auditor`: - -============================ ==================================================== -Metric Name Description ----------------------------- ---------------------------------------------------- -`object-auditor.quarantines` Count of objects failing audit and quarantined. -`object-auditor.errors` Count of errors encountered while auditing objects. -`object-auditor.timing` Timing data for each object audit (does not include - any rate-limiting sleep time for - max_files_per_second, but does include rate-limiting - sleep time for max_bytes_per_second). -============================ ==================================================== - -Metrics for `object-expirer`: - -======================== ==================================================== -Metric Name Description ------------------------- ---------------------------------------------------- -`object-expirer.objects` Count of objects expired. -`object-expirer.errors` Count of errors encountered while attempting to - expire an object. -`object-expirer.timing` Timing data for each object expiration attempt, - including ones resulting in an error. -======================== ==================================================== - -Metrics for `object-replicator`: - -=================================================== ==================================================== -Metric Name Description ---------------------------------------------------- ---------------------------------------------------- -`object-replicator.partition.delete.count.` A count of partitions on which were - replicated to another node because they didn't - belong on this node. This metric is tracked - per-device to allow for "quiescence detection" for - object replication activity on each device. -`object-replicator.partition.delete.timing` Timing data for partitions replicated to another - node because they didn't belong on this node. This - metric is not tracked per device. -`object-replicator.partition.update.count.` A count of partitions on which were - replicated to another node, but also belong on this - node. As with delete.count, this metric is tracked - per-device. -`object-replicator.partition.update.timing` Timing data for partitions replicated which also - belong on this node. This metric is not tracked - per-device. -`object-replicator.suffix.hashes` Count of suffix directories whose hash (of filenames) - was recalculated. -`object-replicator.suffix.syncs` Count of suffix directories replicated with rsync. 
-=================================================== ==================================================== - -Metrics for `object-server`: - -======================================= ==================================================== -Metric Name Description ---------------------------------------- ---------------------------------------------------- -`object-server.quarantines` Count of objects (files) found bad and moved to - quarantine. -`object-server.async_pendings` Count of container updates saved as async_pendings - (may result from PUT or DELETE requests). -`object-server.POST.errors.timing` Timing data for POST request errors: bad request, - missing timestamp, delete-at in past, not mounted. -`object-server.POST.timing` Timing data for each POST request not resulting in - an error. -`object-server.PUT.errors.timing` Timing data for PUT request errors: bad request, - not mounted, missing timestamp, object creation - constraint violation, delete-at in past. -`object-server.PUT.timeouts` Count of object PUTs which exceeded max_upload_time. -`object-server.PUT.timing` Timing data for each PUT request not resulting in an - error. -`object-server.GET.errors.timing` Timing data for GET request errors: bad request, - not mounted, header timestamps before the epoch, - precondition failed. - File errors resulting in a quarantine are not - counted here. -`object-server.GET.timing` Timing data for each GET request not resulting in an - error. Includes requests which couldn't find the - object (including disk errors resulting in file - quarantine). -`object-server.HEAD.errors.timing` Timing data for HEAD request errors: bad request, - not mounted. -`object-server.HEAD.timing` Timing data for each HEAD request not resulting in - an error. Includes requests which couldn't find the - object (including disk errors resulting in file - quarantine). -`object-server.DELETE.errors.timing` Timing data for DELETE request errors: bad request, - missing timestamp, not mounted, precondition - failed. Includes requests which couldn't find or - match the object. -`object-server.DELETE.timing` Timing data for each DELETE request not resulting - in an error. -`object-server.REPLICATE.errors.timing` Timing data for REPLICATE request errors: bad - request, not mounted. -`object-server.REPLICATE.timing` Timing data for each REPLICATE request not resulting - in an error. -======================================= ==================================================== - -Metrics for `object-updater`: - -============================ ==================================================== -Metric Name Description ----------------------------- ---------------------------------------------------- -`object-updater.errors` Count of drives not mounted or async_pending files - with an unexpected name. -`object-updater.timing` Timing data for object sweeps to flush async_pending - container updates. Does not include object sweeps - which did not find an existing async_pending storage - directory. -`object-updater.quarantines` Count of async_pending container updates which were - corrupted and moved to quarantine. -`object-updater.successes` Count of successful container updates. -`object-updater.failures` Count of failed container updates. -`object-updater.unlinks` Count of async_pending files unlinked. An - async_pending file is unlinked either when it is - successfully processed or when the replicator sees - that there is a newer async_pending file for the - same object. 
-============================ ==================================================== - -Metrics for `proxy-server` (in the table, `` is the proxy-server -controller responsible for the request and will be one of "account", -"container", or "object"): - -======================================== ==================================================== -Metric Name Description ----------------------------------------- ---------------------------------------------------- -`proxy-server.errors` Count of errors encountered while serving requests - before the controller type is determined. Includes - invalid Content-Length, errors finding the internal - controller to handle the request, invalid utf8, and - bad URLs. -`proxy-server..handoff_count` Count of node hand-offs; only tracked if log_handoffs - is set in the proxy-server config. -`proxy-server..handoff_all_count` Count of times *only* hand-off locations were - utilized; only tracked if log_handoffs is set in the - proxy-server config. -`proxy-server..client_timeouts` Count of client timeouts (client did not read within - `client_timeout` seconds during a GET or did not - supply data within `client_timeout` seconds during - a PUT). -`proxy-server..client_disconnects` Count of detected client disconnects during PUT - operations (does NOT include caught Exceptions in - the proxy-server which caused a client disconnect). -======================================== ==================================================== - -Metrics for `proxy-logging` middleware (in the table, `` is either the -proxy-server controller responsible for the request: "account", "container", -"object", or the string "SOS" if the request came from the `Swift Origin Server`_ -middleware. The `` portion will be one of "GET", "HEAD", "POST", "PUT", -"DELETE", "COPY", "OPTIONS", or "BAD_METHOD". The list of valid HTTP methods -is configurable via the `log_statsd_valid_http_methods` config variable and -the default setting yields the above behavior. - -.. _Swift Origin Server: https://github.com/dpgoetz/sos - -============================================ ==================================================== -Metric Name Description --------------------------------------------- ---------------------------------------------------- -`proxy-server....timing` Timing data for requests. The portion is - the numeric HTTP status code for the request (eg. - "200" or "404") -`proxy-server....xfer` The count of the sum of bytes transferred in (from - clients) and out (to clients) for requests. The - , , and portions of the metric - are just like the timing metric. -============================================ ==================================================== - -Metrics for `tempauth` middleware (in the table, `` represents -the actual configured reseller_prefix or "`NONE`" if the reseller_prefix is the -empty string): - -========================================= ==================================================== -Metric Name Description ------------------------------------------ ---------------------------------------------------- -`tempauth..unauthorized` Count of regular requests which were denied with - HTTPUnauthorized. -`tempauth..forbidden` Count of regular requests which were denied with - HTTPForbidden. -`tempauth..token_denied` Count of token requests which were denied. -`tempauth..errors` Count of errors. -========================================= ==================================================== - +are not locked down and will change over time. 
For more details, see the +service-specific tables listed below: + +.. toctree:: + metrics/account_auditor + metrics/account_reaper + metrics/account_server + metrics/account_replicator + metrics/container_auditor + metrics/container_replicator + metrics/container_server + metrics/container_sync + metrics/container_updater + metrics/object_auditor + metrics/object_expirer + metrics/object_reconstructor + metrics/object_replicator + metrics/object_server + metrics/object_updater + metrics/proxy_server + +Or, view :doc:`metrics/all` as one page. ------------------------ Debugging Tips and Tools @@ -858,6 +925,14 @@ If you are looking at an object on the server and need more info, `swift-object-info` will display the account, container, replica locations and metadata of the object. +If you are looking at a container on the server and need more info, +`swift-container-info` will display all the information like the account, +container, replica locations and metadata of the container. + +If you are looking at an account on the server and need more info, +`swift-account-info` will display the account, replica locations +and metadata of the account. + If you want to audit the data for an account, `swift-account-audit` can be used to crawl the account, checking that all containers and objects can be found. @@ -866,23 +941,41 @@ found. Managing Services ----------------- -Swift services are generally managed with `swift-init`. the general usage is -``swift-init ``, where service is the swift service to +Swift services are generally managed with ``swift-init``. the general usage is +``swift-init ``, where service is the Swift service to manage (for example object, container, account, proxy) and command is one of: -========== =============================================== -Command Description ----------- ----------------------------------------------- -start Start the service -stop Stop the service -restart Restart the service -shutdown Attempt to gracefully shutdown the service -reload Attempt to gracefully restart the service -========== =============================================== - -A graceful shutdown or reload will finish any current requests before -completely stopping the old service. There is also a special case of -`swift-init all `, which will run the command for all swift services. +=============== =============================================== +Command Description +--------------- ----------------------------------------------- +start Start the service +stop Stop the service +restart Restart the service +shutdown Attempt to gracefully shutdown the service +reload Attempt to gracefully restart the service +reload-seamless Attempt to seamlessly restart the service +=============== =============================================== + +A graceful shutdown or reload will allow all server workers to finish any +current requests before exiting. The parent server process exits immediately. + +A seamless reload will make new configuration settings active, with no window +where client requests fail due to there being no active listen socket. +The parent server process will re-exec itself, retaining its existing PID. +After the re-exec'ed parent server process binds its listen sockets, the old +listen sockets are closed and old server workers finish any current requests +before exiting. + +There is also a special case of ``swift-init all ``, which will run +the command for all swift services. 
+ +In cases where there are multiple configs for a service, a specific config +can be managed with ``swift-init . ``. +For example, when a separate replication network is used, there might be +``/etc/swift/object-server/public.conf`` for the object server and +``/etc/swift/object-server/replication.conf`` for the replication services. +In this case, the replication services could be restarted with +``swift-init object-server.replication restart``. -------------- Object Auditor @@ -892,20 +985,31 @@ On system failures, the XFS file system can sometimes truncate files it's trying to write and produce zero-byte files. The object-auditor will catch these problems but in the case of a system crash it would be advisable to run an extra, less rate limited sweep to check for these specific files. You can -run this command as follows: -`swift-object-auditor /path/to/object-server/config/file.conf once -z 1000` -"-z" means to only check for zero-byte files at 1000 files per second. +run this command as follows:: + + swift-object-auditor /path/to/object-server/config/file.conf once -z 1000 + +``-z`` means to only check for zero-byte files at 1000 files per second. + +At times it is useful to be able to run the object auditor on a specific +device or set of devices. You can run the object-auditor as follows:: + + swift-object-auditor /path/to/object-server/config/file.conf once --devices=sda,sdb + +This will run the object auditor on only the sda and sdb devices. This param +accepts a comma separated list of values. ----------------- Object Replicator ----------------- At times it is useful to be able to run the object replicator on a specific -device or partition. You can run the object-replicator as follows: -swift-object-replicator /path/to/object-server/config/file.conf once --devices=sda,sdb +device or partition. You can run the object-replicator as follows:: + + swift-object-replicator /path/to/object-server/config/file.conf once --devices=sda,sdb This will run the object replicator on only the sda and sdb devices. You can -likewise run that command with --partitions. Both params accept a comma +likewise run that command with ``--partitions``. Both params accept a comma separated list of values. If both are specified they will be ANDed together. These can only be run in "once" mode. @@ -915,9 +1019,9 @@ Swift Orphans Swift Orphans are processes left over after a reload of a Swift server. -For example, when upgrading a proxy server you would probaby finish -with a `swift-init proxy-server reload` or `/etc/init.d/swift-proxy -reload`. This kills the parent proxy server process and leaves the +For example, when upgrading a proxy server you would probably finish +with a ``swift-init proxy-server reload`` or ``/etc/init.d/swift-proxy +reload``. This kills the parent proxy server process and leaves the child processes running to finish processing whatever requests they might be handling at the time. It then starts up a new parent proxy server process and its children to handle new incoming requests. This @@ -927,16 +1031,16 @@ The orphaned child processes may take a while to exit, depending on the length of the requests they were handling. However, sometimes an old process can be hung up due to some bug or hardware issue. In these cases, these orphaned processes will hang around -forever. `swift-orphans` can be used to find and kill these orphans. +forever. ``swift-orphans`` can be used to find and kill these orphans. 
-`swift-orphans` with no arguments will just list the orphans it finds +``swift-orphans`` with no arguments will just list the orphans it finds that were started more than 24 hours ago. You shouldn't really check for orphans until 24 hours after you perform a reload, as some -requests can take a long time to process. `swift-orphans -k TERM` will -send the SIG_TERM signal to the orphans processes, or you can `kill --TERM` the pids yourself if you prefer. +requests can take a long time to process. ``swift-orphans -k TERM`` will +send the SIG_TERM signal to the orphans processes, or you can ``kill +-TERM`` the pids yourself if you prefer. -You can run `swift-orphans --help` for more options. +You can run ``swift-orphans --help`` for more options. ------------ @@ -947,11 +1051,11 @@ Swift Oldies are processes that have just been around for a long time. There's nothing necessarily wrong with this, but it might indicate a hung process if you regularly upgrade and reload/restart services. You might have so many servers that you don't notice when a -reload/restart fails; `swift-oldies` can help with this. +reload/restart fails; ``swift-oldies`` can help with this. For example, if you upgraded and reloaded/restarted everything 2 days -ago, and you've already cleaned up any orphans with `swift-orphans`, -you can run `swift-oldies -a 48` to find any Swift processes still +ago, and you've already cleaned up any orphans with ``swift-orphans``, +you can run ``swift-oldies -a 48`` to find any Swift processes still around that were started more than 2 days ago and then investigate them accordingly. @@ -963,9 +1067,9 @@ Custom Log Handlers Swift supports setting up custom log handlers for services by specifying a comma-separated list of functions to invoke when logging is setup. It does so -via the `log_custom_handlers` configuration option. Logger hooks invoked are -passed the same arguments as Swift's get_logger function (as well as the -getLogger and LogAdapter object): +via the ``log_custom_handlers`` configuration option. Logger hooks invoked are +passed the same arguments as Swift's ``get_logger`` function, as well as the +``logging.Logger`` and ``SwiftLogAdapter`` objects: ============== =============================================== Name Description @@ -979,6 +1083,14 @@ logger The logging.getLogger object adapted_logger The LogAdapter object ============== =============================================== +.. note:: + The instance of ``SwiftLogAdapter`` that wraps the ``logging.Logger`` + object may be replaced with cloned instances during runtime, for example to + use a different log prefix with the same ``logging.Logger``. Custom log + handlers should therefore not modify any attributes of the + ``SwiftLogAdapter`` instance other than those that will be copied if it is + cloned. + A basic example that sets up a custom logger might look like the following: @@ -993,3 +1105,10 @@ following: See :ref:`custom-logger-hooks-label` for sample use cases. +------------------------ +Securing OpenStack Swift +------------------------ + +Please refer to the security guide at https://docs.openstack.org/security-guide +and in particular the `Object Storage +`__ section. 
diff --git a/doc/source/apache_deployment_guide.rst b/doc/source/apache_deployment_guide.rst new file mode 100644 index 0000000000..e0b3ed4924 --- /dev/null +++ b/doc/source/apache_deployment_guide.rst @@ -0,0 +1,193 @@ +======================= +Apache Deployment Guide +======================= + +---------------------------- +Web Front End Considerations +---------------------------- + +Swift can be configured to work both using an integral web front-end and using a +full-fledged Web Server such as the Apache2 (HTTPD) web server. The integral +web front-end is a wsgi mini "Web Server" which opens up its own socket and +serves http requests directly. The incoming requests accepted by the integral +web front-end are then forwarded to a wsgi application (the core swift) for +further handling, possibly via wsgi middleware sub-components. + +client<---->'integral web front-end'<---->middleware<---->'core swift' + +To gain full advantage of Apache2, Swift can alternatively be configured to work +as a request processor of the Apache2 server. This alternative deployment +scenario uses mod_wsgi of Apache2 to forward requests to the swift wsgi +application and middleware. + +client<---->'Apache2 with mod_wsgi'<----->middleware<---->'core swift' + +The integral web front-end offers simplicity and requires minimal configuration. +It is also the web front-end most commonly used with Swift. Additionally, the +integral web front-end includes support for receiving chunked transfer encoding +from a client, presently not supported by Apache2 in the operation mode +described here. + +The use of Apache2 offers new ways to extend Swift and integrate it with +existing authentication, administration and control systems. A single Apache2 +server can serve as the web front end of any number of swift servers residing on +a swift node. For example when a storage node offers account, container and +object services, a single Apache2 server can serve as the web front end of all +three services. + +The apache variant described here was tested as part of an IBM research work. +It was found that following tuning, the Apache2 offer generally equivalent +performance to that offered by the integral web front-end. Alternative to +Apache2, other web servers may be used, but were never tested. + +------------- +Apache2 Setup +------------- +Both Apache2 and mod-wsgi needs to be installed on the system. Ubuntu comes +with Apache2 installed. Install mod-wsgi using:: + + sudo apt-get install libapache2-mod-wsgi + +Create a directory for the Apache2 wsgi files:: + + sudo mkdir /srv/www/swift + +Create a working directory for the wsgi processes:: + + sudo mkdir -m 2770 /var/lib/swift + sudo chown swift:swift /var/lib/swift + +Create a file for each service under ``/srv/www/swift``. 
+ +For a proxy service create ``/srv/www/swift/proxy-server.wsgi``:: + + from swift.common.wsgi import init_request_processor + application, conf, logger, log_name = \ + init_request_processor('/etc/swift/proxy-server.conf','proxy-server') + +For an account service create ``/srv/www/swift/account-server.wsgi``:: + + from swift.common.wsgi import init_request_processor + application, conf, logger, log_name = \ + init_request_processor('/etc/swift/account-server.conf', + 'account-server') + +For an container service create ``/srv/www/swift/container-server.wsgi``:: + + from swift.common.wsgi import init_request_processor + application, conf, logger, log_name = \ + init_request_processor('/etc/swift/container-server.conf', + 'container-server') + +For an object service create ``/srv/www/swift/object-server.wsgi``:: + + from swift.common.wsgi import init_request_processor + application, conf, logger, log_name = \ + init_request_processor('/etc/swift/object-server.conf', + 'object-server') + +Create a ``/etc/apache2/conf.d/swift_wsgi.conf`` configuration file that will +define a port and Virtual Host per each local service. For example an Apache2 +serving as a web front end of a proxy service:: + + # Proxy + Listen 8080 + + + ServerName proxy-server + + LimitRequestBody 5368709122 + LimitRequestFields 200 + + WSGIDaemonProcess proxy-server processes=5 threads=1 user=swift group=swift display-name=%{GROUP} + WSGIProcessGroup proxy-server + WSGIScriptAlias / /srv/www/swift/proxy-server.wsgi + LogLevel debug + CustomLog /var/log/apache2/proxy.log combined + ErrorLog /var/log/apache2/proxy-server + + +Notice that when using Apache the limit on the maximal object size should be +imposed by Apache using the `LimitRequestBody` rather by the swift proxy. Note +also that the `LimitRequestBody` should indicate the same value as indicated by +`max_file_size` located in both ``/etc/swift/swift.conf`` and in +``/etc/swift/test.conf``. The Swift default value for `max_file_size` (when not +present) is `5368709122`. 
For example an Apache2 serving as a web front end of a +storage node:: + + # Object Service + Listen 6200 + + + ServerName object-server + + LimitRequestFields 200 + + WSGIDaemonProcess object-server processes=5 threads=1 user=swift group=swift display-name=%{GROUP} + WSGIProcessGroup object-server + WSGIScriptAlias / /srv/www/swift/object-server.wsgi + LogLevel debug + CustomLog /var/log/apache2/access.log combined + ErrorLog /var/log/apache2/object-server + + + # Container Service + Listen 6201 + + + ServerName container-server + + LimitRequestFields 200 + + WSGIDaemonProcess container-server processes=5 threads=1 user=swift group=swift display-name=%{GROUP} + WSGIProcessGroup container-server + WSGIScriptAlias / /srv/www/swift/container-server.wsgi + LogLevel debug + CustomLog /var/log/apache2/access.log combined + ErrorLog /var/log/apache2/container-server + + + # Account Service + Listen 6202 + + + ServerName account-server + + LimitRequestFields 200 + + WSGIDaemonProcess account-server processes=5 threads=1 user=swift group=swift display-name=%{GROUP} + WSGIProcessGroup account-server + WSGIScriptAlias / /srv/www/swift/account-server.wsgi + LogLevel debug + CustomLog /var/log/apache2/access.log combined + ErrorLog /var/log/apache2/account-server + + +Enable the newly configured Virtual Hosts:: + + a2ensite swift_wsgi.conf + +Next, stop, test and start Apache2 again:: + + # stop it + systemctl stop apache2.service + + # test the configuration + apache2ctl -t + + # start it if the test succeeds + systemctl start apache2.service + + +Edit the tests config file and add:: + + web_front_end = apache2 + normalized_urls = True + +Also check to see that the file includes `max_file_size` of the same value as +used for the `LimitRequestBody` in the apache config file above. + +We are done. You may run functional tests to test - e.g.:: + + cd ~swift/swift + ./.functests diff --git a/doc/source/api/authentication.rst b/doc/source/api/authentication.rst new file mode 100644 index 0000000000..3d1044e7c3 --- /dev/null +++ b/doc/source/api/authentication.rst @@ -0,0 +1,58 @@ +============== +Authentication +============== + +The owner of an Object Storage account controls access to that account +and its containers and objects. An owner is the user who has the +''admin'' role for that tenant. The tenant is also known as the project +or account. As the account owner, you can modify account metadata and +create, modify, and delete containers and objects. + +To identify yourself as the account owner, include an authentication +token in the ''X-Auth-Token'' header in the API request. + +Depending on the token value in the ''X-Auth-Token'' header, one of the +following actions occur: + +- ''X-Auth-Token'' contains the token for the account owner. + + The request is permitted and has full access to make changes to the + account. + +- The ''X-Auth-Token'' header is omitted or it contains a token for a + non-owner or a token that is not valid. + + The request fails with a 401 Unauthorized or 403 Forbidden response. + + You have no access to accounts or containers, unless an access + control list (ACL) explicitly grants access. + + The account owner can grant account and container access to users + through access control lists (ACLs). + +In addition, it is possible to provide an additional token in the +''X-Service-Token'' header. More information about how this is used is in +:doc:`../overview_backing_store`. 
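+
+As an illustration, the following minimal sketch sends an authenticated
+request using Python's standard library. The storage URL and token value
+are placeholders; in practice you obtain both from your authentication
+service:
+
+.. code-block:: python
+
+    import urllib.request
+
+    # Placeholder values returned by your authentication service.
+    storage_url = 'https://storage.example.com/v1/AUTH_12345678912345'
+    token = 'AUTH_tk1234567890abcdef'
+
+    # List the containers in the account; the token identifies the caller.
+    req = urllib.request.Request(
+        storage_url + '?format=json',
+        headers={'X-Auth-Token': token})
+    with urllib.request.urlopen(req) as resp:
+        print(resp.read().decode('utf-8'))
+
+If the token is omitted, expired, or belongs to a non-owner who has not been
+granted access through an ACL, the same request fails with a 401
+Unauthorized or 403 Forbidden response, as described above.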
+ +The following list describes the authentication services that you can +use with Object Storage: + +- OpenStack Identity (keystone): For Object Storage, account is synonymous with + project or tenant ID. + +- Tempauth middleware: Object Storage includes this middleware. User and account + management is performed in Object Storage itself. + +- Swauth middleware: Stored in github, this custom middleware is modeled on + Tempauth. Usage is similar to Tempauth. + +- Other custom middleware: Write it yourself to fit your environment. + +Specifically, you use the ''X-Auth-Token'' header to pass an +authentication token to an API request. + +Authentication tokens expire after a time period that the authentication +service defines. When a token expires, use of the token causes requests +to fail with a 401 Unauthorized response. To continue, you must obtain a +new token. + diff --git a/doc/source/api/bulk-delete.rst b/doc/source/api/bulk-delete.rst new file mode 100644 index 0000000000..367eed3aa8 --- /dev/null +++ b/doc/source/api/bulk-delete.rst @@ -0,0 +1,93 @@ +.. _bulk-delete: + +=========== +Bulk delete +=========== + +To discover whether your Object Storage system supports this feature, +see :ref:`discoverability`. Alternatively, check with your service provider. + +With bulk delete, you can delete up to 10,000 objects or containers +(configurable) in one request. + +Bulk delete request +~~~~~~~~~~~~~~~~~~~ + +To perform a bulk delete operation, add the ``bulk-delete`` query +parameter to the path of a ``POST`` or ``DELETE`` operation. + +.. note:: + + The ``DELETE`` operation is supported for backwards compatibility. + +The path is the account, such as ``/v1/12345678912345``, that contains +the objects and containers. + +In the request body of the ``POST`` or ``DELETE`` operation, list the +objects or containers to be deleted. Separate each name with a newline +character. You can include a maximum of 10,000 items (configurable) in +the list. + +In addition, you must: + +- UTF-8-encode and then URL-encode the names. + +- To indicate an object, specify the container and object name as: + ``CONTAINER_NAME``/``OBJECT_NAME``. + +- To indicate a container, specify the container name as: + ``CONTAINER_NAME``. Make sure that the container is empty. If it + contains objects, Object Storage cannot delete the container. + +- Set the ``Content-Type`` request header to ``text/plain``. + +Bulk delete response +~~~~~~~~~~~~~~~~~~~~ + +When Object Storage processes the request, it performs multiple +sub-operations. Even if all sub-operations fail, the operation returns a +200 status. The bulk operation returns a response body that contains +details that indicate which sub-operations have succeeded and failed. +Some sub-operations might succeed while others fail. Examine the +response body to determine the results of each delete sub-operation. + +You can set the ``Accept`` request header to one of the following values +to define the response format: + +``text/plain`` + Formats response as plain text. If you omit the + ``Accept`` header, ``text/plain`` is the default. + +``application/json`` + Formats response as JSON. + +``application/xml`` or ``text/xml`` + Formats response as XML. + +The response body contains the following information: + +- The number of files actually deleted. + +- The number of not found objects. + +- Errors. A list of object names and associated error statuses for the + objects that failed to delete. The format depends on the value that + you set in the ``Accept`` header. 
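+
+Before looking at an example response, the following minimal sketch shows
+how such a request could be issued with Python's standard library. The
+storage URL, token, and the container and object names are placeholders:
+
+.. code-block:: python
+
+    import urllib.parse
+    import urllib.request
+
+    # Placeholder account URL and token.
+    storage_url = 'https://storage.example.com/v1/12345678912345'
+    token = 'AUTH_tk1234567890abcdef'
+
+    # Objects are given as CONTAINER_NAME/OBJECT_NAME, empty containers
+    # as CONTAINER_NAME. Each name is UTF-8 encoded, URL-encoded, and
+    # separated by a newline character.
+    items = ['photos/cat.jpg', 'photos/dog.jpg', 'mycontainer']
+    body = '\n'.join(
+        urllib.parse.quote(name.encode('utf-8')) for name in items)
+
+    req = urllib.request.Request(
+        storage_url + '?bulk-delete',
+        data=body.encode('ascii'),
+        method='POST',
+        headers={'X-Auth-Token': token,
+                 'Content-Type': 'text/plain',
+                 'Accept': 'application/json'})
+    with urllib.request.urlopen(req) as resp:
+        # Even if sub-operations fail, the operation returns 200; inspect
+        # the body for per-item results.
+        print(resp.read().decode('utf-8'))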
+ +The following bulk delete response is in ``application/xml`` format. In +this example, the ``mycontainer`` container is not empty, so it cannot +be deleted. + +.. code-block:: xml + + + 2 + 4 + + + /v1/12345678912345/mycontainer + 409 Conflict + + + + diff --git a/doc/source/api/container_quotas.rst b/doc/source/api/container_quotas.rst new file mode 100644 index 0000000000..9c58eef274 --- /dev/null +++ b/doc/source/api/container_quotas.rst @@ -0,0 +1,32 @@ +.. _container_quotas: + +================ +Container quotas +================ + +You can set quotas on the size and number of objects stored in a +container by setting the following metadata: + +- ``X-Container-Meta-Quota-Bytes``. The size, in bytes, of objects that + can be stored in a container. + +- ``X-Container-Meta-Quota-Count``. The number of objects that can be + stored in a container. + +When you exceed a container quota, subsequent requests to create objects +fail with a 413 Request Entity Too Large error. + +The Object Storage system uses an eventual consistency model. When you +create a new object, the container size and object count might not be +immediately updated. Consequently, you might be allowed to create +objects even though you have actually exceeded the quota. + +At some later time, the system updates the container size and object +count to the actual values. At this time, subsequent requests fails. In +addition, if you are currently under the +``X-Container-Meta-Quota-Bytes`` limit and a request uses chunked +transfer encoding, the system cannot know if the request will exceed the +quota so the system allows the request. However, once the quota is +exceeded, any subsequent uploads that use chunked transfer encoding +fail. + diff --git a/doc/source/api/discoverability.rst b/doc/source/api/discoverability.rst new file mode 100644 index 0000000000..c086d16f7f --- /dev/null +++ b/doc/source/api/discoverability.rst @@ -0,0 +1,37 @@ +=============== +Discoverability +=============== + +Your Object Storage system might not enable all features that you read about because your service provider chooses which features to enable. + +To discover which features are enabled in your Object Storage system, +use the ``/info`` request. However, your service provider might have +disabled the ``/info`` request, or you might be using an older version +that does not support the ``/info`` request. + +To use the ``/info`` request, send a **GET** request using the ``/info`` +path to the Object Store endpoint as shown in this example: + +.. code:: console + + # curl https://storage.clouddrive.com/info + +This example shows a truncated response body: + +.. code:: console + + { + "swift":{ + "version":"1.11.0" + }, + "staticweb":{ + + }, + "tempurl":{ + + } + } + +This output shows that the Object Storage system has enabled the static +website and temporary URL features. + diff --git a/doc/source/api/form_post_middleware.rst b/doc/source/api/form_post_middleware.rst new file mode 100644 index 0000000000..97921d41e8 --- /dev/null +++ b/doc/source/api/form_post_middleware.rst @@ -0,0 +1,211 @@ +==================== +Form POST middleware +==================== + +To discover whether your Object Storage system supports this feature, +check with your service provider or send a **GET** request using the :file:`/info` +path. + +You can upload objects directly to the Object Storage system from a +browser by using the form **POST** middleware. 
This middleware uses +account or container secret keys to generate a cryptographic signature for the +request. This means that you do not need to send an authentication token +in the ``X-Auth-Token`` header to perform the request. + +The form **POST** middleware uses the same secret keys as the temporary +URL middleware uses. For information about how to set these keys, see +:ref:`secret_keys`. + +For information about the form **POST** middleware configuration +options, see :ref:`formpost` in the *Source Documentation*. + +Form POST format +~~~~~~~~~~~~~~~~ + +To upload objects to a cluster, you can use an HTML form **POST** +request. + +The format of the form **POST** request is: + +**Example 1.14. Form POST format** + +.. code:: xml + +
    <form action="SWIFT_URL"
          method="POST"
          enctype="multipart/form-data">
      <input type="hidden" name="redirect" value="REDIRECT_URL"/>
      <input type="hidden" name="max_file_size" value="BYTES"/>
      <input type="hidden" name="max_file_count" value="COUNT"/>
      <input type="hidden" name="expires" value="UNIX_TIMESTAMP"/>
      <input type="hidden" name="signature" value="HMAC"/>
      <input type="file" name="FILE_NAME"/>
      <br/>
      <input type="submit"/>
    </form>
+ + +**action="SWIFT_URL"** + +Set to full URL where the objects are to be uploaded. The names of +uploaded files are appended to the specified *SWIFT_URL*. So, you +can upload directly to the root of a container with a URL like: + +.. code:: none + + https://swift-cluster.example.com/v1/my_account/container/ + +Optionally, you can include an object prefix to separate uploads, such +as: + +.. code:: none + + https://swift-cluster.example.com/v1/my_account/container/OBJECT_PREFIX + + +**method="POST"** + +Must be ``POST``. + + +**enctype="multipart/form-data"** + +Must be ``multipart/form-data``. + + +**name="redirect" value="REDIRECT_URL"** + +Redirects the browser to the *REDIRECT_URL* after the upload +completes. The URL has status and message query parameters added to it, +which specify the HTTP status code for the upload and an optional error +message. The 2\ *nn* status code indicates success. + +The *REDIRECT_URL* can be an empty string. If so, the ``Location`` +response header is not set. + +**name="max\_file\_size" value="BYTES"** + +Required. Indicates the size, in bytes, of the maximum single file +upload. + +**name="max\_file\_count" value= "COUNT"** + +Required. Indicates the maximum number of files that can be uploaded +with the form. + + +**name="expires" value="UNIX_TIMESTAMP"** + +The UNIX timestamp that specifies the time before which the form must be +submitted before it becomes no longer valid. + + +**name="signature" value="HMAC"** + +The HMAC-SHA1 signature of the form. + + +**type="file" name="FILE_NAME"** + +File name of the file to be uploaded. You can include from one to the +``max_file_count`` value of files. + +The file attributes must appear after the other attributes to be +processed correctly. + +If attributes appear after the file attributes, they are not sent with +the sub-request because all attributes in the file cannot be parsed on +the server side unless the whole file is read into memory; the server +does not have enough memory to service these requests. Attributes that +follow the file attributes are ignored. + +Optionally, if you want the uploaded files to be temporary you can set x-delete-at or x-delete-after attributes by adding one of these as a form input: + +.. code:: xml + + + + + +**type= "submit"** + +Must be ``submit``. + +HMAC-SHA1 signature for form POST +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Form **POST** middleware uses an HMAC-SHA1 cryptographic signature. This +signature includes these elements from the form: + +- The path. Starting with ``/v1/`` onwards and including a container + name and, optionally, an object prefix. In `Example 1.15`, "HMAC-SHA1 + signature for form + POST" the path is + ``/v1/my_account/container/object_prefix``. Do not URL-encode the + path at this stage. + +- A redirect URL. If there is no redirect URL, use the empty string. + +- Maximum file size. In `Example 1.15`, "HMAC-SHA1 signature for form + POST" the + ``max_file_size`` is ``104857600`` bytes. + +- The maximum number of objects to upload. In `Example 1.15`, "HMAC-SHA1 + signature for form + POST" ``max_file_count`` is ``10``. + +- Expiry time. In `Example 1.15, "HMAC-SHA1 signature for form + POST" the expiry time + is set to ``600`` seconds into the future. + +- The secret key. Set as the ``X-Account-Meta-Temp-URL-Key`` header + value for accounts or ``X-Container-Meta-Temp-URL-Key`` header + value for containers. See :ref:`secret_keys` for more information. 
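Note that the signature calculation in Example 1.15 below is written in Python 2 style;
under Python 3, ``hmac.new()`` requires the key and message to be ``bytes``. A hedged
Python 3 variant of the same calculation (placeholder path, redirect URL, and key) is:

.. code:: python

    import hmac
    from hashlib import sha1
    from time import time

    path = '/v1/my_account/container/object_prefix'
    redirect = 'https://myserver.com/some-page'
    max_file_size = 104857600
    max_file_count = 10
    expires = int(time() + 600)
    key = b'MYKEY'

    hmac_body = '%s\n%s\n%s\n%s\n%s' % (
        path, redirect, max_file_size, max_file_count, expires)
    signature = hmac.new(key, hmac_body.encode('utf-8'), sha1).hexdigest()
    print(expires, signature)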
+ +The following example code generates a signature for use with form +**POST**: + +**Example 1.15. HMAC-SHA1 signature for form POST** + +.. code:: python + + import hmac + from hashlib import sha1 + from time import time + path = '/v1/my_account/container/object_prefix' + redirect = 'https://myserver.com/some-page' + max_file_size = 104857600 + max_file_count = 10 + expires = int(time() + 600) + key = 'MYKEY' + hmac_body = '%s\n%s\n%s\n%s\n%s' % (path, redirect, + max_file_size, max_file_count, expires) + signature = hmac.new(key, hmac_body, sha1).hexdigest() + + +For more information, see `RFC 2104: HMAC: Keyed-Hashing for Message +Authentication `__. + +Form POST example +~~~~~~~~~~~~~~~~~ + +The following example shows how to submit a form by using a cURL +command. In this example, the object prefix is ``photos/`` and the file +being uploaded is called ``flower.jpg``. + +This example uses the **swift-form-signature** script to compute the +``expires`` and ``signature`` values. + +.. code:: console + + $ bin/swift-form-signature /v1/my_account/container/photos/ https://example.com/done.html 5373952000 1 200 MYKEY + Expires: 1390825338 + Signature: 35129416ebda2f1a21b3c2b8939850dfc63d8f43 + +.. code:: console + + $ curl -i https://swift-cluster.example.com/v1/my_account/container/photos/ -X POST \ + -F max_file_size=5373952000 -F max_file_count=1 -F expires=1390825338 \ + -F signature=35129416ebda2f1a21b3c2b8939850dfc63d8f43 \ + -F redirect=https://example.com/done.html \ + -F file=@flower.jpg diff --git a/doc/source/api/large_objects.rst b/doc/source/api/large_objects.rst new file mode 100644 index 0000000000..f05f72ebd9 --- /dev/null +++ b/doc/source/api/large_objects.rst @@ -0,0 +1,347 @@ +============= +Large objects +============= + +By default, the content of an object cannot be greater than 5 GB. +However, you can use a number of smaller objects to construct a large +object. The large object is comprised of two types of objects: + +- **Segment objects** store the object content. You can divide your + content into segments, and upload each segment into its own segment + object. Segment objects do not have any special features. You create, + update, download, and delete segment objects just as you would normal + objects. + +- A **manifest object** links the segment objects into one logical + large object. When you download a manifest object, Object Storage + concatenates and returns the contents of the segment objects in the + response body of the request. This behavior extends to the response + headers returned by **GET** and **HEAD** requests. The + ``Content-Length`` response header value is the total size of all + segment objects. Object Storage calculates the ``ETag`` response + header value by taking the ``ETag`` value of each segment, + concatenating them together, and returning the MD5 checksum of the + result. The manifest object types are: + + **Static large objects** + The manifest object content is an ordered list of the names of + the segment objects in JSON format. + + **Dynamic large objects** + The manifest object has a ``X-Object-Manifest`` metadata header. + The value of this header is ``{container}/{prefix}``, + where ``{container}`` is the name of the container where the + segment objects are stored, and ``{prefix}`` is a string that all + segment objects have in common. The manifest object should have + no content. However, this is not enforced. 
+ +Note +~~~~ + +If you make a **COPY** request by using a manifest object as the source, +the new object is a normal, and not a segment, object. If the total size +of the source segment objects exceeds 5 GB, the **COPY** request fails. +However, you can make a duplicate of the manifest object and this new +object can be larger than 5 GB. + +Static large objects +~~~~~~~~~~~~~~~~~~~~ + +To create a static large object, divide your content into pieces and +create (upload) a segment object to contain each piece. + +Create a manifest object. Include the ``multipart-manifest=put`` +query parameter at the end of the manifest object name to indicate that +this is a manifest object. + +The body of the **PUT** request on the manifest object comprises a json +list, where each element is an object representing a segment. These objects +may contain the following attributes: + +- ``path`` (required). The container and object name in the format: + ``{container-name}/{object-name}`` + +- ``etag`` (optional). If provided, this value must match the ``ETag`` + of the segment object. This was included in the response headers when + the segment was created. Generally, this will be the MD5 sum of the + segment. + +- ``size_bytes`` (optional). The size of the segment object. If provided, + this value must match the ``Content-Length`` of that object. + +- ``range`` (optional). The subset of the referenced object that should + be used for segment data. This behaves similar to the ``Range`` header. + If omitted, the entire object will be used. + +Providing the optional ``etag`` and ``size_bytes`` attributes for each +segment ensures that the upload cannot corrupt your data. + +**Example Static large object manifest list** + +This example shows three segment objects. You can use several containers +and the object names do not have to conform to a specific pattern, in +contrast to dynamic large objects. + +.. code:: json + + [ + { + "path": "mycontainer/objseg1", + "etag": "0228c7926b8b642dfb29554cd1f00963", + "size_bytes": 1468006 + }, + { + "path": "mycontainer/pseudodir/seg-obj2", + "etag": "5bfc9ea51a00b790717eeb934fb77b9b", + "size_bytes": 1572864 + }, + { + "path": "other-container/seg-final", + "etag": "b9c3da507d2557c1ddc51f27c54bae51", + "size_bytes": 256 + } + ] + +| + +The ``Content-Length`` request header must contain the length of the +json content—not the length of the segment objects. However, after the +**PUT** operation completes, the ``Content-Length`` metadata is set to +the total length of all the object segments. When using the ``ETag`` +request header in a **PUT** operation, it must contain the MD5 checksum +of the concatenated ``ETag`` values of the object segments. You can also +set the ``Content-Type`` request header and custom object metadata. + +When the **PUT** operation sees the ``multipart-manifest=put`` query +parameter, it reads the request body and verifies that each segment +object exists and that the sizes and ETags match. If there is a +mismatch, the **PUT** operation fails. + +This verification process can take a long time to complete, particularly +as the number of segments increases. You may include a ``heartbeat=on`` +query parameter to have the server: + +1. send a ``202 Accepted`` response before it begins validating segments, +2. periodically send whitespace characters to keep the connection alive, and +3. send a final response code in the body. + +.. 
note:: + The server may still immediately respond with ``400 Bad Request`` + if it can determine that the request is invalid before making + backend requests. + +If everything matches, the manifest object is created. The +``X-Static-Large-Object`` metadata is set to ``true`` indicating that +this is a static object manifest. + +Normally when you perform a **GET** operation on the manifest object, +the response body contains the concatenated content of the segment +objects. To download the manifest list, use the +``multipart-manifest=get`` query parameter. The resulting list is not +formatted the same as the manifest you originally used in the **PUT** +operation. + +If you use the **DELETE** operation on a manifest object, the manifest +object is deleted. The segment objects are not affected. However, if you +add the ``multipart-manifest=delete`` query parameter, the segment +objects are deleted and if all are successfully deleted, the manifest +object is also deleted. + +To change the manifest, use a **PUT** operation with the +``multipart-manifest=put`` query parameter. This request creates a +manifest object. You can also update the object metadata in the usual +way. + +Dynamic large objects +~~~~~~~~~~~~~~~~~~~~~ + +You must segment objects that are larger than 5 GB before you can upload +them. You then upload the segment objects like you would any other +object and create a dynamic large manifest object. The manifest object +tells Object Storage how to find the segment objects that comprise the +large object. The segments remain individually addressable, but +retrieving the manifest object streams all the segments concatenated. +There is no limit to the number of segments that can be a part of a +single large object, but ``Content-Length`` is included in **GET** or **HEAD** +response only if the number of segments is smaller than container listing +limit. In other words, the number of segments that fit within a single +container listing page. + +To ensure the download works correctly, you must upload all the object +segments to the same container and ensure that each object name is +prefixed in such a way that it sorts in the order in which it should be +concatenated. You also create and upload a manifest file. The manifest +file is a zero-byte file with the extra ``X-Object-Manifest`` +``{container}/{prefix}`` header, where ``{container}`` is the container +the object segments are in and ``{prefix}`` is the common prefix for all +the segments. You must UTF-8-encode and then URL-encode the container +and common prefix in the ``X-Object-Manifest`` header. + +It is best to upload all the segments first and then create or update +the manifest. With this method, the full object is not available for +downloading until the upload is complete. Also, you can upload a new set +of segments to a second location and update the manifest to point to +this new location. During the upload of the new segments, the original +manifest is still available to download the first set of segments. + +.. note:: + + When updating a manifest object using a POST request, a + ``X-Object-Manifest`` header must be included for the + object to continue to behave as a manifest object. + +**Example Upload segment of large object request: HTTP** + +.. 
code:: none + + PUT /{api_version}/{account}/{container}/{object} HTTP/1.1 + Host: storage.clouddrive.com + X-Auth-Token: eaaafd18-0fed-4b3a-81b4-663c99ec1cbb + ETag: 8a964ee2a5e88be344f36c22562a6486 + Content-Length: 1 + X-Object-Meta-PIN: 1234 + + +No response body is returned. A status code of 2\ *``nn``* (between 200 +and 299, inclusive) indicates a successful write; status 411 Length +Required denotes a missing ``Content-Length`` or ``Content-Type`` header +in the request. If the MD5 checksum of the data written to the storage +system does NOT match the (optionally) supplied ETag value, a 422 +Unprocessable Entity response is returned. + +You can continue uploading segments like this example shows, prior to +uploading the manifest. + +**Example Upload next segment of large object request: HTTP** + +.. code:: none + + PUT /{api_version}/{account}/{container}/{object} HTTP/1.1 + Host: storage.clouddrive.com + X-Auth-Token: eaaafd18-0fed-4b3a-81b4-663c99ec1cbb + ETag: 8a964ee2a5e88be344f36c22562a6486 + Content-Length: 1 + X-Object-Meta-PIN: 1234 + + +Next, upload the manifest you created that indicates the container the +object segments reside within. Note that uploading additional segments +after the manifest is created causes the concatenated object to be that +much larger but you do not need to recreate the manifest file for +subsequent additional segments. + +**Example Upload manifest request: HTTP** + +.. code:: none + + PUT /{api_version}/{account}/{container}/{object} HTTP/1.1 + Host: storage.clouddrive.com + X-Auth-Token: eaaafd18-0fed-4b3a-81b4-663c99ec1cbb + Content-Length: 0 + X-Object-Meta-PIN: 1234 + X-Object-Manifest: {container}/{prefix} + + +**Example Upload manifest response: HTTP** + +.. code:: none + + [...] + + +The ``Content-Type`` in the response for a **GET** or **HEAD** on the +manifest is the same as the ``Content-Type`` set during the **PUT** +request that created the manifest. You can easily change the +``Content-Type`` by reissuing the **PUT** request. + +Comparison of static and dynamic large objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While static and dynamic objects have similar behavior, here are +their differences: + +End-to-end integrity +-------------------- + +With static large objects, integrity can be assured. +The list of segments may include the MD5 checksum (``ETag``) of each segment. +You cannot upload the manifest object if the ``ETag`` in the list differs +from the uploaded segment object. If a segment is somehow lost, an attempt +to download the manifest object results in an error. + +With dynamic large objects, integrity is not guaranteed. The eventual +consistency model means that although you have uploaded a segment object, it +might not appear in the container listing until later. If you download the +manifest before it appears in the container, it does not form part of the +content returned in response to a **GET** request. + +Upload Order +------------ + +With static large objects, you must upload the +segment objects before you upload the manifest object. + +With dynamic large objects, you can upload manifest and segment objects +in any order. In case a premature download of the manifest occurs, we +recommend users upload the manifest object after the segments. However, +the system does not enforce the order. + +Removal or addition of segment objects +-------------------------------------- + +With static large objects, you cannot add or +remove segment objects from the manifest. 
However, you can create a +completely new manifest object of the same name with a different manifest +list. + +With dynamic large objects, you can upload new segment objects or remove +existing segments. The names must simply match the ``{prefix}`` supplied +in ``X-Object-Manifest``. + +Segment object size and number +------------------------------ + +With static large objects, the segment objects must be at least 1 byte in size. +However, if the segment objects are less than 1MB (by default), +the SLO download is (by default) rate limited. At most, +1000 segments are supported (by default) and the manifest has a limit +(by default) of 2MB in size. + +With dynamic large objects, segment objects can be any size. + +Segment object container name +----------------------------- + +With static large objects, the manifest list includes the container name of each object. +Segment objects can be in different containers. + +With dynamic large objects, all segment objects must be in the same container. + +Manifest object metadata +------------------------ + +With static large objects, the manifest object has ``X-Static-Large-Object`` +set to ``true``. You do not set this +metadata directly. Instead the system sets it when you **PUT** a static +manifest object. + +With dynamic large objects, the ``X-Object-Manifest`` value is the +``{container}/{prefix}``, which indicates +where the segment objects are located. You supply this request header in the +**PUT** operation. + +Copying the manifest object +--------------------------- + +The semantics are the same for both static and dynamic large objects. +When copying large objects, the **COPY** operation does not create +a manifest object but a normal object with content same as what you would +get on a **GET** request to the original manifest object. + +To copy the manifest object, you include the ``multipart-manifest=get`` +query parameter in the **COPY** request. The new object contains the same +manifest as the original. The segment objects are not copied. Instead, +both the original and new manifest objects share the same set of segment +objects. + + diff --git a/doc/source/api/object-expiration.rst b/doc/source/api/object-expiration.rst new file mode 100644 index 0000000000..e101b4de5e --- /dev/null +++ b/doc/source/api/object-expiration.rst @@ -0,0 +1,48 @@ +================= +Object expiration +================= + +You can schedule Object Storage (swift) objects to expire by setting the +``X-Delete-At`` or ``X-Delete-After`` header. Once the object is deleted, +swift will no longer serve the object and it will be deleted from the cluster +shortly thereafter. + +* Set an object to expire at an absolute time (in Unix time). You + can get the current Unix time by running ``date +'%s'``. + + .. code-block:: console + + $ swift post CONTAINER OBJECT_FILENAME -H "X-Delete-At:UNIX_TIME" + + Verify the ``X-Delete-At`` header has posted to the object: + + .. code-block:: console + + $ swift stat CONTAINER OBJECT_FILENAME + +* Set an object to expire after a relative amount of time (in seconds): + + .. code-block:: console + + $ swift post CONTAINER OBJECT_FILENAME -H "X-Delete-After:SECONDS" + + The ``X-Delete-After`` header will be converted to ``X-Delete-At``. + Verify the ``X-Delete-At`` header has posted to the object: + + .. code-block:: console + + $ swift stat CONTAINER OBJECT_FILENAME + + If you no longer want to expire the object, you can remove the + ``X-Delete-At`` header: + + .. 
code-block:: console + + $ swift post CONTAINER OBJECT_FILENAME -H "X-Remove-Delete-At:" + +.. note:: + + In order for object expiration to work properly, the + ``swift-object-expirer`` daemon will need access to all backend + servers in the cluster. The daemon does not need access to the + proxy-server or public network. diff --git a/doc/source/api/object_api_v1_overview.rst b/doc/source/api/object_api_v1_overview.rst new file mode 100644 index 0000000000..37fa28e40f --- /dev/null +++ b/doc/source/api/object_api_v1_overview.rst @@ -0,0 +1,194 @@ +Object Storage API overview +--------------------------- + +OpenStack Object Storage is a highly available, distributed, eventually +consistent object/blob store. You create, modify, and get objects and +metadata by using the Object Storage API, which is implemented as a set +of Representational State Transfer (REST) web services. + +For an introduction to OpenStack Object Storage, see the :doc:`/admin/index`. + +You use the HTTPS (SSL) protocol to interact with Object Storage, and +you use standard HTTP calls to perform API operations. You can also use +language-specific APIs, which use the RESTful API, that make it easier +for you to integrate into your applications. + +To assert your right to access and change data in an account, you +identify yourself to Object Storage by using an authentication token. To +get a token, you present your credentials to an authentication service. +The authentication service returns a token and the URL for the account. +Depending on which authentication service that you use, the URL for the +account appears in: + +- **OpenStack Identity Service**. The URL is defined in the service + catalog. + +- **Tempauth**. The URL is provided in the ``X-Storage-Url`` response + header. + +In both cases, the URL is the full URL and includes the account +resource. + +The Object Storage API supports the standard, non-serialized response +format, which is the default, and both JSON and XML serialized response +formats. + +The Object Storage system organizes data in a hierarchy, as follows: + +- **Account**. Represents the top-level of the hierarchy. + + Your service provider creates your account and you own all resources + in that account. The account defines a namespace for containers. A + container might have the same name in two different accounts. + + In the OpenStack environment, *account* is synonymous with a project + or tenant. + +- **Container**. Defines a namespace for objects. An object with the + same name in two different containers represents two different + objects. You can create any number of containers within an account. + + In addition to containing objects, you can also use the container to + control access to objects by using an access control list (ACL). You + cannot store an ACL with individual objects. + + In addition, you configure and control many other features, such as + object versioning, at the container level. + + You can bulk-delete up to 10,000 containers in a single request. + + You can set a storage policy on a container with predefined names + and definitions from your cloud provider. + +- **Object**. Stores data content, such as documents, images, and so + on. You can also store custom metadata with an object. + + With the Object Storage API, you can: + + - Store an unlimited number of objects. Each object can be as large + as 5 GB, which is the default. You can configure the maximum + object size. + + - Upload and store objects of any size with large object creation. 
+ + - Use cross-origin resource sharing to manage object security. + + - Compress files using content-encoding metadata. + + - Override browser behavior for an object using content-disposition metadata. + + - Schedule objects for deletion. + + - Bulk-delete up to 10,000 objects in a single request. + + - Auto-extract archive files. + + - Generate a URL that provides time-limited **GET** access to an + object. + + - Upload objects directly to the Object Storage system from a + browser by using form **POST** middleware. + + - Create symbolic links to other objects. + +The account, container, and object hierarchy affects the way you +interact with the Object Storage API. + +Specifically, the resource path reflects this structure and has this +format: + +.. code:: none + + /v1/{account}/{container}/{object} + +For example, for the ``flowers/rose.jpg`` object in the ``images`` +container in the ``12345678912345`` account, the resource path is: + +.. code:: none + + /v1/12345678912345/images/flowers/rose.jpg + +Notice that the object name contains the ``/`` character. This slash +does not indicate that Object Storage has a sub-hierarchy called +``flowers`` because containers do not store objects in actual +sub-folders. However, the inclusion of ``/`` or a similar convention +inside object names enables you to create pseudo-hierarchical folders +and directories. + +For example, if the endpoint for Object Storage is +``objects.mycloud.com``, the returned URL is +``https://objects.mycloud.com/v1/12345678912345``. + +To access a container, append the container name to the resource path. + +To access an object, append the container and the object name to the +path. + +If you have a large number of containers or objects, you can use query +parameters to page through large lists of containers or objects. Use the +``marker``, ``limit``, and ``end_marker`` query parameters to +control how many items are returned in a list and where the list starts +or ends. If you want to page through in reverse order, you can use the query +parameter ``reverse``, noting that your marker and end_markers should be +switched when applied to a reverse listing. I.e, for a list of objects +``[a, b, c, d, e]`` the non-reversed could be: + +.. code:: none + + /v1/{account}/{container}/?marker=a&end_marker=d + b + c + +However, when reversed marker and end_marker are applied to a reversed list: + +.. code:: none + + /v1/{account}/{container}/?marker=d&end_marker=a&reverse=on + c + b + +Object Storage HTTP requests have the following default constraints. +Your service provider might use different default values. + +============================ ============= ===== +Item Maximum value Notes +============================ ============= ===== +Number of HTTP headers 90 +Length of HTTP headers 4096 bytes +Length per HTTP request line 8192 bytes +Length of HTTP request 5 GB +Length of container names 256 bytes Cannot contain the ``/`` character. +Length of object names 1024 bytes By default, there are no character restrictions. +============================ ============= ===== + +You must UTF-8-encode and then URL-encode container and object names +before you call the API binding. If you use an API binding that performs +the URL-encoding for you, do not URL-encode the names before you call +the API binding. Otherwise, you double-encode these names. Check the +length restrictions against the URL-encoded string. 
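A minimal sketch of this encoding step, using only the Python standard library (the
account ID and names are placeholders):

.. code:: python

    from urllib.parse import quote

    # Hypothetical names containing non-ASCII characters.
    container = 'résumé-uploads'
    object_name = 'photos/fleur été.jpg'

    # quote() UTF-8-encodes and then percent-encodes. Keep "/" literal in
    # object names so pseudo-hierarchical folders are preserved.
    encoded_container = quote(container, safe='')
    encoded_object = quote(object_name, safe='/')

    path = '/v1/12345678912345/%s/%s' % (encoded_container, encoded_object)
    print(path)

    # Check the length limits (256 / 1024 bytes) against the encoded names.
    print(len(encoded_container), len(encoded_object))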
+ +The API Reference describes the operations that you can perform with the +Object Storage API: + +- `Storage + accounts `__: + Use to perform account-level tasks. + + Lists containers for a specified account. Creates, updates, and + deletes account metadata. Shows account metadata. + +- `Storage + containers `__: + Use to perform container-level tasks. + + Lists objects in a specified container. Creates, shows details for, + and deletes containers. Creates, updates, shows, and deletes + container metadata. + +- `Storage + objects `__: + Use to perform object-level tasks. + + Creates, replaces, shows details for, and deletes objects. Copies + objects with another object with a new or different name. Updates + object metadata. diff --git a/doc/source/api/object_versioning.rst b/doc/source/api/object_versioning.rst new file mode 100644 index 0000000000..b3438a6e49 --- /dev/null +++ b/doc/source/api/object_versioning.rst @@ -0,0 +1,351 @@ +================= +Object versioning +================= + +You can store multiple versions of your content so that you can recover +from unintended overwrites. Object versioning is an easy way to +implement version control, which you can use with any type of content. + +.. note:: + You cannot version a large-object manifest file, but the large-object + manifest file can point to versioned segments. + +.. note:: + It is strongly recommended that you put non-current objects in a + different container than the container where current object versions + reside. + +To allow object versioning within a cluster, the cloud provider should add the +``versioned_writes`` filter to the pipeline and set the +``allow_versioned_writes`` option to ``true`` in the +``[filter:versioned_writes]`` section of the proxy-server configuration file. + +To enable object versioning for a container, you must specify an "archive +container" that will retain non-current versions via either the +``X-Versions-Location`` or ``X-History-Location`` header. These two headers +enable two distinct modes of operation. Either mode may be used within a +cluster, but only one mode may be active for any given container. You must +UTF-8-encode and then URL-encode the container name before you include it in +the header. + +For both modes, **PUT** requests will archive any pre-existing objects before +writing new data, and **GET** requests will serve the current version. **COPY** +requests behave like a **GET** followed by a **PUT**; that is, if the copy +*source* is in a versioned container then the current version will be copied, +and if the copy *destination* is in a versioned container then any pre-existing +object will be archived before writing new data. + +If object versioning was enabled using ``X-History-Location``, then object +**DELETE** requests will copy the current version to the archive container then +remove it from the versioned container. + +If object versioning was enabled using ``X-Versions-Location``, then object +**DELETE** requests will restore the most-recent version from the archive +container, overwriting the current version. + +Example Using ``X-Versions-Location`` +------------------------------------- + +#. Create the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" -H "X-Versions-Location: archive" + + .. 
code:: console + + HTTP/1.1 201 Created + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txb91810fb717347d09eec8-0052e18997 + X-Openstack-Request-Id: txb91810fb717347d09eec8-0052e18997 + Date: Thu, 23 Jan 2014 21:28:55 GMT + +#. Create the first version of an object in the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current/my_object --data-binary 1 -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 201 Created + Last-Modified: Thu, 23 Jan 2014 21:31:22 GMT + Content-Length: 0 + Etag: d41d8cd98f00b204e9800998ecf8427e + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx5992d536a4bd4fec973aa-0052e18a2a + X-Openstack-Request-Id: tx5992d536a4bd4fec973aa-0052e18a2a + Date: Thu, 23 Jan 2014 21:31:22 GMT + + Nothing is written to the non-current version container when you + initially **PUT** an object in the ``current`` container. However, + subsequent **PUT** requests that edit an object trigger the creation + of a version of that object in the ``archive`` container. + + These non-current versions are named as follows: + + .. code:: none + + / + + Where ``length`` is the 3-character, zero-padded hexadecimal + character length of the object, ```` is the object name, + and ```` is the time when the object was initially created + as a current version. + +#. Create a second version of the object in the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current/my_object --data-binary 2 -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 201 Created + Last-Modified: Thu, 23 Jan 2014 21:41:32 GMT + Content-Length: 0 + Etag: d41d8cd98f00b204e9800998ecf8427e + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx468287ce4fc94eada96ec-0052e18c8c + X-Openstack-Request-Id: tx468287ce4fc94eada96ec-0052e18c8c + Date: Thu, 23 Jan 2014 21:41:32 GMT + +#. Issue a **GET** request to a versioned object to get the current + version of the object. You do not have to do any request redirects or + metadata lookups. + + List older versions of the object in the ``archive`` container: + + .. code:: console + + # curl -i $publicURL/archive?prefix=009my_object -X GET -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 200 OK + Content-Length: 30 + X-Container-Object-Count: 1 + Accept-Ranges: bytes + X-Timestamp: 1390513280.79684 + X-Container-Bytes-Used: 0 + Content-Type: text/plain; charset=utf-8 + X-Trans-Id: tx9a441884997542d3a5868-0052e18d8e + X-Openstack-Request-Id: tx9a441884997542d3a5868-0052e18d8e + Date: Thu, 23 Jan 2014 21:45:50 GMT + + 009my_object/1390512682.92052 + + .. note:: + A **POST** request to a versioned object updates only the metadata + for the object and does not create a new version of the object. New + versions are created only when the content of the object changes. + +#. Issue a **DELETE** request to a versioned object to remove the + current version of the object and replace it with the next-most + current version in the non-current container. + + .. code:: console + + # curl -i $publicURL/current/my_object -X DELETE -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx006d944e02494e229b8ee-0052e18edd + X-Openstack-Request-Id: tx006d944e02494e229b8ee-0052e18edd + Date: Thu, 23 Jan 2014 21:51:25 GMT + + List objects in the ``archive`` container to show that the archived + object was moved back to the ``current`` container: + + .. 
code:: console + + # curl -i $publicURL/archive?prefix=009my_object -X GET -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 204 No Content + Content-Length: 0 + X-Container-Object-Count: 0 + Accept-Ranges: bytes + X-Timestamp: 1390513280.79684 + X-Container-Bytes-Used: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx044f2a05f56f4997af737-0052e18eed + X-Openstack-Request-Id: tx044f2a05f56f4997af737-0052e18eed + Date: Thu, 23 Jan 2014 21:51:41 GMT + + This next-most current version carries with it any metadata last set + on it. If want to completely remove an object and you have five + versions of it, you must **DELETE** it five times. + +Example Using ``X-History-Location`` +------------------------------------ + +#. Create the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" -H "X-History-Location: archive" + + .. code:: console + + HTTP/1.1 201 Created + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txb91810fb717347d09eec8-0052e18997 + X-Openstack-Request-Id: txb91810fb717347d09eec8-0052e18997 + Date: Thu, 23 Jan 2014 21:28:55 GMT + +#. Create the first version of an object in the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current/my_object --data-binary 1 -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 201 Created + Last-Modified: Thu, 23 Jan 2014 21:31:22 GMT + Content-Length: 0 + Etag: d41d8cd98f00b204e9800998ecf8427e + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx5992d536a4bd4fec973aa-0052e18a2a + X-Openstack-Request-Id: tx5992d536a4bd4fec973aa-0052e18a2a + Date: Thu, 23 Jan 2014 21:31:22 GMT + + Nothing is written to the non-current version container when you + initially **PUT** an object in the ``current`` container. However, + subsequent **PUT** requests that edit an object trigger the creation + of a version of that object in the ``archive`` container. + + These non-current versions are named as follows: + + .. code:: none + + / + + Where ``length`` is the 3-character, zero-padded hexadecimal + character length of the object, ```` is the object name, + and ```` is the time when the object was initially created + as a current version. + +#. Create a second version of the object in the ``current`` container: + + .. code:: console + + # curl -i $publicURL/current/my_object --data-binary 2 -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 201 Created + Last-Modified: Thu, 23 Jan 2014 21:41:32 GMT + Content-Length: 0 + Etag: d41d8cd98f00b204e9800998ecf8427e + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx468287ce4fc94eada96ec-0052e18c8c + X-Openstack-Request-Id: tx468287ce4fc94eada96ec-0052e18c8c + Date: Thu, 23 Jan 2014 21:41:32 GMT + +#. Issue a **GET** request to a versioned object to get the current + version of the object. You do not have to do any request redirects or + metadata lookups. + + List older versions of the object in the ``archive`` container: + + .. code:: console + + # curl -i $publicURL/archive?prefix=009my_object -X GET -H "X-Auth-Token: $token" + + .. 
code:: console + + HTTP/1.1 200 OK + Content-Length: 30 + X-Container-Object-Count: 1 + Accept-Ranges: bytes + X-Timestamp: 1390513280.79684 + X-Container-Bytes-Used: 0 + Content-Type: text/plain; charset=utf-8 + X-Trans-Id: tx9a441884997542d3a5868-0052e18d8e + X-Openstack-Request-Id: tx9a441884997542d3a5868-0052e18d8e + Date: Thu, 23 Jan 2014 21:45:50 GMT + + 009my_object/1390512682.92052 + + .. note:: + A **POST** request to a versioned object updates only the metadata + for the object and does not create a new version of the object. New + versions are created only when the content of the object changes. + +#. Issue a **DELETE** request to a versioned object to copy the + current version of the object to the archive container then delete it from + the current container. Subsequent **GET** requests to the object in the + current container will return ``404 Not Found``. + + .. code:: console + + # curl -i $publicURL/current/my_object -X DELETE -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx006d944e02494e229b8ee-0052e18edd + X-Openstack-Request-Id: tx006d944e02494e229b8ee-0052e18edd + Date: Thu, 23 Jan 2014 21:51:25 GMT + + List older versions of the object in the ``archive`` container: + + .. code:: console + + # curl -i $publicURL/archive?prefix=009my_object -X GET -H "X-Auth-Token: $token" + + .. code:: console + + HTTP/1.1 200 OK + Content-Length: 90 + X-Container-Object-Count: 3 + Accept-Ranges: bytes + X-Timestamp: 1390513280.79684 + X-Container-Bytes-Used: 0 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: tx044f2a05f56f4997af737-0052e18eed + X-Openstack-Request-Id: tx044f2a05f56f4997af737-0052e18eed + Date: Thu, 23 Jan 2014 21:51:41 GMT + + 009my_object/1390512682.92052 + 009my_object/1390512692.23062 + 009my_object/1390513885.67732 + + In addition to the two previous versions of the object, the archive + container has a "delete marker" to record when the object was deleted. + + To permanently delete a previous version, issue a **DELETE** to the version + in the archive container. + +Disabling Object Versioning +--------------------------- + +To disable object versioning for the ``current`` container, remove +its ``X-Versions-Location`` metadata header by sending an empty key +value. + +.. code:: console + + # curl -i $publicURL/current -X PUT -H "Content-Length: 0" -H "X-Auth-Token: $token" -H "X-Versions-Location: " + +.. code:: console + + HTTP/1.1 202 Accepted + Content-Length: 76 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txe2476de217134549996d0-0052e19038 + X-Openstack-Request-Id: txe2476de217134549996d0-0052e19038 + Date: Thu, 23 Jan 2014 21:57:12 GMT + +

    <html>
      <h1>Accepted</h1>
      <p>The request is accepted for processing.</p>
    </html>
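
The same enable/disable cycle can be scripted. A minimal Python sketch (``requests``
library, placeholder storage URL and token) that turns versioning on with
``X-Versions-Location`` and later turns it off by sending an empty value, mirroring the
curl examples above:

.. code:: python

    import requests

    # Hypothetical values: substitute your own storage URL and token.
    public_url = 'https://storage.example.com/v1/AUTH_12345678912345'
    token = 'AUTH_tk0123456789abcdef'

    # Enable versioning: non-current versions of objects in "current"
    # are written to the "archive" container.
    requests.put(public_url + '/current',
                 headers={'X-Auth-Token': token,
                          'X-Versions-Location': 'archive'})

    # Disable versioning again by sending an empty header value.
    resp = requests.put(public_url + '/current',
                        headers={'X-Auth-Token': token,
                                 'X-Versions-Location': ''})
    print(resp.status_code)  # expect 202 Accepted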

+ diff --git a/doc/source/api/pagination.rst b/doc/source/api/pagination.rst new file mode 100644 index 0000000000..8af0f42baa --- /dev/null +++ b/doc/source/api/pagination.rst @@ -0,0 +1,137 @@ +================================================= +Page through large lists of containers or objects +================================================= + +If you have a large number of containers or objects, you can use the +``marker``, ``limit``, and ``end_marker`` parameters to control +how many items are returned in a list and where the list starts or ends. +If you want to page backwards you can use the ``reverse`` parameter. + +* marker + When you request a list of containers or objects, Object Storage + returns a maximum of 10,000 names for each request. To get + subsequent names, you must make another request with the + ``marker`` parameter. Set the ``marker`` parameter to the name of + the last item returned in the previous list. You must URL-encode the + ``marker`` value before you send the HTTP request. Object Storage + returns a maximum of 10,000 names starting after the last item + returned. + +* limit + To return fewer than 10,000 names, use the ``limit`` parameter. If + the number of names returned equals the specified ``limit`` (or + 10,000 if you omit the ``limit`` parameter), you can assume there + are more names to list. If the number of names in the list is + exactly divisible by the ``limit`` value, the last request has no + content. + +* end_marker + Limits the result set to names that are less than the + ``end_marker`` parameter value. You must URL-encode the + ``end_marker`` value before you send the HTTP request. + +* reverse + By default, listings are returned sorted by name, ascending. If you + include the ``reverse=true`` query parameter, the listing will be + returned sorted by name, descending. + +To page through a large list of containers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Assume the following list of container names: + +.. code-block:: console + + apples + bananas + kiwis + oranges + pears + +#. Use a ``limit`` of two: + + .. code-block:: console + + # curl -i $publicURL/?limit=2 -X GET -H "X-Auth-Token: $token" + + .. code-block:: console + + apples + bananas + + Because two container names are returned, there are more names to + list. + +#. Make another request with a ``marker`` parameter set to the name of + the last item returned: + + .. code-block:: console + + # curl -i $publicURL/?limit=2&marker=bananas -X GET -H \ + “X-Auth-Token: $token" + + .. code-block:: console + + kiwis + oranges + + Again, two items are returned, and there might be more. + +#. Make another request with a ``marker`` of the last item returned: + + .. code-block:: console + + # curl -i $publicURL/?limit=2&marker=oranges -X GET -H \" + X-Auth-Token: $token" + + .. code-block:: console + + pears + + You receive a one-item response, which is fewer than the ``limit`` + number of names. This indicates that this is the end of the list. + +#. Use the ``end_marker`` parameter to limit the result set to object + names that are less than the ``end_marker`` parameter value: + + .. code-block:: console + + # curl -i $publicURL/?end_marker=oranges -X GET -H \" + X-Auth-Token: $token" + + .. code-block:: console + + apples + bananas + kiwis + + You receive a result set of all container names before the + ``end-marker`` value. + +#. Use the ``reverse`` parameter to work from the back of the + list: + + .. 
code-block:: console + + # curl -i $publicURL/?reverse=true -X GET -H \" + X-Auth-Token: $token" + + .. code-block:: console + + pears + oranges + kiwis + bananas + apples + +#. You can also combine parameters: + + .. code-block:: console + + # curl -i $publicURL/?reverse=true&end_marker=kiwis -X GET -H \" + X-Auth-Token: $token" + + .. code-block:: console + + pears + oranges diff --git a/doc/source/api/pseudo-hierarchical-folders-directories.rst b/doc/source/api/pseudo-hierarchical-folders-directories.rst new file mode 100644 index 0000000000..c7e764f2db --- /dev/null +++ b/doc/source/api/pseudo-hierarchical-folders-directories.rst @@ -0,0 +1,155 @@ +=========================================== +Pseudo-hierarchical folders and directories +=========================================== + +Although you cannot nest directories in OpenStack Object Storage, you +can simulate a hierarchical structure within a single container by +adding forward slash characters (``/``) in the object name. To navigate +the pseudo-directory structure, you can use the ``delimiter`` query +parameter. This example shows you how to use pseudo-hierarchical folders +and directories. + +.. note:: + + In this example, the objects reside in a container called ``backups``. + Within that container, the objects are organized in a pseudo-directory + called ``photos``. The container name is not displayed in the example, + but it is a part of the object URLs. For instance, the URL of the + picture ``me.jpg`` is + ``https://swift.example.com/v1/CF_xer7_343/backups/photos/me.jpg``. + +List pseudo-hierarchical folders request: HTTP +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To display a list of all the objects in the storage container, use +``GET`` without a ``delimiter`` or ``prefix``. + +.. code-block:: console + + $ curl -X GET -i -H "X-Auth-Token: $token" \ + $publicurl/v1/AccountString/backups + +The system returns status code 2xx (between 200 and 299, inclusive) and +the requested list of the objects. + +.. code-block:: console + + photos/animals/cats/persian.jpg + photos/animals/cats/siamese.jpg + photos/animals/dogs/corgi.jpg + photos/animals/dogs/poodle.jpg + photos/animals/dogs/terrier.jpg + photos/me.jpg + photos/plants/fern.jpg + photos/plants/rose.jpg + +Use the delimiter parameter to limit the displayed results. To use +``delimiter`` with pseudo-directories, you must use the parameter slash +(``/``). + +.. code-block:: console + + $ curl -X GET -i -H "X-Auth-Token: $token" \ + $publicurl/v1/AccountString/backups?delimiter=/ + +The system returns status code 2xx (between 200 and 299, inclusive) and +the requested matching objects. Because you use the slash, only the +pseudo-directory ``photos/`` displays. The returned values from a slash +``delimiter`` query are not real objects. The value will refer to +a real object if it does not end with a slash. The pseudo-directories +have no content-type, rather, each pseudo-directory has +its own ``subdir`` entry in the response of JSON and XML results. +For example: + +.. code-block:: JSON + + [ + { + "subdir": "photos/" + } + ] + +.. code-block:: XML + + + + + photos/ + + + +Use the ``prefix`` and ``delimiter`` parameters to view the objects +inside a pseudo-directory, including further nested pseudo-directories. + +.. 
code-block:: console + + $ curl -X GET -i -H "X-Auth-Token: $token" \ + $publicurl/v1/AccountString/backups?prefix=photos/&delimiter=/ + +The system returns status code 2xx (between 200 and 299, inclusive) and +the objects and pseudo-directories within the top level +pseudo-directory. + +.. code-block:: console + + photos/animals/ + photos/me.jpg + photos/plants/ + +.. code-block:: JSON + + [ + { + "subdir": "photos/animals/" + }, + { + "hash": "b249a153f8f38b51e92916bbc6ea57ad", + "last_modified": "2015-12-03T17:31:28.187370", + "bytes": 2906, + "name": "photos/me.jpg", + "content_type": "image/jpeg" + }, + { + "subdir": "photos/plants/" + } + ] + +.. code-block:: XML + + + + + photos/animals/ + + + photos/me.jpg + b249a153f8f38b51e92916bbc6ea57ad + 2906 + image/jpeg + 2015-12-03T17:31:28.187370 + + + photos/plants/ + + + +You can create an unlimited number of nested pseudo-directories. To +navigate through them, use a longer ``prefix`` parameter coupled with +the ``delimiter`` parameter. In this sample output, there is a +pseudo-directory called ``dogs`` within the pseudo-directory +``animals``. To navigate directly to the files contained within +``dogs``, enter the following command: + +.. code-block:: console + + $ curl -X GET -i -H "X-Auth-Token: $token" \ + $publicurl/v1/AccountString/backups?prefix=photos/animals/dogs/&delimiter=/ + +The system returns status code 2xx (between 200 and 299, inclusive) and +the objects and pseudo-directories within the nested pseudo-directory. + +.. code-block:: console + + photos/animals/dogs/corgi.jpg + photos/animals/dogs/poodle.jpg + photos/animals/dogs/terrier.jpg diff --git a/doc/source/api/serialized-response-formats.rst b/doc/source/api/serialized-response-formats.rst new file mode 100644 index 0000000000..8e60c7fcf6 --- /dev/null +++ b/doc/source/api/serialized-response-formats.rst @@ -0,0 +1,119 @@ +=========================== +Serialized response formats +=========================== + +By default, the Object Storage API uses a ``text/plain`` response +format. In addition, both JSON and XML data serialization response +formats are supported. + +To define the response format, use one of these methods: + ++-------------------+-------------------------------------------------------+ +|Method |Description | ++===================+=======================================================+ +|format= ``format`` |Append this parameter to the URL for a ``GET`` request,| +|query parameter |where ``format`` is ``json`` or ``xml``. | ++-------------------+-------------------------------------------------------+ +|``Accept`` request |Include this header in the ``GET`` request. | +|header |The valid header values are: | +| | | +| |text/plain | +| | Plain text response format. The default. | +| |application/jsontext | +| | JSON data serialization response format. | +| |application/xml | +| | XML data serialization response format. | +| |text/xml | +| | XML data serialization response format. | ++-------------------+-------------------------------------------------------+ + +Example 1. JSON example with format query parameter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For example, this request uses the ``format`` query parameter to ask +for a JSON response: + +.. code-block:: console + + $ curl -i $publicURL?format=json -X GET -H "X-Auth-Token: $token" + +.. 
code-block:: console + + HTTP/1.1 200 OK + Content-Length: 96 + X-Account-Object-Count: 1 + X-Timestamp: 1389453423.35964 + X-Account-Meta-Subject: Literature + X-Account-Bytes-Used: 14 + X-Account-Container-Count: 2 + Content-Type: application/json; charset=utf-8 + Accept-Ranges: bytes + X-Trans-Id: tx274a77a8975c4a66aeb24-0052d95365 + Date: Fri, 17 Jan 2014 15:59:33 GMT + +Object Storage lists container names with additional information in JSON +format: + +.. code-block:: json + + [ + { + "count":0, + "bytes":0, + "name":"janeausten" + }, + { + "count":1, + "bytes":14, + "name":"marktwain" + } + ] + + +Example 2. XML example with Accept header +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This request uses the ``Accept`` request header to ask for an XML +response: + +.. code-block:: console + + $ curl -i $publicURL -X GET -H "X-Auth-Token: $token" -H \ + "Accept: application/xml; charset=utf-8" + +.. code-block:: console + + HTTP/1.1 200 OK + Content-Length: 263 + X-Account-Object-Count: 3 + X-Account-Meta-Book: MobyDick + X-Timestamp: 1389453423.35964 + X-Account-Bytes-Used: 47 + X-Account-Container-Count: 2 + Content-Type: application/xml; charset=utf-8 + Accept-Ranges: bytes + X-Trans-Id: txf0b4c9727c3e491694019-0052e03420 + Date: Wed, 22 Jan 2014 21:12:00 GMT + +Object Storage lists container names with additional information in XML +format: + +.. code-block:: xml + + + + + janeausten + 2 + 33 + + + marktwain + 1 + 14 + + + +The remainder of the examples in this guide use standard, non-serialized +responses. However, all ``GET`` requests that perform list operations +accept the ``format`` query parameter or ``Accept`` request header. diff --git a/doc/source/api/static-website.rst b/doc/source/api/static-website.rst new file mode 100644 index 0000000000..48dd34c9df --- /dev/null +++ b/doc/source/api/static-website.rst @@ -0,0 +1,120 @@ +.. _static-website: + +===================== +Create static website +===================== + +To discover whether your Object Storage system supports this feature, +see :ref:`discoverability`. Alternatively, check with your service +provider. + +You can use your Object Storage account to create a static website. This +static website is created with Static Web middleware and serves container +data with a specified index file, error file resolution, and optional +file listings. This mode is normally active only for anonymous requests, +which provide no authentication token. To use it with authenticated +requests, set the header ``X-Web-Mode`` to ``TRUE`` on the request. + +The Static Web filter must be added to the pipeline in your +``/etc/swift/proxy-server.conf`` file below any authentication +middleware. You must also add a Static Web middleware configuration +section. + +Your publicly readable containers are checked for two headers, +``X-Container-Meta-Web-Index`` and ``X-Container-Meta-Web-Error``. The +``X-Container-Meta-Web-Error`` header is discussed below, in the +section called :ref:`set_error_static_website`. + +Use ``X-Container-Meta-Web-Index`` to determine the index file (or +default page served, such as ``index.html``) for your website. When +someone initially enters your site, the ``index.html`` file displays +automatically. If you create sub-directories for your site by creating +pseudo-directories in your container, the index page for each +sub-directory is displayed by default. If your pseudo-directory does not +have a file with the same name as your index file, visits to the +sub-directory return a 404 error. 
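The ``swift`` client commands shown below set these values as container metadata; the
equivalent raw API call simply sends the corresponding header in a ``POST`` to the
container. A hedged sketch with the Python ``requests`` library (placeholder storage
URL, token, and container name):

.. code:: python

    import requests

    # Hypothetical values: substitute your own storage URL and token.
    public_url = 'https://storage.example.com/v1/AUTH_12345678912345'
    token = 'AUTH_tk0123456789abcdef'

    # Serve index.html by default when visitors browse the container.
    resp = requests.post(public_url + '/container',
                         headers={'X-Auth-Token': token,
                                  'X-Container-Meta-Web-Index': 'index.html'})
    print(resp.status_code)  # expect 204 No Content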
+ +You also have the option of displaying a list of files in your +pseudo-directory instead of a web page. To do this, set the +``X-Container-Meta-Web-Listings`` header to ``TRUE``. You may add styles +to your file listing by setting ``X-Container-Meta-Web-Listings-CSS`` +to a style sheet (for example, ``lists.css``). + +Static Web middleware through Object Storage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following sections show how to use Static Web middleware through +Object Storage. + +Make container publicly readable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Make the container publicly readable. Once the container is publicly +readable, you can access your objects directly, but you must set the +index file to browse the main site URL and its sub-directories. + +.. code-block:: console + + $ swift post -r '.r:*,.rlistings' container + + +Set site index file +^^^^^^^^^^^^^^^^^^^ + +Set the index file. In this case, ``index.html`` is the default file +displayed when the site appears. + +.. code-block:: console + + $ swift post -m 'web-index:index.html' container + +Enable file listing +^^^^^^^^^^^^^^^^^^^ + +Turn on file listing. If you do not set the index file, the URL displays +a list of the objects in the container. Instructions on styling the list +with a CSS follow. + +.. code-block:: console + + $ swift post -m 'web-listings: true' container + +Enable CSS for file listing +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Style the file listing using a CSS. + +.. code-block:: console + + $ swift post -m 'web-listings-css:listings.css' container + +.. _set_error_static_website: + +Set error pages for static website +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can create and set custom error pages for visitors to your website; +currently, only 401 (Unauthorized) and 404 (Not Found) errors are +supported. To do this, set the metadata header, +``X-Container-Meta-Web-Error``. + +Error pages are served with the status code pre-pended to the name of +the error page you set. For instance, if you set +``X-Container-Meta-Web-Error`` to ``error.html``, 401 errors will +display the page ``401error.html``. Similarly, 404 errors will display +``404error.html``. You must have both of these pages created in your +container when you set the ``X-Container-Meta-Web-Error`` metadata, or +your site will display generic error pages. + +You only have to set the ``X-Container-Meta-Web-Error`` metadata once +for your entire static website. + +Set error pages for static website request +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: console + + $ swift post -m 'web-error:error.html' container + + +Any 2\ ``nn`` response indicates success. diff --git a/doc/source/api/temporary_url_middleware.rst b/doc/source/api/temporary_url_middleware.rst new file mode 100644 index 0000000000..767b4722aa --- /dev/null +++ b/doc/source/api/temporary_url_middleware.rst @@ -0,0 +1,230 @@ +======================== +Temporary URL middleware +======================== + +To discover whether your Object Storage system supports this feature, +check with your service provider or send a **GET** request using the ``/info`` +path. + +A temporary URL gives users temporary access to objects. For example, a +website might want to provide a link to download a large object in +Object Storage, but the Object Storage account has no public access. The +website can generate a URL that provides time-limited **GET** access to +the object. 
When the web browser user clicks on the link, the browser +downloads the object directly from Object Storage, eliminating the need +for the website to act as a proxy for the request. + +Furthermore, a temporary URL can be prefix-based. These URLs +contain a signature which is valid for all objects which share +a common prefix. They are useful for sharing a set of objects. + +Ask your cloud administrator to enable the temporary URL feature. For +information, see :ref:`tempurl` in the *Source Documentation*. + +.. note:: + + To use **POST** requests to upload objects to specific Object Storage + locations, use :doc:`form_post_middleware` instead of temporary URL middleware. + +Temporary URL format +~~~~~~~~~~~~~~~~~~~~ + +A temporary URL is comprised of the URL for an object with added query +parameters: + +**Example Temporary URL format** + +.. code:: none + + https://swift-cluster.example.com/v1/my_account/container/object + ?temp_url_sig=732fcac368abb10c78a4cbe95c3fab7f311584532bf779abd5074e13cbe8b88b + &temp_url_expires=1323479485 + &filename=My+Test+File.pdf + +The example shows these elements: + + +**Object URL**: Required. The full path URL to the object. + +**temp\_url\_sig**: Required. An HMAC cryptographic signature that defines +the allowed HTTP method, expiration date, full path to the object, and the +secret key for the temporary URL. The digest used (for example, SHA-256 or +SHA-512) must be supported by the cluster; supported digests will be listed +in the ``tempurl.allowed_digests`` key in the cluster's capabilities. + +**temp\_url\_expires**: Required. An expiration date as a UNIX Epoch timestamp +or ISO 8601 UTC timestamp. For example, ``1390852007`` or +``2014-01-27T19:46:47Z`` can be used to represent +``Mon, 27 Jan 2014 19:46:47 GMT``. + +For more information, see `Epoch & Unix Timestamp Conversion +Tools `__. + +**filename**: Optional. Overrides the default file name. Object Storage +generates a default file name for **GET** temporary URLs that is based on the +object name. Object Storage returns this value in the ``Content-Disposition`` +response header. Browsers can interpret this file name value as a file +attachment to be saved. + +A prefix-based temporary URL is similar but requires the parameter +``temp_url_prefix``, which must be equal to the common prefix shared +by all object names for which the URL is valid. + +.. code:: none + + https://swift-cluster.example.com/v1/my_account/container/my_prefix/object + ?temp_url_sig=732fcac368abb10c78a4cbe95c3fab7f311584532bf779abd5074e13cbe8b88b + &temp_url_expires=2011-12-10T01:11:25Z + &temp_url_prefix=my_prefix + +.. _secret_keys: + +Secret Keys +~~~~~~~~~~~ + +The cryptographic signature used in Temporary URLs and also in +:doc:`form_post_middleware` uses a secret key. Object Storage allows you to +store two secret key values per account, and two per container. When validating +a request, Object Storage checks signatures against all keys. Using two keys at +each level enables key rotation without invalidating existing temporary URLs. 
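
For example, a simple key rotation, sketched here with placeholder key values
and using the account-level headers described below, sets the second key to a
new secret and only replaces the original key once previously issued URLs are
no longer needed:

.. code:: console

   $ swift post -m "Temp-URL-Key-2:NEWKEY"
   $ swift post -m "Temp-URL-Key:NEWKEY"

Between the two commands, signatures made with either ``MYKEY`` or ``NEWKEY``
validate, so URLs signed with the old key keep working until you retire it.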
+ +To set the keys at the account level, set one or both of the following +request headers to arbitrary values on a **POST** request to the account: + +- ``X-Account-Meta-Temp-URL-Key`` + +- ``X-Account-Meta-Temp-URL-Key-2`` + +To set the keys at the container level, set one or both of the following +request headers to arbitrary values on a **POST** or **PUT** request to the +container: + +- ``X-Container-Meta-Temp-URL-Key`` + +- ``X-Container-Meta-Temp-URL-Key-2`` + +The arbitrary values serve as the secret keys. + +For example, use the **swift post** command to set the secret key to +*``MYKEY``*: + +.. code:: console + + $ swift post -m "Temp-URL-Key:MYKEY" + +.. note:: + + Changing these headers invalidates any previously generated temporary + URLs within 60 seconds, which is the memcache time for the key. + +HMAC signature for temporary URLs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Temporary URL middleware uses an HMAC cryptographic signature. This +signature includes these elements: + +- The allowed method. Typically, **GET** or **PUT**. + +- Expiry time. In the example for the HMAC-SHA256 signature for temporary + URLs below, the expiry time is set to ``86400`` seconds (or 1 day) + into the future. Please be aware that you have to use a UNIX timestamp + for generating the signature (in the API request it is also allowed to + use an ISO 8601 UTC timestamp). + +- The path. Starting with ``/v1/`` onwards and including a container + name and object. The path for prefix-based signatures must start with + ``prefix:/v1/``. Do not URL-encode the path at this stage. + +- The secret key. Use one of the key values as described + in :ref:`secret_keys`. + +These sample Python codes show how to compute a signature for use with +temporary URLs: + +**Example HMAC-SHA256 signature for object-based temporary URLs** + +.. code:: python + + import hmac + from hashlib import sha256 + from time import time + method = 'GET' + duration_in_seconds = 60*60*24 + expires = int(time() + duration_in_seconds) + path = '/v1/my_account/container/object' + key = 'MYKEY' + hmac_body = '%s\n%s\n%s' % (method, expires, path) + signature = hmac.new(key, hmac_body, sha256).hexdigest() + +**Example HMAC-SHA512 signature for prefix-based temporary URLs** + +.. code:: python + + import hmac + from hashlib import sha512 + from time import time + method = 'GET' + duration_in_seconds = 60*60*24 + expires = int(time() + duration_in_seconds) + path = 'prefix:/v1/my_account/container/my_prefix' + key = 'MYKEY' + hmac_body = '%s\n%s\n%s' % (method, expires, path) + signature = hmac.new(key, hmac_body, sha512).hexdigest() + +Do not URL-encode the path when you generate the HMAC signature. +However, when you make the actual HTTP request, you should properly +URL-encode the URL. + +The *``MYKEY``* value is one of the key values as described +in :ref:`secret_keys`. + +For more information, see `RFC 2104: HMAC: Keyed-Hashing for Message +Authentication `__. + +If you want to transform a UNIX timestamp into an ISO 8601 UTC timestamp, +you can use following code snippet: + +.. code:: python + + import time + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(timestamp)) + +Using the ``swift`` tool to generate a Temporary URL +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``swift`` tool provides the tempurl_ option that +auto-generates the *``temp_url_sig``* and *``temp_url_expires``* query +parameters. For example, you might run this command: + +.. 
code:: console + + $ swift tempurl GET 3600 /v1/my_account/container/object MYKEY + +.. note:: + + The ``swift`` tool is not yet updated and continues to use the + deprecated cipher SHA1. + +This command returns the path: + +.. code:: none + + /v1/my_account/container/object + ?temp_url_sig=5c4cc8886f36a9d0919d708ade98bf0cc71c9e91 + &temp_url_expires=1374497657 + +To create the temporary URL, prefix this path with the Object Storage +storage host name. For example, prefix the path with +``https://swift-cluster.example.com``, as follows: + +.. code:: none + + https://swift-cluster.example.com/v1/my_account/container/object + ?temp_url_sig=5c4cc8886f36a9d0919d708ade98bf0cc71c9e91 + &temp_url_expires=1374497657 + +Note that if the above example is copied exactly, and used in a command +shell, then the ampersand is interpreted as an operator and the URL +will be truncated. Enclose the URL in quotation marks to avoid this. + +.. _tempurl: https://docs.openstack.org/python-swiftclient/latest/cli/index.html#swift-tempurl diff --git a/doc/source/api/use_content-encoding_metadata.rst b/doc/source/api/use_content-encoding_metadata.rst new file mode 100644 index 0000000000..18c94878e6 --- /dev/null +++ b/doc/source/api/use_content-encoding_metadata.rst @@ -0,0 +1,22 @@ +============================= +Use Content-Encoding metadata +============================= + +When you create an object or update its metadata, you can optionally set +the ``Content-Encoding`` metadata. This metadata enables you to indicate +that the object content is compressed without losing the identity of the +underlying media type (``Content-Type``) of the file, such as a video. + +**Example Content-Encoding header request: HTTP** + +This example assigns an attachment type to the ``Content-Encoding`` +header that indicates how the file is downloaded: + +.. code:: none + + PUT //// HTTP/1.1 + Host: storage.clouddrive.com + X-Auth-Token: eaaafd18-0fed-4b3a-81b4-663c99ec1cbb + Content-Type: video/mp4 + Content-Encoding: gzip + diff --git a/doc/source/api/use_the_content-disposition_metadata.rst b/doc/source/api/use_the_content-disposition_metadata.rst new file mode 100644 index 0000000000..fc6cf95fc7 --- /dev/null +++ b/doc/source/api/use_the_content-disposition_metadata.rst @@ -0,0 +1,31 @@ +==================================== +Use the Content-Disposition metadata +==================================== + +To override the default behavior for a browser, use the +``Content-Disposition`` header to specify the override behavior and +assign this header to an object. For example, this header might specify +that the browser use a download program to save this file rather than +show the file, which is the default. + +**Example Override browser default behavior request: HTTP** + +This example assigns an attachment type to the ``Content-Disposition`` +header. This attachment type indicates that the file is to be downloaded +as ``goodbye.txt``: + +.. code:: console + + # curl -i $publicURL/marktwain/goodbye -X POST -H "X-Auth-Token: $token" -H "Content-Length: 14" -H "Content-Type: application/octet-stream" -H "Content-Disposition: attachment; filename=goodbye.txt" + +.. code:: console + + HTTP/1.1 202 Accepted + Content-Length: 76 + Content-Type: text/html; charset=UTF-8 + X-Trans-Id: txa9b5e57d7f354d7ea9f57-0052e17e13 + X-Openstack-Request-Id: txa9b5e57d7f354d7ea9f57-0052e17e13 + Date: Thu, 23 Jan 2014 20:39:47 GMT + +

   <html><h1>Accepted</h1><p>The request is accepted for processing.</p></html>

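
Once the metadata is set, the override is returned on subsequent downloads of
the object. A sketch of the expected exchange (headers other than
``Content-Disposition`` are illustrative):

.. code:: console

   # curl -i $publicURL/marktwain/goodbye -X GET -H "X-Auth-Token: $token"

.. code:: console

   HTTP/1.1 200 OK
   Content-Length: 14
   Content-Type: application/octet-stream
   Content-Disposition: attachment; filename=goodbye.txt
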
+ diff --git a/doc/source/associated_projects.rst b/doc/source/associated_projects.rst index 10e01fbc4e..27db41fa3d 100644 --- a/doc/source/associated_projects.rst +++ b/doc/source/associated_projects.rst @@ -3,49 +3,89 @@ Associated Projects =================== +.. _application-bindings: Application Bindings -------------------- * OpenStack supported binding: - * `Python-SwiftClient `_ + * `Python-SwiftClient `_ -* Made for Cloud Files, but mostly work with Swift too: +* Unofficial libraries and bindings: - * `CSharp-CloudFiles `_ - * `Java-CloudFiles `_ - * `PHP-CloudFiles `_ - * `Python-CloudFiles `_ - * `Ruby-CloudFiles `_ + * PHP -* `RSwift `_ - Unofficial R API bindings. + * `PHP-opencloud `_ - Official Rackspace PHP + bindings that should work for other Swift deployments too. + + * Ruby + + * `swift_client `_ - + Small but powerful Ruby client to interact with OpenStack Swift + * `nightcrawler_swift `_ - + This Ruby gem teleports your assets to an OpenStack Swift bucket/container + * `swift storage `_ - + Simple OpenStack Swift storage client. + + * Java + + * `libcloud `_ - Apache Libcloud - a unified + interface in Python for different clouds with OpenStack Swift support. + * `jclouds `_ - + Java library offering bindings for all OpenStack projects + * `java-openstack-swift `_ - + Java bindings for OpenStack Swift + * `javaswift `_ - Collection of Java tools for Swift + + * Bash + + * `supload `_ - Bash script to + upload file to cloud storage based on OpenStack Swift API. + + * .NET + + * `openstacknetsdk.org `_ - An OpenStack + Cloud SDK for Microsoft .NET. + + * Go + + * `Go language bindings `_ + * `Gophercloud an OpenStack SDK for Go `_ -* `Go language bindings `_ Authentication -------------- -* `Keystone `_ - Official Identity Service for OpenStack. -* `Swauth `_ - Older Swift authentication service that only requires Swift itself. +* `Keystone `_ - Official Identity + Service for OpenStack. +* `Swauth `_ - **RETIRED**: An alternative Swift + authentication service that only requires Swift itself. +* `Basicauth `_ - HTTP Basic + authentication support (keystone backed). Command Line Access ------------------- -* `Swiftly `_ - Alternate command line access to Swift with direct (no proxy) access capabilities as well. +* `Swiftly `_ - Alternate command line + access to Swift with direct (no proxy) access capabilities as well. Log Processing -------------- -* `Slogging `_ - Basic stats and logging tools. +* `slogging `_ - Basic stats and + logging tools. Monitoring & Statistics ----------------------- -* `Swift Informant `_ - Swift Proxy Middleware to send events to a statsd instance. +* `Swift Informant `_ - + Swift proxy Middleware to send events to a statsd instance. +* `Swift Inspector `_ - + Swift middleware to relay information about a request back to the client. Content Distribution Network Integration @@ -57,8 +97,18 @@ Content Distribution Network Integration Alternative API --------------- -* `Swift3 `_ - Amazon S3 API emulation. -* `CDMI `_ - CDMI support +* `ProxyFS `_ - Integrated file and + object access for Swift object storage +* `SwiftHLM `_ - a middleware for + using OpenStack Swift with tape and other high latency media storage + backends. + + +Benchmarking/Load Generators +---------------------------- + +* `getput `_ - getput tool suite +* `COSbench `_ - COSbench tool suite .. 
_custom-logger-hooks-label: @@ -66,9 +116,51 @@ Alternative API Custom Logger Hooks ------------------- -* `swift-sentry `_ - Sentry exception reporting for Swift +* `swift-sentry `_ - + Sentry exception reporting for Swift + +Storage Backends (DiskFile API implementations) +----------------------------------------------- +* `Swift-on-File `_ - + Enables objects created using Swift API to be accessed as files on a POSIX + filesystem and vice versa. +* `swift-scality-backend `_ - + Scality sproxyd object server implementation for Swift. + +Developer Tools +--------------- +* `SAIO bash scripts `_ - + Well commented simple bash scripts for Swift all in one setup. +* `vagrant-swift-all-in-one + `_ - Quickly setup a + standard development environment using Vagrant and Chef cookbooks in an + Ubuntu virtual machine. +* `SAIO Ansible playbook `_ - + Quickly setup a standard development environment using Vagrant and Ansible in + a Fedora virtual machine (with built-in `Swift-on-File + `_ support). +* `Multi Swift `_ - + Bash scripts to spin up multiple Swift clusters sharing the same hardware + Other ----- -* `Glance `_ - Provides services for discovering, registering, and retrieving virtual machine images (for OpenStack Compute [Nova], for example). +* `Glance `_ - Provides services for + discovering, registering, and retrieving virtual machine images + (for OpenStack Compute [Nova], for example). +* `Django Swiftbrowser `_ - + Simple Django web app to access OpenStack Swift. +* `Swift-account-stats `_ - + Swift-account-stats is a tool to report statistics on Swift usage at + tenant and global levels. +* `PyECLib `_ - High-level erasure code + library used by Swift +* `liberasurecode `_ - Low-level + erasure code library used by PyECLib +* `Swift Browser `_ - JavaScript + interface for Swift +* `swift-ui `_ - OpenStack Swift + web browser +* `swiftbackmeup `_ - + Utility that allows one to create backups and upload them to OpenStack Swift diff --git a/doc/source/audit_watchers.rst b/doc/source/audit_watchers.rst new file mode 100644 index 0000000000..51ca39537e --- /dev/null +++ b/doc/source/audit_watchers.rst @@ -0,0 +1,12 @@ +.. _common_audit_watchers: + +********************* +Object Audit Watchers +********************* + +.. _dark_data: + +Dark Data +========= + +.. automodule:: swift.obj.watchers.dark_data diff --git a/doc/source/conf.py b/doc/source/conf.py index c1cb24ddf9..c8a0d7afb9 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,18 @@ # -*- coding: utf-8 -*- -# Copyright (c) 2010-2012 OpenStack, LLC. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) 2010-2012 OpenStack Foundation. # # Swift documentation build configuration file, created by # sphinx-quickstart on Tue May 18 13:50:15 2010. @@ -13,8 +26,19 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
-import sys +import datetime +import logging import os +import sys + +# NOTE(amotoki): Our current doc build job uses an older version of +# liberasurecode which comes from Ubuntu 16.04. +# pyeclib emits a warning message if liberasurecode <1.3.1 is used [1] and +# this causes the doc build failure if warning-is-error is enabled in Sphinx. +# As a workaround we suppress the warning message from pyeclib until we use +# a newer version of liberasurecode in our doc build job. +# [1] https://github.com/openstack/pyeclib/commit/d163972b +logging.getLogger('pyeclib').setLevel(logging.ERROR) # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -26,56 +50,47 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', - 'sphinx.ext.ifconfig'] +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig', + 'openstackdocstheme', + 'sphinxcontrib.rsvgconverter'] todo_include_todos = True # Add any paths that contain templates here, relative to this directory. -# Changing the path so that the Hudson build output contains GA code and the -# source docs do not contain the code so local, offline sphinx builds are -# "clean." -templates_path = [] -if os.getenv('HUDSON_PUBLISH_DOCS'): - templates_path = ['_ga', '_templates'] -else: - templates_path = ['_templates'] +# templates_path = [] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8' +# source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. -project = u'Swift' -copyright = u'2011-present, OpenStack, LLC' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -from swift import __version__ -version = __version__.rsplit('.', 1)[0] -# The full version, including alpha/beta/rc tags. -release = __version__ +project = 'Swift' +if 'SOURCE_DATE_EPOCH' in os.environ: + now = float(os.environ.get('SOURCE_DATE_EPOCH')) + now = datetime.datetime.fromtimestamp(now, tz=datetime.timezone.utc) +else: + now = datetime.date.today() +copyright = '%d, OpenStack Foundation' % now.year # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. -#unused_docs = [] +# unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. @@ -83,21 +98,21 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. 
-#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. show_authors = True # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = 'native' # A list of ignored prefixes for module index sorting. modindex_common_prefix = ['swift.'] @@ -108,74 +123,75 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. # html_theme = 'default' -html_theme_path = ["."] -html_theme = '_theme' +# html_theme_path = ["."] +html_theme = 'openstackdocs' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +html_theme_options = { + # turn off the "these docs aren't current" banner + 'display_badge': False, +} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' -git_cmd = "git log --pretty=format:'%ad, commit %h' --date=local -n1" -html_last_updated_fmt = os.popen(git_cmd).read() +# Add any paths that contain "extra" files, such as .htaccess or +# robots.txt. +html_extra_path = ['_extra'] # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_use_modindex = True +# html_use_modindex = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. 
The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' +# html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'swiftdoc' @@ -184,37 +200,41 @@ # -- Options for LaTeX output ------------------------------------------------- # The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' +# latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' +# latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'Swift.tex', u'Swift Documentation', - u'Swift Team', 'manual'), + ('index', 'doc-swift.tex', 'Swift Documentation', + 'Swift Team', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # Additional stuff for the LaTeX preamble. -#latex_preamble = '' +# latex_preamble = '' # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_use_modindex = True +# latex_use_modindex = True + +latex_use_xindy = False -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'python': ('http://docs.python.org/', None), - 'nova': ('http://nova.openstack.org', None), - 'glance': ('http://glance.openstack.org', None)} +# -- Options for openstackdocstheme ------------------------------------------- +openstackdocs_repo_name = 'openstack/swift' +openstackdocs_pdf_link = True +openstackdocs_auto_name = False +openstackdocs_bug_project = 'swift' +openstackdocs_bug_tag = '' diff --git a/doc/source/config/account_server_config.rst b/doc/source/config/account_server_config.rst new file mode 100644 index 0000000000..f305c021b0 --- /dev/null +++ b/doc/source/config/account_server_config.rst @@ -0,0 +1,377 @@ +.. _account-server-config: + +---------------------------- +Account Server Configuration +---------------------------- + +This document describes the configuration options available for the account +server. Documentation for other swift configuration options can be found at +:doc:`index`. + +An example Account Server configuration can be found at +etc/account-server.conf-sample in the source code repository. + +The following configuration sections are available: + +* :ref:`[DEFAULT] ` +* `[account-server]`_ +* `[account-replicator]`_ +* `[account-auditor]`_ +* `[account-reaper]`_ + +.. 
_account_server_default_options: + +********* +[DEFAULT] +********* + +=============================== ========== ============================================= +Option Default Description +------------------------------- ---------- --------------------------------------------- +swift_dir /etc/swift Swift configuration directory +devices /srv/node Parent directory or where devices are mounted +mount_check true Whether or not check if the devices are + mounted to prevent accidentally writing + to the root device +bind_ip 0.0.0.0 IP Address for server to bind to +bind_port 6202 Port for server to bind to +keep_idle 600 Value to set for socket TCP_KEEPIDLE +bind_timeout 30 Seconds to attempt bind before giving up +backlog 4096 Maximum number of allowed pending + connections +workers auto Override the number of pre-forked workers + that will accept connections. If set it + should be an integer, zero means no fork. If + unset, it will try to default to the number + of effective cpu cores and fallback to one. + Increasing the number of workers may reduce + the possibility of slow file system + operations in one request from negatively + impacting other requests. See + :ref:`general-service-tuning`. +max_clients 1024 Maximum number of clients one worker can + process simultaneously (it will actually + accept(2) N + 1). Setting this to one (1) + will only handle one request at a time, + without accepting another request + concurrently. +user swift User to run as +db_preallocation off If you don't mind the extra disk space usage in + overhead, you can turn this on to preallocate + disk space with SQLite databases to decrease + fragmentation. +disable_fallocate false Disable "fast fail" fallocate checks if the + underlying filesystem does not support it. +log_name swift Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +log_max_line_length 0 Caps the length of log lines to the + value given; no limit if set to 0, the + default. +log_custom_handlers None Comma-separated list of functions to call + to setup custom log handlers. +log_udp_host Override log_address +log_udp_port 514 UDP log port +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. +log_statsd_port 8125 +log_statsd_default_sample_rate 1.0 +log_statsd_sample_rate_factor 1.0 +log_statsd_metric_prefix +eventlet_debug false If true, turn on debug logging for eventlet +fallocate_reserve 1% You can set fallocate_reserve to the + number of bytes or percentage of disk + space you'd like fallocate to reserve, + whether there is space for the given + file size or not. Percentage will be used + if the value ends with a '%'. This is + useful for systems that behave badly when + they completely run out of space; you can + make the services pretend they're out of + space early. +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. 
+ionice_priority None I/O scheduling priority of server processes. + I/O niceness priority is a number which + goes from 0 to 7. The higher the value, + the lower the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +=============================== ========== ============================================= + +**************** +[account-server] +**************** + +============================= ============== ========================================== +Option Default Description +----------------------------- -------------- ------------------------------------------ +use Entry point for paste.deploy for the account + server. For most cases, this should be + ``egg:swift#account``. +set log_name account-server Label used when logging +set log_facility LOG_LOCAL0 Syslog log facility +set log_level INFO Logging level +set log_requests True Whether or not to log each + request +set log_address /dev/log Logging directory +replication_server Configure parameter for creating + specific server. To handle all verbs, + including replication verbs, do not + specify "replication_server" + (this is the default). To only + handle replication, set to a True + value (e.g. "True" or "1"). + To handle only non-replication + verbs, set to "False". Unless you + have a separate replication network, you + should not specify any value for + "replication_server". +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +============================= ============== ========================================== + +******************** +[account-replicator] +******************** + +==================== ========================= ===================================== +Option Default Description +-------------------- ------------------------- ------------------------------------- +log_name account-replicator Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +per_diff 1000 Maximum number of database rows + that will be sync'd in a single + HTTP replication request. + Databases with less than or + equal to this number of + differing rows will always be + sync'd using an HTTP replication + request rather than using rsync. +max_diffs 100 Maximum number of HTTP + replication requests attempted + on each replication pass for any + one container. This caps how + long the replicator will spend + trying to sync a given database + per pass so the other databases + don't get starved. +concurrency 8 Number of replication workers + to spawn +interval 30 Time in seconds to wait between + replication passes +databases_per_second 50 Maximum databases to process + per second. Should be tuned + according to individual + system specs. 
0 is unlimited. +node_timeout 10 Request timeout to external + services +conn_timeout 0.5 Connection timeout to external + services +reclaim_age 604800 Time elapsed in seconds before + an account can be reclaimed +rsync_module {replication_ip}::account Format of the rsync module where + the replicator will send data. + The configuration value can + include some variables that will + be extracted from the ring. + Variables must follow the format + {NAME} where NAME is one of: ip, + port, replication_ip, + replication_port, region, zone, + device, meta. See + etc/rsyncd.conf-sample for some + examples. +rsync_compress no Allow rsync to compress data + which is transmitted to + destination node during sync. + However, this is applicable only + when destination node is in a + different region than the local + one. NOTE: Objects that are + already compressed (for example: + .tar.gz, mp3) might slow down + the syncing process. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE + (best-effort), and IOPRIO_CLASS_IDLE + (idle). + The default does not modify + class and priority. Linux supports + io scheduling priorities and classes + since 2.6.13 with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority + is a number which goes from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. +handoffs_only no When handoffs_only mode is enabled + the replicator will *only* replicate + from handoff nodes to primary nodes + and will not sync primary nodes + with other primary nodes. +handoff_delete auto the number of replicas which are + ensured in swift. If the number + less than the number of replicas + is set, account-replicator + could delete local handoffs even + if all replicas are not ensured in + the cluster. The replicator would + remove local handoff account database + after syncing when the number of + successful responses is greater than + or equal to this number. By default + handoff partitions will be removed + when it has successfully replicated + to all the canonical nodes. +==================== ========================= ===================================== + +***************** +[account-auditor] +***************** + +==================== ================ ======================================= +Option Default Description +-------------------- ---------------- --------------------------------------- +log_name account-auditor Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +interval 1800 Minimum time for a pass to take +accounts_per_second 200 Maximum accounts audited per second. + Should be tuned according to individual + system specs. 0 is unlimited. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. 
+ionice_class None I/O scheduling class of server processes. + I/O niceness class values are + IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +==================== ================ ======================================= + +**************** +[account-reaper] +**************** + +================== =============== ========================================= +Option Default Description +------------------ --------------- ----------------------------------------- +log_name account-reaper Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +concurrency 25 Number of replication workers to spawn +interval 3600 Minimum time for a pass to take +node_timeout 10 Request timeout to external services +conn_timeout 0.5 Connection timeout to external services +delay_reaping 0 Normally, the reaper begins deleting + account information for deleted accounts + immediately; you can set this to delay + its work however. The value is in seconds, + 2592000 = 30 days, for example. The sum of + this value and the container-updater + ``interval`` should be less than the + account-replicator ``reclaim_age``. This + ensures that once the account-reaper has + deleted a container there is sufficient + time for the container-updater to report + to the account before the account DB is + removed. +reap_warn_after 2892000 If the account fails to be reaped due + to a persistent error, the account reaper + will log a message such as: + Account has not been reaped since + You can search logs for this message if + space is not being reclaimed after you + delete account(s). This is in addition to + any time requested by delay_reaping. +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +================== =============== ========================================= diff --git a/doc/source/config/container_server_config.rst b/doc/source/config/container_server_config.rst new file mode 100644 index 0000000000..6f7d6031a3 --- /dev/null +++ b/doc/source/config/container_server_config.rst @@ -0,0 +1,650 @@ +.. 
_container-server-config: + +------------------------------ +Container Server Configuration +------------------------------ + +This document describes the configuration options available for the container +server. Documentation for other swift configuration options can be found at +:doc:`index`. + +An example Container Server configuration can be found at +etc/container-server.conf-sample in the source code repository. + +The following configuration sections are available: + +* :ref:`[DEFAULT] ` +* `[container-server]`_ +* `[container-replicator]`_ +* `[container-sharder]`_ +* `[container-updater]`_ +* `[container-auditor]`_ + +.. _container_server_default_options: + +********* +[DEFAULT] +********* + +=============================== ========== ============================================ +Option Default Description +------------------------------- ---------- -------------------------------------------- +swift_dir /etc/swift Swift configuration directory +devices /srv/node Parent directory of where devices are mounted +mount_check true Whether or not check if the devices are + mounted to prevent accidentally writing + to the root device +bind_ip 0.0.0.0 IP Address for server to bind to +bind_port 6201 Port for server to bind to +keep_idle 600 Value to set for socket TCP_KEEPIDLE +bind_timeout 30 Seconds to attempt bind before giving up +backlog 4096 Maximum number of allowed pending + connections +workers auto Override the number of pre-forked workers + that will accept connections. If set it + should be an integer, zero means no fork. If + unset, it will try to default to the number + of effective cpu cores and fallback to one. + Increasing the number of workers may reduce + the possibility of slow file system + operations in one request from negatively + impacting other requests. See + :ref:`general-service-tuning`. +max_clients 1024 Maximum number of clients one worker can + process simultaneously (it will actually + accept(2) N + 1). Setting this to one (1) + will only handle one request at a time, + without accepting another request + concurrently. +user swift User to run as +disable_fallocate false Disable "fast fail" fallocate checks if the + underlying filesystem does not support it. +log_name swift Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +log_max_line_length 0 Caps the length of log lines to the + value given; no limit if set to 0, the + default. +log_custom_handlers None Comma-separated list of functions to call + to setup custom log handlers. +log_udp_host Override log_address +log_udp_port 514 UDP log port +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. +log_statsd_port 8125 +log_statsd_default_sample_rate 1.0 +log_statsd_sample_rate_factor 1.0 +log_statsd_metric_prefix +eventlet_debug false If true, turn on debug logging for eventlet +fallocate_reserve 1% You can set fallocate_reserve to the + number of bytes or percentage of disk + space you'd like fallocate to reserve, + whether there is space for the given + file size or not. Percentage will be used + if the value ends with a '%'. This is + useful for systems that behave badly when + they completely run out of space; you can + make the services pretend they're out of + space early. 
+db_preallocation off If you don't mind the extra disk space usage + in overhead, you can turn this on to preallocate + disk space with SQLite databases to decrease + fragmentation. +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 + with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server processes. + I/O niceness priority is a number which + goes from 0 to 7. The higher the value, + the lower the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +=============================== ========== ============================================ + +****************** +[container-server] +****************** + +============================== ================ ======================================== +Option Default Description +------------------------------ ---------------- ---------------------------------------- +use paste.deploy entry point for the + container server. For most cases, this + should be ``egg:swift#container``. +set log_name container-server Label used when logging +set log_facility LOG_LOCAL0 Syslog log facility +set log_level INFO Logging level +set log_requests True Whether or not to log each + request +set log_address /dev/log Logging directory +node_timeout 3 Request timeout to external services +conn_timeout 0.5 Connection timeout to external services +allow_versions false Enable/Disable object versioning feature +replication_server Configure parameter for creating + specific server. To handle all verbs, + including replication verbs, do not + specify "replication_server" + (this is the default). To only + handle replication, set to a True + value (e.g. "True" or "1"). + To handle only non-replication + verbs, set to "False". Unless you + have a separate replication network, you + should not specify any value for + "replication_server". +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are + IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. 
+============================== ================ ======================================== + +********************** +[container-replicator] +********************** + +==================== =========================== ======================================= +Option Default Description +-------------------- --------------------------- --------------------------------------- +log_name container-replicator Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +per_diff 1000 Maximum number of database + rows that will be sync'd in a + single HTTP replication + request. Databases with less + than or equal to this number + of differing rows will always + be sync'd using an HTTP + replication request rather + than using rsync. +max_diffs 100 Maximum number of HTTP + replication requests attempted + on each replication pass for + any one container. This caps + how long the replicator will + spend trying to sync a given + database per pass so the other + databases don't get starved. +concurrency 8 Number of replication workers + to spawn +interval 30 Time in seconds to wait + between replication passes +databases_per_second 50 Maximum databases to process + per second. Should be tuned + according to individual + system specs. 0 is unlimited. +node_timeout 10 Request timeout to external + services +conn_timeout 0.5 Connection timeout to external + services +reclaim_age 604800 Time elapsed in seconds before + a container can be reclaimed +rsync_module {replication_ip}::container Format of the rsync module + where the replicator will send + data. The configuration value + can include some variables + that will be extracted from + the ring. Variables must + follow the format {NAME} where + NAME is one of: ip, port, + replication_ip, + replication_port, region, + zone, device, meta. See + etc/rsyncd.conf-sample for + some examples. +rsync_compress no Allow rsync to compress data + which is transmitted to + destination node during sync. + However, this is applicable + only when destination node is + in a different region than the + local one. NOTE: Objects that + are already compressed (for + example: .tar.gz, mp3) might + slow down the syncing process. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are + IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify + class and priority. Linux + supports io scheduling + priorities and classes since + 2.6.13 with the CFQ io + scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of + server processes. I/O niceness + priority is a number which goes + from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. +handoffs_only no When handoffs_only mode is enabled + the replicator will *only* replicate + from handoff nodes to primary nodes + and will not sync primary nodes + with other primary nodes. +handoff_delete auto the number of replicas which are + ensured in swift. 
If the number + less than the number of replicas + is set, container-replicator + could delete local handoffs even + if all replicas are not ensured in + the cluster. The replicator would + remove local handoff container database + after syncing when the number of + successful responses is greater than + or equal to this number. By default + handoff partitions will be removed + when it has successfully replicated + to all the canonical nodes. +==================== =========================== ======================================= + +******************* +[container-sharder] +******************* + +The container-sharder re-uses features of the container-replicator and inherits +the following configuration options defined for the `[container-replicator]`_: + +* interval +* databases_per_second +* per_diff +* max_diffs +* concurrency +* node_timeout +* conn_timeout +* reclaim_age +* rsync_compress +* rsync_module +* recon_cache_path + +Some config options in this section may also be used by the +:ref:`swift-manage-shard-ranges CLI tool `. + +================================= ================= ======================================= +Option Default Description +--------------------------------- ----------------- --------------------------------------- +log_name container-sharder Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory + + +auto_shard false If the auto_shard option + is true then the sharder + will automatically select + containers to shard, scan + for shard ranges, and + select shards to shrink. + Warning: auto-sharding is + still under development + and should not be used in + production; do not set + this option to true in a + production cluster. + +shard_container_threshold 1000000 This defines the + object count at which a + container with + container-sharding + enabled will start to + shard. This also + indirectly determines the + the defaults for + rows_per_shard, + shrink_threshold and + expansion_limit. + +rows_per_shard 500000 This defines the initial + nominal size of shard + containers. The default + is shard_container_threshold // 2. + +minimum_shard_size 100000 Minimum size of the final + shard range. If this is + greater than one then the + final shard range may be + extended to more than + rows_per_shard in order + to avoid a further shard + range with less than + minimum_shard_size rows. + The default value is + rows_per_shard // 5. + +shrink_threshold This defines the + object count below which + a 'donor' shard container + will be considered for + shrinking into another + 'acceptor' shard + container. The default is + determined by + shard_shrink_point. If + set, shrink_threshold + will take precedence over + shard_shrink_point. + +shard_shrink_point 10 Deprecated: shrink_threshold + is recommended and if set + will take precedence over + shard_shrink_point. + This defines the + object count below which + a 'donor' shard container + will be considered for + shrinking into another + 'acceptor' shard + container. + shard_shrink_point is a + percentage of + shard_container_threshold + e.g. the default value of + 10 means 10% of the + shard_container_threshold. + +expansion_limit This defines the + maximum allowed size of + an acceptor shard + container after having a + donor merged into it. The + default is determined by + shard_shrink_merge_point. + If set, expansion_limit + will take precedence over + shard_shrink_merge_point. 
+ +shard_shrink_merge_point 75 Deprecated: expansion_limit + is recommended and if set + will take precedence over + shard_shrink_merge_point. + This defines the + maximum allowed size of + an acceptor shard + container after having a + donor merged into it. + Shard_shrink_merge_point + is a percentage of + shard_container_threshold. + e.g. the default value of + 75 means that the + projected sum of a donor + object count and acceptor + count must be less than + 75% of shard_container_threshold + for the donor to be + allowed to merge into the + acceptor. + + For example, if + shard_container_threshold + is 1 million, + shard_shrink_point is 10, + and shard_shrink_merge_point + is 75 then a shard will + be considered for + shrinking if it has less + than or equal to 100 + thousand objects but will + only merge into an + acceptor if the combined + object count would be + less than or equal to 750 + thousand objects. + + +shard_scanner_batch_size 10 When auto-sharding is + enabled this defines the + maximum number of shard + ranges that will be found + each time the sharder + daemon visits a sharding + container. If necessary + the sharder daemon will + continue to search for + more shard ranges each + time it visits the + container. + +cleave_batch_size 2 Defines the number of + shard ranges that will be + cleaved each time the + sharder daemon visits a + sharding container. + +cleave_row_batch_size 10000 Defines the size of + batches of object rows + read from a sharding + container and merged to a + shard container during + cleaving. + +shard_replication_quorum auto Defines the number of + successfully replicated + shard dbs required when + cleaving a previously + uncleaved shard range + before the sharder will + progress to the next + shard range. The value + should be less than or + equal to the container + ring replica count. The + default of 'auto' causes + the container ring quorum + value to be used. This + option only applies to + the container-sharder + replication and does not + affect the number of + shard container replicas + that will eventually be + replicated by the + container-replicator. + + +existing_shard_replication_quorum auto Defines the number of + successfully replicated + shard dbs required when + cleaving a shard range + that has been previously + cleaved on another node + before the sharder will + progress to the next + shard range. The value + should be less than or + equal to the container + ring replica count. The + default of 'auto' causes + the shard_replication_quorum + value to be used. This + option only applies to + the container-sharder + replication and does not + affect the number of + shard container replicas + that will eventually be + replicated by the + container-replicator. + +internal_client_conf_path see description The sharder uses an + internal client to create + and make requests to + containers. The absolute + path to the client config + file can be configured. + Defaults to + /etc/swift/internal-client.conf + +request_tries 3 The number of time the + internal client will + retry requests. + +recon_candidates_limit 5 Each time the sharder + dumps stats to the recon + cache file it includes a + list of containers that + appear to need sharding + but are not yet sharding. + By default this list is + limited to the top 5 + containers, ordered by + object count. The limit + may be changed by setting + recon_candidates_limit to + an integer value. A + negative value implies no + limit. 
+ +broker_timeout 60 Large databases tend to + take a while to work + with, but we want to make + sure we write down our + progress. Use a + larger-than-normal broker + timeout to make us less + likely to bomb out on a + LockTimeout. +================================= ================= ======================================= + +******************* +[container-updater] +******************* + +======================== ================= ================================== +Option Default Description +------------------------ ----------------- ---------------------------------- +log_name container-updater Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +interval 300 Minimum time for a pass to take +concurrency 4 Number of updater workers to spawn +node_timeout 3 Request timeout to external + services +conn_timeout 0.5 Connection timeout to external + services +containers_per_second 50 Maximum containers updated per second. + Should be tuned according to individual + system specs. 0 is unlimited. + +slowdown 0.01 Time in seconds to wait between + containers. Deprecated in favor of + containers_per_second. +account_suppression_time 60 Seconds to suppress updating an + account that has generated an + error (timeout, not yet found, + etc.) +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values range + from -20 (most favorable to the + process) to 19 (least favorable + to the process). The default does + not modify priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +======================== ================= ================================== + +******************* +[container-auditor] +******************* + +===================== ================= ======================================= +Option Default Description +--------------------- ----------------- --------------------------------------- +log_name container-auditor Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +interval 1800 Minimum time for a pass to take +containers_per_second 200 Maximum containers audited per second. + Should be tuned according to individual + system specs. 0 is unlimited. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are + IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. 
+ Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +===================== ================= ======================================= diff --git a/doc/source/config/global_memcache_config.rst b/doc/source/config/global_memcache_config.rst new file mode 100644 index 0000000000..21ef6c43a2 --- /dev/null +++ b/doc/source/config/global_memcache_config.rst @@ -0,0 +1,73 @@ +.. _memcache-config: + +----------------------------- +Global Memcache Configuration +----------------------------- + +This document describes the configuration options available for the global swift memcache configuration +which usually lives under /etc/swift/memcache.conf. +Documentation for other swift configuration options can be found at +:doc:`index`. + +An example memcache.conf configuration can be found at +etc/memcache.conf-sample in the source code repository. + +There is only 1 configuration section available: + +* :ref:`[memcache] ` + + +.. _memcache_conf_memcache_section: + +********** +[memcache] +********** + +=========================== =============== ============================================= +Option Default Description +--------------------------- --------------- --------------------------------------------- +memcache_servers 127.0.0.1:11211 Comma separated list of memcached servers + ip:port or [ipv6addr]:port +memcache_max_connections 2 Max number of connections to each memcached + server per worker +connect_timeout 0.3 Timeout for connection +pool_timeout 1.0 Timeout for pooled connection +tries 3 Number of servers to retry on failures + getting a pooled connection +io_timeout 2.0 Timeout for read and writes +error_suppression_interval 60.0 How long without an error before a server's + error count is reset. This will also be how + long before a server is reenabled after + suppression is triggered. + Set to 0 to disable error-limiting. +error_suppression_limit 10 How many errors can accumulate before a + server is temporarily ignored +item_size_warning_threshold -1 If an item size ever gets above + item_size_warning_threshold then a warning + will be logged. This can be used to alert + when memcache item sizes are getting to + their limit. + It's an absolute size in bytes. Setting the + value to 0 will warn on every memcache set. + A value of -1 disables the warning +tls_enabled False (Optional) Global toggle for TLS usage + when comunicating with the caching servers +tls_cafile (Optional) Path to a file of concatenated + CA certificates in PEM format necessary to + establish the caching server's authenticity. + If tls_enabled is False, this option is + ignored. +tls_certfile (Optional) Path to a single file in PEM + format containing the client's certificate + as well as any number of CA certificates + needed to establish the certificate's + authenticity. This file is only required + when client side authentication is + necessary. If tls_enabled is False, + this option is ignored +tls_keyfile (Optional) Path to a single file containing + the client's private key in. Otherwhise the + private key will be taken from the file + specified in tls_certfile. 
If tls_enabled + is False, this option is ignored +=========================== =============== ============================================= \ No newline at end of file diff --git a/doc/source/config/index.rst b/doc/source/config/index.rst new file mode 100644 index 0000000000..bfec36b78a --- /dev/null +++ b/doc/source/config/index.rst @@ -0,0 +1,18 @@ +=========================== +Configuration Documentation +=========================== + +.. toctree:: + :maxdepth: 2 + + swift_common_config.rst + proxy_server_config.rst + account_server_config.rst + container_server_config.rst + object_server_config.rst + global_memcache_config.rst + +Configuration options for middleware can be found at: + +* :doc:`../middleware` +* :doc:`../overview_auth` diff --git a/doc/source/config/object_server_config.rst b/doc/source/config/object_server_config.rst new file mode 100644 index 0000000000..f85ee5df7e --- /dev/null +++ b/doc/source/config/object_server_config.rst @@ -0,0 +1,765 @@ +.. _object-server-config: + +--------------------------- +Object Server Configuration +--------------------------- + +This document describes the configuration options available for the object +server. Documentation for other swift configuration options can be found at +:doc:`index`. + +An Example Object Server configuration can be found at +etc/object-server.conf-sample in the source code repository. + +The following configuration sections are available: + +* :ref:`[DEFAULT] ` +* `[object-server]`_ +* `[object-replicator]`_ +* `[object-reconstructor]`_ +* `[object-updater]`_ +* `[object-auditor]`_ +* `[object-expirer]`_ + +.. _object-server-default-options: + +********* +[DEFAULT] +********* + +================================ ========== ============================================ +Option Default Description +-------------------------------- ---------- -------------------------------------------- +swift_dir /etc/swift Swift configuration directory +devices /srv/node Parent directory of where devices are + mounted +mount_check true Whether or not check if the devices are + mounted to prevent accidentally writing + to the root device +bind_ip 0.0.0.0 IP Address for server to bind to +bind_port 6200 Port for server to bind to +keep_idle 600 Value to set for socket TCP_KEEPIDLE +bind_timeout 30 Seconds to attempt bind before giving up +backlog 4096 Maximum number of allowed pending + connections +workers auto Override the number of pre-forked workers + that will accept connections. If set it + should be an integer, zero means no fork. + If unset, it will try to default to the + number of effective cpu cores and fallback + to one. Increasing the number of workers + helps slow filesystem operations in one + request from negatively impacting other + requests, but only the + :ref:`servers_per_port + ` option + provides complete I/O isolation with no + measurable overhead. +servers_per_port 0 If each disk in each storage policy ring + has unique port numbers for its "ip" + value, you can use this setting to have + each object-server worker only service + requests for the single disk matching the + port in the ring. The value of this + setting determines how many worker + processes run for each port (disk) in the + ring. If you have 24 disks per server, and + this setting is 4, then each storage node + will have 1 + (24 * 4) = 97 total + object-server processes running. This + gives complete I/O isolation, drastically + reducing the impact of slow disks on + storage node performance. 
The + object-replicator and object-reconstructor + need to see this setting too, so it must + be in the [DEFAULT] section. + See :ref:`server-per-port-configuration`. +max_clients 1024 Maximum number of clients one worker can + process simultaneously (it will actually + accept(2) N + 1). Setting this to one (1) + will only handle one request at a time, + without accepting another request + concurrently. +disable_fallocate false Disable "fast fail" fallocate checks if + the underlying filesystem does not support + it. +log_name swift Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +log_max_line_length 0 Caps the length of log lines to the + value given; no limit if set to 0, the + default. +log_custom_handlers None Comma-separated list of functions to call + to setup custom log handlers. +log_udp_host Override log_address +log_udp_port 514 UDP log port +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. +log_statsd_port 8125 +log_statsd_default_sample_rate 1.0 +log_statsd_sample_rate_factor 1.0 +log_statsd_metric_prefix +eventlet_debug false If true, turn on debug logging for + eventlet +fallocate_reserve 1% You can set fallocate_reserve to the + number of bytes or percentage of disk + space you'd like fallocate to reserve, + whether there is space for the given + file size or not. Percentage will be used + if the value ends with a '%'. This is + useful for systems that behave badly when + they completely run out of space; you can + make the services pretend they're out of + space early. +conn_timeout 0.5 Time to wait while attempting to connect + to another backend node. +node_timeout 3 Time to wait while sending each chunk of + data to another backend node. +client_timeout 60 Time to wait while receiving each chunk of + data from a client or another backend node +network_chunk_size 65536 Size of chunks to read/write over the + network +disk_chunk_size 65536 Size of chunks to read/write to disk +container_update_timeout 1 Time to wait while sending a container + update on object update. +reclaim_age 604800 Time elapsed in seconds before the tombstone + file representing a deleted object can be + reclaimed. This is the maximum window for + your consistency engine. If a node that was + disconnected from the cluster because of a + fault is reintroduced into the cluster after + this window without having its data purged + it will result in dark data. This setting + should be consistent across all object + services. +commit_window 60 Non-durable data files may also + get reclaimed if they are older + than reclaim_age, but not if the + time they were written to disk + (i.e. mtime) is less than + commit_window seconds ago. A + commit_window greater than zero is + strongly recommended to avoid + unintended reclamation of data + files that were about to become + durable; commit_window should be + much less than reclaim_age. +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. 
Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +================================ ========== ============================================ + +.. _object-server-options: + +*************** +[object-server] +*************** + +================================== ====================== =============================================== +Option Default Description +---------------------------------- ---------------------- ----------------------------------------------- +use paste.deploy entry point for the + object server. For most cases, + this should be + ``egg:swift#object``. +set log_name object-server Label used when logging +set log_facility LOG_LOCAL0 Syslog log facility +set log_level INFO Logging level +set log_requests True Whether or not to log each + request +set log_address /dev/log Logging directory +user swift User to run as +max_upload_time 86400 Maximum time allowed to upload an + object +slow 0 If > 0, Minimum time in seconds for a PUT or + DELETE request to complete. This is only + useful to simulate slow devices during testing + and development. +mb_per_sync 512 On PUT requests, sync file every + n MB +keep_cache_size 5242880 Largest object size to keep in + buffer cache +keep_cache_private false Allow non-public objects to stay + in kernel's buffer cache +keep_cache_slo_manifest false Allow SLO object's manifest file to stay in + kernel's buffer cache if its size is under + keep_cache_size. This config will only matter + when 'keep_cache_private' is false. +allowed_headers Content-Disposition, Comma separated list of headers + Content-Encoding, that can be set in metadata on an object. + X-Delete-At, This list is in addition to + X-Object-Manifest, X-Object-Meta-* headers and cannot include + X-Static-Large-Object Content-Type, etag, Content-Length, or deleted + Cache-Control, + Content-Language, + Expires, + X-Robots-Tag +replication_server Configure parameter for creating + specific server. To handle all verbs, + including replication verbs, do not + specify "replication_server" + (this is the default). To only + handle replication, set to a True + value (e.g. "True" or "1"). + To handle only non-replication + verbs, set to "False". Unless you + have a separate replication network, you + should not specify any value for + "replication_server". +replication_concurrency 4 Set to restrict the number of + concurrent incoming SSYNC + requests; set to 0 for unlimited +replication_concurrency_per_device 1 Set to restrict the number of + concurrent incoming SSYNC + requests per device; set to 0 for + unlimited requests per devices. + This can help control I/O to each + device. This does not override + replication_concurrency described + above, so you may need to adjust + both parameters depending on your + hardware or network capacity. +replication_lock_timeout 15 Number of seconds to wait for an + existing replication device lock + before giving up. 
+replication_failure_threshold 100 The number of subrequest failures + before the + replication_failure_ratio is + checked +replication_failure_ratio 1.0 If the value of failures / + successes of SSYNC + subrequests exceeds this ratio, + the overall SSYNC request + will be aborted +splice no Use splice() for zero-copy object + GETs. This requires Linux kernel + version 3.0 or greater. If you set + "splice = yes" but the kernel + does not support it, error messages + will appear in the object server + logs at startup, but your object + servers should continue to function. +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +eventlet_tpool_num_threads auto The number of threads in eventlet's thread pool. + Most IO will occur in the object server's main + thread, but certain "heavy" IO operations will + occur in separate IO threads, managed by + eventlet. + The default value is auto, whose actual value + is dependent on the servers_per_port value. + If servers_per_port is zero then it uses + eventlet's default (currently 20 threads). + If the servers_per_port is nonzero then it'll + only use 1 thread per process. + This value can be overridden with an integer + value. +================================== ====================== =============================================== + +******************* +[object-replicator] +******************* + +=========================== ======================== ================================ +Option Default Description +--------------------------- ------------------------ -------------------------------- +log_name object-replicator Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +daemonize yes Whether or not to run replication + as a daemon +interval 30 Time in seconds to wait between + replication passes +concurrency 1 Number of replication jobs to + run per worker process +replicator_workers 0 Number of worker processes to use. + No matter how big this number is, + at most one worker per disk will + be used. The default value of 0 + means no forking; all work is done + in the main process. +sync_method rsync The sync method to use; default + is rsync but you can use ssync to + try the EXPERIMENTAL + all-swift-code-no-rsync-callouts + method. Once ssync is verified as + or better than, rsync, we plan to + deprecate rsync so we can move on + with more features for + replication. +rsync_timeout 900 Max duration of a partition rsync +rsync_bwlimit 0 Bandwidth limit for rsync in kB/s. + 0 means unlimited. +rsync_io_timeout 30 Timeout value sent to rsync + --timeout and --contimeout + options +rsync_compress no Allow rsync to compress data + which is transmitted to destination + node during sync. 
However, this + is applicable only when destination + node is in a different region + than the local one. + NOTE: Objects that are already + compressed (for example: .tar.gz, + .mp3) might slow down the syncing + process. +stats_interval 300 Interval in seconds between + logging replication statistics +handoffs_first false If set to True, partitions that + are not supposed to be on the + node will be replicated first. + The default setting should not be + changed, except for extreme + situations. +handoff_delete auto By default handoff partitions + will be removed when it has + successfully replicated to all + the canonical nodes. If set to an + integer n, it will remove the + partition if it is successfully + replicated to n nodes. The + default setting should not be + changed, except for extreme + situations. +node_timeout DEFAULT or 10 Request timeout to external + services. This uses what's set + here, or what's set in the + DEFAULT section, or 10 (though + other sections use 3 as the final + default). +http_timeout 60 Max duration of an http request. + This is for REPLICATE finalization + calls and so should be longer + than node_timeout. +lockup_timeout 1800 Attempts to kill all workers if + nothing replicates for + lockup_timeout seconds +rsync_module {replication_ip}::object Format of the rsync module where + the replicator will send data. + The configuration value can + include some variables that will + be extracted from the ring. + Variables must follow the format + {NAME} where NAME is one of: ip, + port, replication_ip, + replication_port, region, zone, + device, meta. See + etc/rsyncd.conf-sample for some + examples. +rsync_error_log_line_length 0 Limits how long rsync error log + lines are +ring_check_interval 15 Interval for checking new ring + file +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify + class and priority. + Linux supports io scheduling + priorities and classes since + 2.6.13 with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority + is a number which goes from + 0 to 7. The higher the value, + the lower the I/O priority of + the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. +=========================== ======================== ================================ + +********************** +[object-reconstructor] +********************** + +=========================== ======================== ================================ +Option Default Description +--------------------------- ------------------------ -------------------------------- +log_name object-reconstructor Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +daemonize yes Whether or not to run + reconstruction as a daemon +interval 30 Time in seconds to wait between + reconstruction passes +reconstructor_workers 0 Maximum number of worker processes + to spawn. Each worker will handle + a subset of devices. 
Devices will + be assigned evenly among the workers + so that workers cycle at similar + intervals (which can lead to fewer + workers than requested). You can not + have more workers than devices. If + you have no devices only a single + worker is spawned. +concurrency 1 Number of reconstruction threads to + spawn per reconstructor process. +stats_interval 300 Interval in seconds between + logging reconstruction statistics +handoffs_only false The handoffs_only mode option is for + special case emergency situations + during rebalance such as disk full in + the cluster. This option SHOULD NOT + BE CHANGED, except for extreme + situations. When handoffs_only mode + is enabled the reconstructor will + *only* revert fragments from handoff + nodes to primary nodes and will not + sync primary nodes with neighboring + primary nodes. This will force the + reconstructor to sync and delete + handoffs' fragments more quickly and + minimize the time of the rebalance by + limiting the number of rebuilds. The + handoffs_only option is only for + temporary use and should be disabled + as soon as the emergency situation + has been resolved. +rebuild_handoff_node_count 2 The default strategy for unmounted + drives will stage + rebuilt data on a + handoff node until + updated rings are + deployed. Because + fragments are rebuilt on + offset handoffs based on + fragment index and the + proxy limits how deep it + will search for EC frags + we restrict how many + nodes we'll try. + Setting to 0 will + disable rebuilds to + handoffs and only + rebuild fragments for + unmounted devices to + mounted primaries after + a ring change. Setting + to -1 means "no limit". +max_objects_per_revert 0 By default the reconstructor + attempts to revert all + objects from handoff + partitions in a single + batch using a single + SSYNC request. In + exceptional + circumstances + max_objects_per_revert + can be used to + temporarily limit the + number of objects + reverted by each + reconstructor revert + type job. If more than + max_objects_per_revert + are available in a + sender's handoff + partition, the remaining + objects will remain in + the handoff partition + and will not be reverted + until the next time the + reconstructor visits + that handoff partition + i.e. with this option + set, a single cycle of + the reconstructor may + not completely revert + all handoff partitions. + The option has no effect + on reconstructor sync + type jobs between + primary partitions. A + value of 0 (the default) + means there is no limit. +node_timeout DEFAULT or 10 Request timeout to external + services. The value used is the value + set in this section, or the value set + in the DEFAULT section, or 10. +http_timeout 60 Max duration of an http request. + This is for REPLICATE finalization + calls and so should be longer + than node_timeout. +lockup_timeout 1800 Attempts to kill all threads if + no fragment has been reconstructed + for lockup_timeout seconds. +ring_check_interval 15 Interval for checking new ring + file +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify + class and priority. 
+ Linux supports io scheduling + priorities and classes since + 2.6.13 with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority + is a number which goes from + 0 to 7. The higher the value, + the lower the I/O priority of + the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. +quarantine_threshold 0 The reconstructor may quarantine + stale isolated fragments + when it fails to fetch + more than the + quarantine_threshold + number of fragments + (including the stale + fragment) during an + attempt to reconstruct. +quarantine_age reclaim_age Fragments are not quarantined + until they are older than + quarantine_age, which defaults + to the value of reclaim_age. +=========================== ======================== ================================ + +**************** +[object-updater] +**************** + +=================== =================== ========================================== +Option Default Description +------------------- ------------------- ------------------------------------------ +log_name object-updater Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +interval 300 Minimum time for a pass to take +updater_workers 1 Number of worker processes +concurrency 8 Number of updates to run concurrently in + each worker process +node_timeout DEFAULT or 10 Request timeout to external services. This + uses what's set here, or what's set in the + DEFAULT section, or 10 (though other + sections use 3 as the final default). +objects_per_second 50 Maximum objects updated per second. + Should be tuned according to individual + system specs. 0 is unlimited. +slowdown 0.01 Time in seconds to wait between objects. + Deprecated in favor of objects_per_second. +report_interval 300 Interval in seconds between logging + statistics about the current update pass. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +=================== =================== ========================================== + +**************** +[object-auditor] +**************** + +=========================== =================== ========================================== +Option Default Description +--------------------------- ------------------- ------------------------------------------ +log_name object-auditor Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +log_time 3600 Frequency of status logs in seconds. 
+interval 30 Time in seconds to wait between + auditor passes +disk_chunk_size 65536 Size of chunks read during auditing +files_per_second 20 Maximum files audited per second per + auditor process. Should be tuned according + to individual system specs. 0 is unlimited. +bytes_per_second 10000000 Maximum bytes audited per second per + auditor process. Should be tuned according + to individual system specs. 0 is unlimited. +concurrency 1 The number of parallel processes to use + for checksum auditing. +zero_byte_files_per_second 50 +object_size_stats +recon_cache_path /var/cache/swift Path to recon cache +rsync_tempfile_timeout auto Time elapsed in seconds before rsync + tempfiles will be unlinked. Config value + of "auto" try to use object-replicator's + rsync_timeout + 900 or fallback to 86400 + (1 day). +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +=========================== =================== ========================================== + +**************** +[object-expirer] +**************** + +============================= =============================== ========================================== +Option Default Description +----------------------------- ------------------------------- ------------------------------------------ +log_name object-expirer Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +interval 300 Time in seconds to wait between + expirer passes +report_interval 300 Frequency of status logs in seconds. +concurrency 1 Level of concurrency to use to do the work, + this value must be set to at least 1 +dequeue_from_legacy False This service will look for jobs on the + legacy expirer task queue. +round_robin_task_cache_size 100000 Number of tasks objects to cache before processing. +processes 0 How many parts to divide the legacy work into, + one part per process that will be doing the work. + When set 0 means that a single legacy + process will be doing all the work. + This can only be used in conjunction with + ``dequeue_from_legacy``. +process 0 Which of the parts a particular legacy process will + work on. It is "zero based", if you want to use 3 + processes, you should run processes with process + set to 0, 1, and 2. + This can only be used in conjunction with + ``dequeue_from_legacy``. +reclaim_age 604800 How long an un-processable expired object + marker will be retried before it is abandoned. + It is not coupled with the tombstone reclaim age + in the consistency engine. 
+request_tries 3 The number of times the expirer's internal client + will attempt any given request in the event + of failure +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server processes. + I/O niceness class values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the I/O + priority of the process. Work only with + ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +delay_reaping_ 0.0 A dynamic configuration option for + setting account level delay_reaping values. + The delay_reaping value is configured for + the account with the name placed in + . The object expirer will reap objects in + this account from disk only after this delay + following their x-delete-at time. +delay_reaping_/ 0.0 A dynamic configuration option for + setting container level delay_reaping values. + The delay_reaping value is configured for + the container with the account name placed + in and the container name in . + The object expirer will reap objects in this + container from disk only after this delay + following their x-delete-at time. +============================= =============================== ========================================== diff --git a/doc/source/config/proxy_server_config.rst b/doc/source/config/proxy_server_config.rst new file mode 100644 index 0000000000..877cd32559 --- /dev/null +++ b/doc/source/config/proxy_server_config.rst @@ -0,0 +1,392 @@ +.. _proxy-server-config: + +-------------------------- +Proxy Server Configuration +-------------------------- + +This document describes the configuration options available for the proxy +server. Some proxy server options may be configured on a :ref:`per-policy +` basis. Additional documentation for +proxy-server middleware can be found at :doc:`../middleware` and +:doc:`../overview_auth`. + +Documentation for other swift configuration options can be found at +:doc:`index`. + +An example Proxy Server configuration can be found at +etc/proxy-server.conf-sample in the source code repository. + +The following configuration sections are available: + +* :ref:`[DEFAULT] ` +* `[proxy-server]`_ + + +.. _proxy_server_default_options: + +********* +[DEFAULT] +********* + +==================================== ======================== ======================================== +Option Default Description +------------------------------------ ------------------------ ---------------------------------------- +bind_ip 0.0.0.0 IP Address for server to + bind to +bind_port 80 Port for server to bind to +keep_idle 600 Value to set for socket TCP_KEEPIDLE +bind_timeout 30 Seconds to attempt bind before + giving up +backlog 4096 Maximum number of allowed pending + connections +swift_dir /etc/swift Swift configuration directory +workers auto Override the number of + pre-forked workers that will + accept connections. If set it + should be an integer, zero + means no fork. 
If unset, it + will try to default to the + number of effective cpu cores + and fallback to one. See + :ref:`general-service-tuning`. +max_clients 1024 Maximum number of clients one + worker can process + simultaneously (it will + actually accept(2) N + + 1). Setting this to one (1) + will only handle one request at + a time, without accepting + another request + concurrently. +user swift User to run as +cert_file Path to the ssl .crt. This + should be enabled for testing + purposes only. +key_file Path to the ssl .key. This + should be enabled for testing + purposes only. +cors_allow_origin List of origin hosts that are allowed + for CORS requests in addition to what + the container has set. +strict_cors_mode True If True (default) then CORS + requests are only allowed if their + Origin header matches an allowed + origin. Otherwise, any Origin is + allowed. +cors_expose_headers This is a list of headers that + are included in the header + Access-Control-Expose-Headers + in addition to what the container + has set. +client_timeout 60 +trans_id_suffix This optional suffix (default is empty) + that would be appended to the swift + transaction id allows one to easily + figure out from which cluster that + X-Trans-Id belongs to. This is very + useful when one is managing more than + one swift cluster. +log_name swift Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_headers False +log_address /dev/log Logging directory +log_max_line_length 0 Caps the length of log + lines to the value given; + no limit if set to 0, the + default. +log_custom_handlers None Comma separated list of functions + to call to setup custom log + handlers. +log_udp_host Override log_address +log_udp_port 514 UDP log port +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. +log_statsd_port 8125 +log_statsd_default_sample_rate 1.0 +log_statsd_sample_rate_factor 1.0 +log_statsd_metric_prefix +eventlet_debug false If true, turn on debug logging + for eventlet + +expose_info true Enables exposing configuration + settings via HTTP GET /info. +admin_key Key to use for admin calls that + are HMAC signed. Default + is empty, which will + disable admin calls to + /info. +disallowed_sections swift.valid_api_versions Allows the ability to withhold + sections from showing up in the + public calls to /info. You can + withhold subsections by separating + the dict level with a ".". +nice_priority None Scheduling priority of server + processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class values + are IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort) and + IOPRIO_CLASS_IDLE (idle). + The default does not + modify class and priority. Linux + supports io scheduling priorities + and classes since 2.6.13 with + the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. 
+==================================== ======================== ======================================== + +************** +[proxy-server] +************** + +============================================== =============== ===================================== +Option Default Description +---------------------------------------------- --------------- ------------------------------------- +use Entry point for paste.deploy for + the proxy server. For most + cases, this should be + ``egg:swift#proxy``. +set log_name proxy-server Label used when logging +set log_facility LOG_LOCAL0 Syslog log facility +set log_level INFO Log level +set log_headers True If True, log headers in each + request +set log_handoffs True If True, the proxy will log + whenever it has to failover to a + handoff node +recheck_account_existence 60 Cache timeout in seconds to + send memcached for account + existence +recheck_container_existence 60 Cache timeout in seconds to + send memcached for container + existence +account_existence_skip_cache_pct 0.0 Periodically, bypass the cache + for account info requests and + goto disk to refresh the data + in the cache. This is a percentage + of requests should randomly skip. + Values around 0.0 - 0.1 (1 in every + 1000) are recommended. +container_existence_skip_cache_pct 0.0 Periodically, bypass the cache + for container info requests and + goto disk to refresh the data + in the cache. This is a percentage + of requests should randomly skip. + Values around 0.0 - 0.1 (1 in every + 1000) are recommended. +container_updating_shard_ranges_skip_cache_pct 0.0 Periodically, bypass the cache + for shard_range update requests and + goto disk to refresh the data + in the cache. This is a percentage + of requests should randomly skip. + Values around 0.0 - 0.1 (1 in every + 1000) are recommended. +container_listing_shard_ranges_skip_cache_pct 0.0 Periodically, bypass the cache + for shard_range listing info requests + and goto disk to refresh the data + in the cache. This is a percentage + of requests should randomly skip. + Values around 0.0 - 0.1 (1 in every + 1000) are recommended. +object_chunk_size 65536 Chunk size to read from + object servers +client_chunk_size 65536 Chunk size to read from + clients +memcache_servers 127.0.0.1:11211 Comma separated list of + memcached servers + ip:port or [ipv6addr]:port, + if this value is + empty, the memcache client will look + for a :ref:`[memcache.conf] ` +memcache_max_connections 2 Max number of connections to + each memcached server per + worker +node_timeout 10 Request timeout to external + services +recoverable_node_timeout node_timeout Request timeout to external + services for requests that, on + failure, can be recovered + from. For example, object GET. +client_timeout 60 Timeout to read one chunk + from a client +conn_timeout 0.5 Connection timeout to + external services +error_suppression_interval 60 Time in seconds that must + elapse since the last error + for a node to be considered + no longer error limited +error_suppression_limit 10 Error count to consider a + node error limited +allow_account_management false Whether account PUTs and DELETEs + are even callable +account_autocreate false If set to 'true' authorized + accounts that do not yet exist + within the Swift cluster will + be automatically created. +max_containers_per_account 0 If set to a positive value, + trying to create a container + when the account already has at + least this maximum containers + will result in a 403 Forbidden. 
+ Note: This is a soft limit, + meaning a user might exceed the + cap for + recheck_account_existence before + the 403s kick in. +max_containers_whitelist This is a comma separated list + of account names that ignore + the max_containers_per_account + cap. +rate_limit_after_segment 10 Rate limit the download of + large object segments after + this segment is downloaded. +rate_limit_segments_per_sec 1 Rate limit large object + downloads at this rate. +request_node_count 2 * replicas Set to the number of nodes to + contact for a normal request. + You can use '* replicas' at the + end to have it use the number + given times the number of + replicas for the ring being used + for the request. +swift_owner_headers up to the auth system in use, + but usually indicates + administrative responsibilities. +sorting_method shuffle Storage nodes can be chosen at + random (shuffle), by using timing + measurements (timing), or by using + an explicit match (affinity). + Using timing measurements may allow + for lower overall latency, while + using affinity allows for finer + control. In both the timing and + affinity cases, equally-sorting nodes + are still randomly chosen to spread + load. This option may be overridden + in a per-policy configuration + section. +timing_expiry 300 If the "timing" sorting_method is + used, the timings will only be valid + for the number of seconds configured + by timing_expiry. +concurrent_gets off Use replica count number of + threads concurrently during a + GET/HEAD and return with the + first successful response. In + the EC case, this parameter only + affects an EC HEAD as an EC GET + behaves differently. +concurrency_timeout conn_timeout This parameter controls how long + to wait before firing off the + next concurrent_get thread. A + value of 0 would we fully concurrent, + any other number will stagger the + firing of the threads. This number + should be between 0 and node_timeout. + The default is conn_timeout (0.5). +nice_priority None Scheduling priority of server + processes. + Niceness values range from -20 (most + favorable to the process) to 19 (least + favorable to the process). The default + does not modify priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class values + are IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify class and + priority. Linux supports io scheduling + priorities and classes since 2.6.13 + with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority is + a number which goes from 0 to 7. + The higher the value, the lower the + I/O priority of the process. Work + only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE is set. +read_affinity None Specifies which backend servers to + prefer on reads; used in conjunction + with the sorting_method option being + set to 'affinity'. Format is a comma + separated list of affinity descriptors + of the form =. + The may be r for + selecting nodes in region N or + rz for selecting nodes in + region N, zone M. The + value should be a whole number + that represents the priority to + be given to the selection; lower + numbers are higher priority. + Default is empty, meaning no + preference. This option may be + overridden in a per-policy + configuration section. +write_affinity None Specifies which backend servers to + prefer on writes. 
Format is a comma + separated list of affinity + descriptors of the form r for + region N or rz for region N, + zone M. Default is empty, meaning no + preference. This option may be + overridden in a per-policy + configuration section. +write_affinity_node_count 2 * replicas The number of local (as governed by + the write_affinity setting) nodes to + attempt to contact first on writes, + before any non-local ones. The value + should be an integer number, or use + '* replicas' at the end to have it + use the number given times the number + of replicas for the ring being used + for the request. This option may be + overridden in a per-policy + configuration section. +write_affinity_handoff_delete_count auto The number of local (as governed by + the write_affinity setting) handoff + nodes to attempt to contact on + deletion, in addition to primary + nodes. Example: in geographically + distributed deployment, If replicas=3, + sometimes there may be 1 primary node + and 2 local handoff nodes in one region + holding the object after uploading but + before object replicated to the + appropriate locations in other regions. + In this case, include these handoff + nodes to send request when deleting + object could help make correct decision + for the response. The default value 'auto' + means Swift will calculate the number + automatically, the default value is + (replicas - len(local_primary_nodes)). + This option may be overridden in a + per-policy configuration section. +allow_open_expired false If true (default is false), an object that + has expired but not yet been reaped can be + can be accessed by setting the + 'x-open-expired' header to true in + GET, HEAD, and POST requests. +============================================== =============== ===================================== diff --git a/doc/source/config/swift_common_config.rst b/doc/source/config/swift_common_config.rst new file mode 100644 index 0000000000..8bb6eabe91 --- /dev/null +++ b/doc/source/config/swift_common_config.rst @@ -0,0 +1,35 @@ +.. _swift-common-config: + +-------------------- +Common configuration +-------------------- + +This document describes the configuration options common to all swift servers. +Documentation for other swift configuration options can be found at +:doc:`index`. + +An example of common configuration file can be found at etc/swift.conf-sample + +The following configuration options are available: + +========================== ========== ============================================= +Option Default Description +-------------------------- ---------- --------------------------------------------- +max_header_size 8192 max_header_size is the max number of bytes in + the utf8 encoding of each header. Using 8192 + as default because eventlet use 8192 as max + size of header line. This value may need to + be increased when using identity v3 API + tokens including more than 7 catalog entries. + See also include_service_catalog in + proxy-server.conf-sample (documented in + overview_auth.rst). +extra_header_count 0 By default the maximum number of allowed + headers depends on the number of max + allowed metadata settings plus a default + value of 32 for regular http headers. + If for some reason this is not enough (custom + middleware for example) it can be increased + with the extra_header_count constraint. 
+========================== ========== ============================================= + diff --git a/doc/source/container.rst b/doc/source/container.rst index d80adcaa32..bc95753852 100644 --- a/doc/source/container.rst +++ b/doc/source/container.rst @@ -4,6 +4,36 @@ Container ********* +.. _container-auditor: + +Container Auditor +================= + +.. automodule:: swift.container.auditor + :members: + :undoc-members: + :show-inheritance: + +.. _container-backend: + +Container Backend +================= + +.. automodule:: swift.container.backend + :members: + :undoc-members: + :show-inheritance: + +.. _container-replicator: + +Container Replicator +==================== + +.. automodule:: swift.container.replicator + :members: + :undoc-members: + :show-inheritance: + .. _container-server: Container Server @@ -14,26 +44,28 @@ Container Server :undoc-members: :show-inheritance: -.. _container-updater: +.. _container-reconciler: -Container Updater -================= +Container Reconciler +==================== -.. automodule:: swift.container.updater +.. automodule:: swift.container.reconciler :members: :undoc-members: :show-inheritance: -.. _container-auditor: +.. _container-sharder: -Container Auditor +Container Sharder ================= -.. automodule:: swift.container.auditor +.. automodule:: swift.container.sharder :members: :undoc-members: :show-inheritance: +.. _container-sync-daemon: + Container Sync ============== @@ -41,3 +73,13 @@ Container Sync :members: :undoc-members: :show-inheritance: + +.. _container-updater: + +Container Updater +================= + +.. automodule:: swift.container.updater + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/contributor/contributing.rst b/doc/source/contributor/contributing.rst new file mode 100644 index 0000000000..52561addb0 --- /dev/null +++ b/doc/source/contributor/contributing.rst @@ -0,0 +1,75 @@ +.. include:: ../../../CONTRIBUTING.rst + +Community +========= + +Communication +------------- +IRC + People working on the Swift project may be found in the + ``#openstack-swift`` channel on OFTC during working hours + in their timezone. The channel is logged, so if you ask + a question when no one is around, you can + `check the log `__ + to see if it's been answered. + +weekly meeting + This is a Swift team meeting. The discussion in this meeting is about + all things related to the Swift project: + +- time: http://eavesdrop.openstack.org/#Swift_Team_Meeting +- agenda: https://wiki.openstack.org/wiki/Meetings/Swift + +mailing list + We use the openstack-discuss@lists.openstack.org mailing list for + asynchronous discussions or to communicate with other OpenStack teams. + Use the prefix ``[swift]`` in your subject line (it's a high-volume + list, so most people use email filters). + + You can use our `mailing list info page `__ to read the archives and subscribe. + +Contacting the Core Team +------------------------ + +The swift-core team is an active group of contributors who are responsible +for directing and maintaining the Swift project. As a new contributor, your +interaction with this group will be mostly through code reviews, because +only members of swift-core can approve a code change to be merged into the +code repository. But the swift-core team also spend time on IRC so feel +free to drop in to ask questions or just to meet us. + +.. note:: + Although your contribution will require reviews by members of + swift-core, these aren't the only people whose reviews matter. 
+ Anyone with a gerrit account can post reviews, so you can ask + other developers you know to review your code ... and you can + review theirs. (A good way to learn your way around the codebase + is to review other people's patches.) + + If you're thinking, "I'm new at this, how can I possibly provide + a helpful review?", take a look at `How to Review Changes the + OpenStack Way + `_. + + Or, for guidance more specific to Swift, read :doc:`review_guidelines`. + +You can read about the role of core reviewers in the +`OpenStack governance documentation +`_, learn +more about the swift-core members in our gerrit +`membership list `__, and +also find the team at our `Swift weekly meetings <#communication>`__. + +Getting Your Patch Merged +------------------------- +Understanding how reviewers review and what they look for will help get +your code merged. See `Swift Review Guidelines `_ +for how we review code. + +Keep in mind that reviewers are also human; if something feels stalled, then +come and poke us on IRC or add it to our meeting agenda. + +Project Team Lead Duties +------------------------ +All common PTL duties are enumerated in the `PTL guide +`_. diff --git a/doc/source/contributor/review_guidelines.rst b/doc/source/contributor/review_guidelines.rst new file mode 100644 index 0000000000..314e764f49 --- /dev/null +++ b/doc/source/contributor/review_guidelines.rst @@ -0,0 +1 @@ +.. include:: ../../../REVIEW_GUIDELINES.rst diff --git a/doc/source/cors.rst b/doc/source/cors.rst new file mode 100644 index 0000000000..91e1611b5d --- /dev/null +++ b/doc/source/cors.rst @@ -0,0 +1,106 @@ +==== +CORS +==== + +CORS_ is a mechanism to allow code running in a browser (Javascript for +example) to make requests to a domain other than the one it originated from. + +Swift supports CORS requests to containers and objects. + +CORS metadata is held on the container only. The values given apply to the +container itself and all objects within it. + +The supported headers are: + ++------------------------------------------------+------------------------------+ +| Metadata | Use | ++================================================+==============================+ +| X-Container-Meta-Access-Control-Allow-Origin | Origins to be allowed to | +| | make Cross Origin Requests, | +| | space separated. | ++------------------------------------------------+------------------------------+ +| X-Container-Meta-Access-Control-Max-Age | Max age for the Origin to | +| | hold the preflight results. | ++------------------------------------------------+------------------------------+ +| X-Container-Meta-Access-Control-Expose-Headers | Headers exposed to the user | +| | agent (e.g. browser) in the | +| | actual request response. | +| | Space separated. | ++------------------------------------------------+------------------------------+ + +In addition to the values set in container metadata, some cluster-wide values +may also be configured using the ``strict_cors_mode``, ``cors_allow_origin`` +and ``cors_expose_headers`` options in ``proxy-server.conf``. See +``proxy-server.conf-sample`` for more information. + +Before a browser issues an actual request it may issue a `preflight request`_. +The preflight request is an OPTIONS call to verify the Origin is allowed to +make the request. The sequence of events is: + +* Browser makes OPTIONS request to Swift +* Swift returns 200/401 to browser based on allowed origins +* If 200, browser makes the "actual request" to Swift, i.e. 
PUT, POST, DELETE, + HEAD, GET + +When a browser receives a response to an actual request it only exposes those +headers listed in the ``Access-Control-Expose-Headers`` header. By default Swift +returns the following values for this header: + +* "simple response headers" as listed on + http://www.w3.org/TR/cors/#simple-response-header +* the headers ``etag``, ``x-timestamp``, ``x-trans-id``, + ``x-openstack-request-id`` +* all metadata headers (``X-Container-Meta-*`` for containers and + ``X-Object-Meta-*`` for objects) +* headers listed in ``X-Container-Meta-Access-Control-Expose-Headers`` +* headers configured using the ``cors_expose_headers`` option in + ``proxy-server.conf`` + +.. note:: + An OPTIONS request to a symlink object will respond with the options for + the symlink only; the request will not be redirected to the target object. + Therefore, if the symlink's target object is in another container with + CORS settings, the response will not reflect the settings. + + +----------------- +Sample Javascript +----------------- + +To see some CORS Javascript in action, download the `test CORS page`_ (source +below). Host it on a webserver and take note of the protocol and hostname +(origin) you'll be using to request the page, e.g. http://localhost. + +Locate a container you'd like to query. Needless to say, the Swift cluster +hosting this container should have CORS support. Append the origin of the +test page to the container's ``X-Container-Meta-Access-Control-Allow-Origin`` +header:: + + curl -X POST -H 'X-Auth-Token: xxx' \ + -H 'X-Container-Meta-Access-Control-Allow-Origin: http://localhost' \ + http://192.168.56.3:8080/v1/AUTH_test/cont1 + +At this point the container is now accessible to CORS clients hosted on +http://localhost. Open the test CORS page in your browser. + +#. Populate the Token field +#. Populate the URL field with the URL of either a container or object +#. Select the request method +#. Hit Submit + +Assuming the request succeeds you should see the response header and body. If +something went wrong the response status will be 0. + +.. _test CORS page: + +-------------- +Test CORS Page +-------------- + +A sample cross-site test page is located in the project source tree +``doc/source/test-cors.html``. + +.. literalinclude:: test-cors.html + +.. _CORS: https://developer.mozilla.org/en-US/docs/HTTP/Access_control_CORS +.. _preflight request: https://developer.mozilla.org/en-US/docs/HTTP/Access_control_CORS#Preflighted_requests diff --git a/doc/source/crossdomain.rst b/doc/source/crossdomain.rst new file mode 100644 index 0000000000..31915d3689 --- /dev/null +++ b/doc/source/crossdomain.rst @@ -0,0 +1,69 @@ +======================== +Cross-domain Policy File +======================== + +A cross-domain policy file allows web pages hosted elsewhere to use client +side technologies such as Flash, Java and Silverlight to interact +with the Swift API. + +See https://www.adobe.com/devnet-docs/acrobatetk/tools/AppSec/xdomain.html for +a description of the purpose and structure of the cross-domain policy +file. The cross-domain policy file is installed in the root of a web +server (i.e., the path is ``/crossdomain.xml``). + +The crossdomain middleware responds to a path of ``/crossdomain.xml`` with an +XML document such as: + +.. code:: xml + + <?xml version="1.0"?> + <!DOCTYPE cross-domain-policy SYSTEM "http://www.adobe.com/xml/dtds/cross-domain-policy.dtd" > + <cross-domain-policy> + <allow-access-from domain="*" secure="false" /> + </cross-domain-policy> + +You should use a policy appropriate to your site. The examples and the +default policy are provided to indicate how to syntactically construct +a cross domain policy file -- they are not recommendations. 
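Once the crossdomain middleware is enabled (see the Configuration section below), serving of the policy file can be sanity-checked with any HTTP client. No auth token should be needed, assuming the middleware is placed before the authentication middleware as recommended below; the proxy address used here is only an example::

    curl -i http://127.0.0.1:8080/crossdomain.xml

A 200 response containing the configured policy document indicates the middleware is answering at the expected path.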
+ +------------- +Configuration +------------- + +To enable this middleware, add it to the pipeline in your proxy-server.conf +file. It should be added before any authentication (e.g., tempauth or +keystone) middleware. In this example ellipsis (...) indicate other +middleware you may have chosen to use: + +.. code:: cfg + + [pipeline:main] + pipeline = ... crossdomain ... authtoken ... proxy-server + +And add a filter section, such as: + +.. code:: cfg + + [filter:crossdomain] + use = egg:swift#crossdomain + cross_domain_policy = + + +For continuation lines, put some whitespace before the continuation +text. Ensure you put a completely blank line to terminate the +``cross_domain_policy`` value. + +The ``cross_domain_policy`` name/value is optional. If omitted, the policy +defaults as if you had specified: + +.. code:: cfg + + cross_domain_policy = + +.. note:: + + The default policy is very permissive; this is appropriate + for most public cloud deployments, but may not be appropriate + for all deployments. See also: + `CWE-942 `__ + diff --git a/doc/source/deployment_guide.rst b/doc/source/deployment_guide.rst index badcd22ba2..2a8d52bed4 100644 --- a/doc/source/deployment_guide.rst +++ b/doc/source/deployment_guide.rst @@ -1,24 +1,25 @@ -================ + Deployment Guide ================ +This document provides general guidance for deploying and configuring Swift. +Detailed descriptions of configuration options can be found in the +:doc:`configuration documentation `. + ----------------------- Hardware Considerations ----------------------- -Swift is designed to run on commodity hardware. At Rackspace, our storage -servers are currently running fairly generic 4U servers with 24 2T SATA -drives and 8 cores of processing power. RAID on the storage drives is not -required and not recommended. Swift's disk usage pattern is the worst -case possible for RAID, and performance degrades very quickly using RAID 5 -or 6. +Swift is designed to run on commodity hardware. RAID on the storage drives is +not required and not recommended. Swift's disk usage pattern is the worst case +possible for RAID, and performance degrades very quickly using RAID 5 or 6. ------------------ Deployment Options ------------------ -The swift services run completely autonomously, which provides for a lot of -flexibility when architecting the hardware deployment for swift. The 4 main +The Swift services run completely autonomously, which provides for a lot of +flexibility when architecting the hardware deployment for Swift. The 4 main services are: #. Proxy Services @@ -36,21 +37,35 @@ and network I/O intensive. The easiest deployment is to install all services on each server. There is nothing wrong with doing this, as it scales each service out horizontally. -At Rackspace, we put the Proxy Services on their own servers and all of the -Storage Services on the same server. This allows us to send 10g networking to -the proxy and 1g to the storage servers, and keep load balancing to the -proxies more manageable. Storage Services scale out horizontally as storage -servers are added, and we can scale overall API throughput by adding more -Proxies. +Alternatively, one set of servers may be dedicated to the Proxy Services and a +different set of servers dedicated to the Storage Services. This allows faster +networking to be configured to the proxy than the storage servers, and keeps +load balancing to the proxies more manageable. 
Storage Services scale out +horizontally as storage servers are added, and the overall API throughput can +be scaled by adding more proxies. If you need more throughput to either Account or Container Services, they may each be deployed to their own servers. For example you might use faster (but more expensive) SAS or even SSD drives to get faster disk I/O to the databases. +A high-availability (HA) deployment of Swift requires that multiple proxy +servers are deployed and requests are load-balanced between them. Each proxy +server instance is stateless and able to respond to requests for the entire +cluster. + Load balancing and network design is left as an exercise to the reader, but this is a very important part of the cluster, so time should be spent designing the network for a Swift cluster. + +--------------------- +Web Front End Options +--------------------- + +Swift comes with an integral web front end. However, it can also be deployed +as a request processor for Apache2 using mod_wsgi, as described in +:doc:`Apache Deployment Guide <apache_deployment_guide>`. + .. _ring-preparing: ------------------ @@ -87,8 +102,12 @@ into consideration can include physical location, power availability, and network connectivity. For example, in a small cluster you might decide to split the zones up by cabinet, with each cabinet having its own power and network connectivity. The zone concept is very abstract, so feel free to use -it in whatever way best isolates your data from failure. Zones are referenced -by number, beginning with 1. +it in whatever way best isolates your data from failure. Each zone exists +in a region. + +A region is also an abstract concept that may be used to distinguish between +geographically separated areas, as well as different areas within the same +datacenter. Regions and zones are referenced by a positive integer. You can now start building the ring with:: @@ -100,17 +119,18 @@ specific partition can be moved in succession (24 is a good value for this). Devices can be added to the ring with:: - swift-ring-builder <builder_file> add z<zone>-<ip>:<port>/<device_name>_<meta> <weight> + swift-ring-builder <builder_file> add r<region>z<zone>-<ip>:<port>/<device_name>_<meta> <weight> This will add a device to the ring where <builder_file> is the name of the -builder file that was created previously, <zone> is the number of the zone -this device is in, <ip> is the ip address of the server the device is in, -<port> is the port number that the server is running on, <device_name> is -the name of the device on the server (for example: sdb1), <meta> is a string -of metadata for the device (optional), and <weight> is a float weight that -determines how many partitions are put on the device relative to the rest of -the devices in the cluster (a good starting point is 100.0 x TB on the drive). -Add each device that will be initially in the cluster. +builder file that was created previously, <region> is the number of the region +the zone is in, <zone> is the number of the zone this device is in, <ip> is +the ip address of the server the device is in, <port> is the port number that +the server is running on, <device_name> is the name of the device on the server +(for example: sdb1), <meta> is a string of metadata for the device (optional), +and <weight> is a float weight that determines how many partitions are put on +the device relative to the rest of the devices in the cluster (a good starting +point is 100.0 x TB on the drive). Add each device that will be initially in the +cluster. Once all of the devices are added to the ring, run:: @@ -130,15 +150,164 @@ swift-ring-builder with no options will display help text with available commands and options. More information on how the ring works internally can be found in the :doc:`Ring Overview <overview_ring>`. .. 
_server-per-port-configuration: + +------------------------------- +Running object-servers Per Disk +------------------------------- + +The lack of true asynchronous file I/O on Linux leaves the object-server +workers vulnerable to misbehaving disks. Because any object-server worker can +service a request for any disk, and a slow I/O request blocks the eventlet hub, +a single slow disk can impair an entire storage node. This also prevents +object servers from fully utilizing all their disks during heavy load. + +Another way to get full I/O isolation is to give each disk on a storage node a +different port in the storage policy rings. Then set the +:ref:`servers_per_port ` +option in the object-server config. NOTE: while the purpose of this config +setting is to run one or more object-server worker processes per *disk*, the +implementation just runs object-servers per unique port of local devices in the +rings. The deployer must combine this option with appropriately-configured +rings to benefit from this feature. + +Here's an example (abbreviated) old-style ring (2 node cluster with 2 disks +each):: + + Devices: id region zone ip address port replication ip replication port name + 0 1 1 1.1.0.1 6200 1.1.0.1 6200 d1 + 1 1 1 1.1.0.1 6200 1.1.0.1 6200 d2 + 2 1 2 1.1.0.2 6200 1.1.0.2 6200 d3 + 3 1 2 1.1.0.2 6200 1.1.0.2 6200 d4 + +And here's the same ring set up for ``servers_per_port``:: + + Devices: id region zone ip address port replication ip replication port name + 0 1 1 1.1.0.1 6200 1.1.0.1 6200 d1 + 1 1 1 1.1.0.1 6201 1.1.0.1 6201 d2 + 2 1 2 1.1.0.2 6200 1.1.0.2 6200 d3 + 3 1 2 1.1.0.2 6201 1.1.0.2 6201 d4 + +When migrating from normal to ``servers_per_port``, perform these steps in order: + +#. Upgrade Swift code to a version capable of doing ``servers_per_port``. + +#. Enable ``servers_per_port`` with a value greater than zero. + +#. Restart ``swift-object-server`` processes with a SIGHUP. At this point, you + will have the ``servers_per_port`` number of ``swift-object-server`` processes + serving all requests for all disks on each node. This preserves + availability, but you should perform the next step as quickly as possible. + +#. Push out new rings that actually have different ports per disk on each + server. One of the ports in the new ring should be the same as the port + used in the old ring ("6200" in the example above). This will cover + existing proxy-server processes who haven't loaded the new ring yet. They + can still talk to any storage node regardless of whether or not that + storage node has loaded the ring and started object-server processes on the + new ports. + +If you do not run a separate object-server for replication, then this setting +must be available to the object-replicator and object-reconstructor (i.e. +appear in the [DEFAULT] config section). + +.. _general-service-configuration: + +----------------------------- +General Service Configuration +----------------------------- + +Most Swift services fall into two categories. Swift's wsgi servers and +background daemons. + +For more information specific to the configuration of Swift's wsgi servers +with paste deploy see :ref:`general-server-configuration`. + +Configuration for servers and daemons can be expressed together in the same +file for each type of server, or separately. If a required section for the +service trying to start is missing there will be an error. The sections not +used by the service are ignored. + +Consider the example of an object storage node. 
By convention, configuration +for the object-server, object-updater, object-replicator, object-auditor, and +object-reconstructor exist in a single file ``/etc/swift/object-server.conf``:: + + [DEFAULT] + reclaim_age = 604800 + + [pipeline:main] + pipeline = object-server + + [app:object-server] + use = egg:swift#object + + [object-replicator] + + [object-updater] + + [object-auditor] + +Swift services expect a configuration path as the first argument:: + + $ swift-object-auditor + Usage: swift-object-auditor CONFIG [options] + + Error: missing config path argument + +If you omit the object-auditor section this file could not be used as the +configuration path when starting the ``swift-object-auditor`` daemon:: + + $ swift-object-auditor /etc/swift/object-server.conf + Unable to find object-auditor config section in /etc/swift/object-server.conf + +If the configuration path is a directory instead of a file all of the files in +the directory with the file extension ".conf" will be combined to generate the +configuration object which is delivered to the Swift service. This is +referred to generally as "directory based configuration". + +Directory based configuration leverages ConfigParser's native multi-file +support. Files ending in ".conf" in the given directory are parsed in +lexicographical order. Filenames starting with '.' are ignored. A mixture of +file and directory configuration paths is not supported - if the configuration +path is a file only that file will be parsed. + +The Swift service management tool ``swift-init`` has adopted the convention of +looking for ``/etc/swift/{type}-server.conf.d/`` if the file +``/etc/swift/{type}-server.conf`` file does not exist. + +When using directory based configuration, if the same option under the same +section appears more than once in different files, the last value parsed is +said to override previous occurrences. You can ensure proper override +precedence by prefixing the files in the configuration directory with +numerical values.:: + + /etc/swift/ + default.base + object-server.conf.d/ + 000_default.conf -> ../default.base + 001_default-override.conf + 010_server.conf + 020_replicator.conf + 030_updater.conf + 040_auditor.conf + +You can inspect the resulting combined configuration object using the +``swift-config`` command line tool + +.. _general-server-configuration: + ---------------------------- General Server Configuration ---------------------------- -Swift uses paste.deploy (http://pythonpaste.org/deploy/) to manage server -configurations. Default configuration options are set in the `[DEFAULT]` -section, and any options specified there can be overridden in any of the other -sections BUT ONLY BY USING THE SYNTAX ``set option_name = value``. This is the -unfortunate way paste.deploy works and I'll try to explain it in full. +Swift uses paste.deploy (https://pypi.org/project/Paste/) to manage server +configurations. Detailed descriptions of configuration options can be found in +the :doc:`configuration documentation `. + +Default configuration options are set in the ``[DEFAULT]`` section, and any +options specified there can be overridden in any of the other sections BUT +ONLY BY USING THE SYNTAX ``set option_name = value``. This is the unfortunate +way paste.deploy works and I'll try to explain it in full. 
First, here's an example paste.deploy configuration file:: @@ -169,30 +338,30 @@ The resulting configuration that myapp receives is:: 'set name4': 'globalvalue'} local {'name6': 'localvalue'} -So, `name1` got the global value which is fine since it's only in the `DEFAULT` +So, ``name1`` got the global value which is fine since it's only in the ``DEFAULT`` section anyway. -`name2` got the global value from `DEFAULT` even though it appears to be -overridden in the `app:myapp` subsection. This is just the unfortunate way +``name2`` got the global value from ``DEFAULT`` even though it appears to be +overridden in the ``app:myapp`` subsection. This is just the unfortunate way paste.deploy works (at least at the time of this writing.) -`name3` got the local value from the `app:myapp` subsection because it is using +``name3`` got the local value from the ``app:myapp`` subsection because it is using the special paste.deploy syntax of ``set option_name = value``. So, if you want -a default value for most app/filters but want to overridde it in one +a default value for most app/filters but want to override it in one subsection, this is how you do it. -`name4` got the global value from `DEFAULT` since it's only in that section -anyway. But, since we used the ``set`` syntax in the `DEFAULT` section even +``name4`` got the global value from ``DEFAULT`` since it's only in that section +anyway. But, since we used the ``set`` syntax in the ``DEFAULT`` section even though we shouldn't, notice we also got a ``set name4`` variable. Weird, but probably not harmful. -`name5` got the local value from the `app:myapp` subsection since it's only +``name5`` got the local value from the ``app:myapp`` subsection since it's only there anyway, but notice that it is in the global configuration and not the local configuration. This is because we used the ``set`` syntax to set the value. Again, weird, but not harmful since Swift just treats the two sets of configuration values as one set anyway. -`name6` got the local value from `app:myapp` subsection since it's only there, +``name6`` got the local value from ``app:myapp`` subsection since it's only there, and since we didn't use the ``set`` syntax, it's only in the local configuration and not the global one. Though, as indicated above, there is no special distinction with Swift. @@ -209,522 +378,148 @@ The main rule to remember when working with Swift configuration files is: configuration files. ---------------------------- -Object Server Configuration ---------------------------- - -An Example Object Server configuration can be found at -etc/object-server.conf-sample in the source code repository. - -The following configuration options are available: - -[DEFAULT] - -=================== ========== ============================================= -Option Default Description -------------------- ---------- --------------------------------------------- -swift_dir /etc/swift Swift configuration directory -devices /srv/node Parent directory of where devices are mounted -mount_check true Whether or not check if the devices are - mounted to prevent accidentally writing - to the root device -bind_ip 0.0.0.0 IP Address for server to bind to -bind_port 6000 Port for server to bind to -bind_timeout 30 Seconds to attempt bind before giving up -workers 1 Number of workers to fork -disable_fallocate false Disable "fast fail" fallocate checks if the - underlying filesystem does not support it. 
-log_custom_handlers None Comma-separated list of functions to call - to setup custom log handlers. -eventlet_debug false If true, turn on debug logging for eventlet -=================== ========== ============================================= - -[object-server] - -================== ============= =========================================== -Option Default Description ------------------- ------------- ------------------------------------------- -use paste.deploy entry point for the object - server. For most cases, this should be - `egg:swift#object`. -set log_name object-server Label used when logging -set log_facility LOG_LOCAL0 Syslog log facility -set log_level INFO Logging level -set log_requests True Whether or not to log each request -user swift User to run as -node_timeout 3 Request timeout to external services -conn_timeout 0.5 Connection timeout to external services -network_chunk_size 65536 Size of chunks to read/write over the - network -disk_chunk_size 65536 Size of chunks to read/write to disk -max_upload_time 86400 Maximum time allowed to upload an object -slow 0 If > 0, Minimum time in seconds for a PUT - or DELETE request to complete -mb_per_sync 512 On PUT requests, sync file every n MB -keep_cache_size 5242880 Largest object size to keep in buffer cache -keep_cache_private false Allow non-public objects to stay in - kernel's buffer cache -================== ============= =========================================== - -[object-replicator] - -================== ================= ======================================= -Option Default Description ------------------- ----------------- --------------------------------------- -log_name object-replicator Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -daemonize yes Whether or not to run replication as a - daemon -run_pause 30 Time in seconds to wait between - replication passes -concurrency 1 Number of replication workers to spawn -timeout 5 Timeout value sent to rsync --timeout - and --contimeout options -stats_interval 3600 Interval in seconds between logging - replication statistics -reclaim_age 604800 Time elapsed in seconds before an - object can be reclaimed -================== ================= ======================================= - -[object-updater] - -================== ============== ========================================== -Option Default Description ------------------- -------------- ------------------------------------------ -log_name object-updater Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -interval 300 Minimum time for a pass to take -concurrency 1 Number of updater workers to spawn -node_timeout 10 Request timeout to external services -conn_timeout 0.5 Connection timeout to external services -slowdown 0.01 Time in seconds to wait between objects -================== ============== ========================================== - -[object-auditor] - -================== ============== ========================================== -Option Default Description ------------------- -------------- ------------------------------------------ -log_name object-auditor Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -log_time 3600 Frequency of status logs in seconds. -files_per_second 20 Maximum files audited per second. Should - be tuned according to individual system - specs. 0 is unlimited. -bytes_per_second 10000000 Maximum bytes audited per second. 
Should - be tuned according to individual system - specs. 0 is unlimited. -================== ============== ========================================== - ------------------------------- -Container Server Configuration ------------------------------- - -An example Container Server configuration can be found at -etc/container-server.conf-sample in the source code repository. - -The following configuration options are available: - -[DEFAULT] - -=================== ========== ============================================ -Option Default Description -------------------- ---------- -------------------------------------------- -swift_dir /etc/swift Swift configuration directory -devices /srv/node Parent directory of where devices are mounted -mount_check true Whether or not check if the devices are - mounted to prevent accidentally writing - to the root device -bind_ip 0.0.0.0 IP Address for server to bind to -bind_port 6001 Port for server to bind to -bind_timeout 30 Seconds to attempt bind before giving up -workers 1 Number of workers to fork -user swift User to run as -disable_fallocate false Disable "fast fail" fallocate checks if the - underlying filesystem does not support it. -log_custom_handlers None Comma-separated list of functions to call - to setup custom log handlers. -eventlet_debug false If true, turn on debug logging for eventlet -=================== ========== ============================================ - -[container-server] - -================== ================ ======================================== -Option Default Description ------------------- ---------------- ---------------------------------------- -use paste.deploy entry point for the - container server. For most cases, this - should be `egg:swift#container`. -set log_name container-server Label used when logging -set log_facility LOG_LOCAL0 Syslog log facility -set log_level INFO Logging level -node_timeout 3 Request timeout to external services -conn_timeout 0.5 Connection timeout to external services -allow_versions false Enable/Disable object versioning feature -================== ================ ======================================== - -[container-replicator] - -================== ==================== ==================================== -Option Default Description ------------------- -------------------- ------------------------------------ -log_name container-replicator Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -per_diff 1000 -concurrency 8 Number of replication workers to - spawn -run_pause 30 Time in seconds to wait between - replication passes -node_timeout 10 Request timeout to external services -conn_timeout 0.5 Connection timeout to external - services -reclaim_age 604800 Time elapsed in seconds before a - container can be reclaimed -================== ==================== ==================================== - -[container-updater] - -======================== ================= ================================== -Option Default Description ------------------------- ----------------- ---------------------------------- -log_name container-updater Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -interval 300 Minimum time for a pass to take -concurrency 4 Number of updater workers to spawn -node_timeout 3 Request timeout to external - services -conn_timeout 0.5 Connection timeout to external - services -slowdown 0.01 Time in seconds to wait between - containers -account_suppression_time 60 Seconds to 
suppress updating an - account that has generated an - error (timeout, not yet found, - etc.) -======================== ================= ================================== - -[container-auditor] - -================== ================= ======================================= -Option Default Description ------------------- ----------------- --------------------------------------- -log_name container-auditor Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -interval 1800 Minimum time for a pass to take -================== ================= ======================================= +.. _proxy_server_per_policy_config: ----------------------------- -Account Server Configuration ----------------------------- +************************ +Per policy configuration +************************ + +Some proxy-server configuration options may be overridden for individual +:doc:`overview_policies` by including per-policy config section(s). These +options are: + +- ``sorting_method`` +- ``read_affinity`` +- ``write_affinity`` +- ``write_affinity_node_count`` +- ``write_affinity_handoff_delete_count`` + +The per-policy config section name must be of the form:: + + [proxy-server:policy:] + +.. note:: + + The per-policy config section name should refer to the policy index, not + the policy name. + +.. note:: + + The first part of proxy-server config section name must match the name of + the proxy-server config section. This is typically ``proxy-server`` as + shown above, but if different then the names of any per-policy config + sections must be changed accordingly. + +The value of an option specified in a per-policy section will override any +value given in the proxy-server section for that policy only. Otherwise the +value of these options will be that specified in the proxy-server section. + +For example, the following section provides policy-specific options for a +policy with index ``3``:: + + [proxy-server:policy:3] + sorting_method = affinity + read_affinity = r2=1 + write_affinity = r2 + write_affinity_node_count = 1 * replicas + write_affinity_handoff_delete_count = 2 + +.. note:: + + It is recommended that per-policy config options are *not* included in the + ``[DEFAULT]`` section. If they are then the following behavior applies. + + Per-policy config sections will inherit options in the ``[DEFAULT]`` + section of the config file, and any such inheritance will take precedence + over inheriting options from the proxy-server config section. + + Per-policy config section options will override options in the + ``[DEFAULT]`` section. Unlike the behavior described under `General Server + Configuration`_ for paste-deploy ``filter`` and ``app`` sections, the + ``set`` keyword is not required for options to override in per-policy + config sections. 
+ + For example, given the following settings in a config file:: + + [DEFAULT] + sorting_method = affinity + read_affinity = r0=100 + write_affinity = r0 + + [app:proxy-server] + use = egg:swift#proxy + # use of set keyword here overrides [DEFAULT] option + set read_affinity = r1=100 + # without set keyword, [DEFAULT] option overrides in a paste-deploy section + write_affinity = r1 + + [proxy-server:policy:0] + sorting_method = affinity + # set keyword not required here to override [DEFAULT] option + write_affinity = r1 + + would result in policy with index ``0`` having settings: + + * ``read_affinity = r0=100`` (inherited from the ``[DEFAULT]`` section) + * ``write_affinity = r1`` (specified in the policy 0 section) + + and any other policy would have the default settings of: + + * ``read_affinity = r1=100`` (set in the proxy-server section) + * ``write_affinity = r0`` (inherited from the ``[DEFAULT]`` section) + +***************** +Proxy Middlewares +***************** + +Many features in Swift are implemented as middleware in the proxy-server +pipeline. See :doc:`middleware` and the ``proxy-server.conf-sample`` file for +more information. In particular, the use of some type of :doc:`authentication +and authorization middleware ` is highly recommended. -An example Account Server configuration can be found at -etc/account-server.conf-sample in the source code repository. - -The following configuration options are available: - -[DEFAULT] - -=================== ========== ============================================= -Option Default Description -------------------- ---------- --------------------------------------------- -swift_dir /etc/swift Swift configuration directory -devices /srv/node Parent directory or where devices are mounted -mount_check true Whether or not check if the devices are - mounted to prevent accidentally writing - to the root device -bind_ip 0.0.0.0 IP Address for server to bind to -bind_port 6002 Port for server to bind to -bind_timeout 30 Seconds to attempt bind before giving up -workers 1 Number of workers to fork -user swift User to run as -db_preallocation off If you don't mind the extra disk space usage in - overhead, you can turn this on to preallocate - disk space with SQLite databases to decrease - fragmentation. -disable_fallocate false Disable "fast fail" fallocate checks if the - underlying filesystem does not support it. -log_custom_handlers None Comma-separated list of functions to call - to setup custom log handlers. -eventlet_debug false If true, turn on debug logging for eventlet -=================== ========== ============================================= - -[account-server] - -================== ============== ========================================== -Option Default Description ------------------- -------------- ------------------------------------------ -use Entry point for paste.deploy for the account - server. For most cases, this should be - `egg:swift#account`. 
-set log_name account-server Label used when logging -set log_facility LOG_LOCAL0 Syslog log facility -set log_level INFO Logging level -================== ============== ========================================== - -[account-replicator] - -================== ================== ====================================== -Option Default Description ------------------- ------------------ -------------------------------------- -log_name account-replicator Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -per_diff 1000 -concurrency 8 Number of replication workers to spawn -run_pause 30 Time in seconds to wait between - replication passes -node_timeout 10 Request timeout to external services -conn_timeout 0.5 Connection timeout to external services -reclaim_age 604800 Time elapsed in seconds before an - account can be reclaimed -================== ================== ====================================== - -[account-auditor] - -==================== =============== ======================================= -Option Default Description --------------------- --------------- --------------------------------------- -log_name account-auditor Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -interval 1800 Minimum time for a pass to take -==================== =============== ======================================= - -[account-reaper] - -================== =============== ========================================= -Option Default Description ------------------- --------------- ----------------------------------------- -log_name account-auditor Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -concurrency 25 Number of replication workers to spawn -interval 3600 Minimum time for a pass to take -node_timeout 10 Request timeout to external services -conn_timeout 0.5 Connection timeout to external services -delay_reaping 0 Normally, the reaper begins deleting - account information for deleted accounts - immediately; you can set this to delay - its work however. The value is in seconds, - 2592000 = 30 days, for example. -================== =============== ========================================= - --------------------------- -Proxy Server Configuration --------------------------- - -An example Proxy Server configuration can be found at -etc/proxy-server.conf-sample in the source code repository. - -The following configuration options are available: - -[DEFAULT] - -============================ =============== ============================= -Option Default Description ----------------------------- --------------- ----------------------------- -bind_ip 0.0.0.0 IP Address for server to - bind to -bind_port 80 Port for server to bind to -bind_timeout 30 Seconds to attempt bind before - giving up -swift_dir /etc/swift Swift configuration directory -workers 1 Number of workers to fork -user swift User to run as -cert_file Path to the ssl .crt. This - should be enabled for testing - purposes only. -key_file Path to the ssl .key. This - should be enabled for testing - purposes only. -cors_allow_origin This is a list of hosts that - are included with any CORS - request by default and - returned with the - Access-Control-Allow-Origin - header in addition to what - the container has set. -log_custom_handlers None Comma separated list of functions - to call to setup custom log - handlers. 
-eventlet_debug false If true, turn on debug logging - for eventlet -============================ =============== ============================= - -[proxy-server] - -============================ =============== ============================= -Option Default Description ----------------------------- --------------- ----------------------------- -use Entry point for paste.deploy for - the proxy server. For most - cases, this should be - `egg:swift#proxy`. -set log_name proxy-server Label used when logging -set log_facility LOG_LOCAL0 Syslog log facility -set log_level INFO Log level -set log_headers True If True, log headers in each - request -set log_handoffs True If True, the proxy will log - whenever it has to failover to a - handoff node -recheck_account_existence 60 Cache timeout in seconds to - send memcached for account - existence -recheck_container_existence 60 Cache timeout in seconds to - send memcached for container - existence -object_chunk_size 65536 Chunk size to read from - object servers -client_chunk_size 65536 Chunk size to read from - clients -memcache_servers 127.0.0.1:11211 Comma separated list of - memcached servers ip:port -node_timeout 10 Request timeout to external - services -client_timeout 60 Timeout to read one chunk - from a client -conn_timeout 0.5 Connection timeout to - external services -error_suppression_interval 60 Time in seconds that must - elapse since the last error - for a node to be considered - no longer error limited -error_suppression_limit 10 Error count to consider a - node error limited -allow_account_management false Whether account PUTs and DELETEs - are even callable -object_post_as_copy true Set object_post_as_copy = false - to turn on fast posts where only - the metadata changes are stored - anew and the original data file - is kept in place. This makes for - quicker posts; but since the - container metadata isn't updated - in this mode, features like - container sync won't be able to - sync posts. -account_autocreate false If set to 'true' authorized - accounts that do not yet exist - within the Swift cluster will - be automatically created. -max_containers_per_account 0 If set to a positive value, - trying to create a container - when the account already has at - least this maximum containers - will result in a 403 Forbidden. - Note: This is a soft limit, - meaning a user might exceed the - cap for - recheck_account_existence before - the 403s kick in. -max_containers_whitelist This is a comma separated list - of account names that ignore - the max_containers_per_account - cap. -rate_limit_after_segment 10 Rate limit the download of - large object segments after - this segment is downloaded. -rate_limit_segments_per_sec 1 Rate limit large object - downloads at this rate. -============================ =============== ============================= - -[tempauth] - -===================== =============================== ======================= -Option Default Description ---------------------- ------------------------------- ----------------------- -use Entry point for - paste.deploy to use for - auth. To use tempauth - set to: - `egg:swift#tempauth` -set log_name tempauth Label used when logging -set log_facility LOG_LOCAL0 Syslog log facility -set log_level INFO Log level -set log_headers True If True, log headers in - each request -reseller_prefix AUTH The naming scope for the - auth service. Swift - storage accounts and - auth tokens will begin - with this prefix. -auth_prefix /auth/ The HTTP request path - prefix for the auth - service. 
Swift itself - reserves anything - beginning with the - letter `v`. -token_life 86400 The number of seconds a - token is valid. -storage_url_scheme default Scheme to return with - storage urls: http, - https, or default - (chooses based on what - the server is running - as) This can be useful - with an SSL load - balancer in front of a - non-SSL server. -===================== =============================== ======================= - -Additionally, you need to list all the accounts/users you want here. The format -is:: - - user__ = [group] [group] [...] [storage_url] - -or if you want to be able to include underscores in the ```` or -```` portions, you can base64 encode them (with *no* equal signs) in a -line like this:: - - user64__ = [group] [group] [...] [storage_url] - -There are special groups of:: - - .reseller_admin = can do anything to any account for this auth - .admin = can do anything within the account - -If neither of these groups are specified, the user can only access containers -that have been explicitly allowed for them by a .admin or .reseller_admin. - -The trailing optional storage_url allows you to specify an alternate url to -hand back to the user upon authentication. If not specified, this defaults to:: - - $HOST/v1/_ - -Where $HOST will do its best to resolve to what the requester would need to use -to reach this host, is from this section, and is -from the user__ name. Note that $HOST cannot possibly handle -when you have a load balancer in front of it that does https while TempAuth -itself runs with http; in such a case, you'll have to specify the -storage_url_scheme configuration value as an override. - -Here are example entries, required for running the tests:: - - user_admin_admin = admin .admin .reseller_admin - user_test_tester = testing .admin - user_test2_tester2 = testing2 .admin - user_test_tester3 = testing3 - - # account "test_y" and user "tester_y" (note the lack of padding = chars) - user64_dGVzdF95_dGVzdGVyX3k = testing4 .admin ------------------------ Memcached Considerations ------------------------ -Several of the Services rely on Memcached for caching certain types of -lookups, such as auth tokens, and container/account existence. Swift does -not do any caching of actual object data. Memcached should be able to run -on any servers that have available RAM and CPU. At Rackspace, we run -Memcached on the proxy servers. The `memcache_servers` config option -in the `proxy-server.conf` should contain all memcached servers. +Several of the Services rely on Memcached for caching certain types of lookups, +such as auth tokens, and container/account existence. Swift does not do any +caching of actual object data. Memcached should be able to run on any servers +that have available RAM and CPU. Typically Memcached is run on the proxy +servers. The ``memcache_servers`` config option in the ``proxy-server.conf`` +should contain all memcached servers. + +************************* +Shard Range Listing Cache +************************* + +When a container gets :ref:`sharded` the root container will still be the +primary entry point to many container requests, as it provides the list of shards. +To take load off the root container Swift by default caches the list of shards returned. + +As the number of shards for a root container grows to more than 3k the memcache default max +size of 1MB can be reached. 
+ +If you over-run your max configured memcache size, you'll see messages like:: + + Error setting value in memcached: 127.0.0.1:11211: SERVER_ERROR object too large for cache + +When you see these messages, your root containers are getting hammered and +probably returning 503 responses to clients. Override the default 1MB limit to +5MB with something like:: + + /usr/bin/memcached -I 5000000 ... + +Memcache has a ``stats sizes`` option that can point out the current size usage. As this +reaches the current max, an increase might be in order:: + + # telnet <memcached server> 11211 + > stats sizes + STAT 160 2 + STAT 448 1 + STAT 576 1 + END + ----------- System Time @@ -734,30 +529,58 @@ Time may be relative but it is relatively important for Swift! Swift uses timestamps to determine which is the most recent version of an object. It is very important for the system time on each server in the cluster to be synced as closely as possible (more so for the proxy server, but in general -it is a good idea for all the servers). At Rackspace, we use NTP with a local -NTP server to ensure that the system times are as close as possible. This -should also be monitored to ensure that the times do not vary too much. +it is a good idea for all the servers). Typical deployments use NTP with a +local NTP server to ensure that the system times are as close as possible. +This should also be monitored to ensure that the times do not vary too much. + +.. _general-service-tuning: ---------------------- General Service Tuning ---------------------- -Most services support either a worker or concurrency value in the settings. -This allows the services to make effective use of the cores available. A good -starting point to set the concurrency level for the proxy and storage services -to 2 times the number of cores available. If more than one service is -sharing a server, then some experimentation may be needed to find the best -balance. - -At Rackspace, our Proxy servers have dual quad core processors, giving us 8 -cores. Our testing has shown 16 workers to be a pretty good balance when -saturating a 10g network and gives good CPU utilization. - -Our Storage servers all run together on the same servers. These servers have -dual quad core processors, for 8 cores total. We run the Account, Container, -and Object servers with 8 workers each. Most of the background jobs are run -at a concurrency of 1, with the exception of the replicators which are run at -a concurrency of 2. +Most services support either a ``workers`` or ``concurrency`` value in the +settings. This allows the services to make effective use of the cores +available. A good starting point is to set the concurrency level for the proxy +and storage services to 2 times the number of cores available. If more than +one service is sharing a server, then some experimentation may be needed to +find the best balance. + +For example, one operator reported using the following settings in a production +Swift cluster: + +- Proxy servers have dual quad core processors (i.e. 8 cores); testing has + shown 16 workers to be a pretty good balance when saturating a 10g network + and gives good CPU utilization. + +- Storage server processes all run together on the same servers. These servers + have dual quad core processors, for 8 cores total. The Account, Container, + and Object servers are run with 8 workers each. Most of the background jobs + are run at a concurrency of 1, with the exception of the replicators which + are run at a concurrency of 2 (these settings are sketched as configuration + below). 
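Expressed as configuration, the example reported above might look like the following minimal sketch. The file and section names follow the sample configuration files shipped with Swift; the particular worker and concurrency values are simply the numbers quoted above for 8-core machines, not general recommendations::

    # proxy-server.conf on an 8-core proxy node (2 x cores)
    [DEFAULT]
    workers = 16

    # object-server.conf on an 8-core storage node
    [DEFAULT]
    workers = 8

    [object-replicator]
    concurrency = 2

The account-server.conf and container-server.conf on the same storage node would set ``workers = 8`` analogously, and experimentation as described above should guide any further tuning.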
+ +The ``max_clients`` parameter can be used to adjust the number of client +requests an individual worker accepts for processing. The fewer requests being +processed at one time, the less likely a request that consumes the worker's +CPU time, or blocks in the OS, will negatively impact other requests. The more +requests being processed at one time, the more likely one worker can utilize +network and disk capacity. + +On systems that have more cores, and more memory, where one can afford to run +more workers, raising the number of workers and lowering the maximum number of +clients serviced per worker can lessen the impact of CPU intensive or stalled +requests. + +The ``nice_priority`` parameter can be used to set program scheduling priority. +The ``ionice_class`` and ``ionice_priority`` parameters can be used to set I/O scheduling +class and priority on the systems that use an I/O scheduler that supports +I/O priorities. As at kernel 2.6.17 the only such scheduler is the Completely +Fair Queuing (CFQ) I/O scheduler. If you run your Storage servers all together +on the same servers, you can slow down the auditors or prioritize +object-server I/O via these parameters (but probably do not need to change +it on the proxy). It is a new feature and the best practices are still +being developed. On some systems it may be required to run the daemons as root. +For more info also see setpriority(2) and ioprio_set(2). The above configuration setting should be taken as suggestions and testing of configuration settings should be done to ensure best utilization of CPU, @@ -773,43 +596,69 @@ thorough testing with our use cases and hardware configurations, XFS was the best all-around choice. If you decide to use a filesystem other than XFS, we highly recommend thorough testing. -If you are using XFS, some settings that can dramatically impact -performance. We recommend the following when creating the XFS -partition:: +For distros with more recent kernels (for example Ubuntu 12.04 Precise), +we recommend using the default settings (including the default inode size +of 256 bytes) when creating the file system:: + + mkfs.xfs -L D1 /dev/sda1 + +In the last couple of years, XFS has made great improvements in how inodes +are allocated and used. Using the default inode size no longer has an +impact on performance. + +For distros with older kernels (for example Ubuntu 10.04 Lucid), +some settings can dramatically impact performance. We recommend the +following when creating the file system:: - mkfs.xfs -i size=1024 -f /dev/sda1 + mkfs.xfs -i size=1024 -L D1 /dev/sda1 Setting the inode size is important, as XFS stores xattr data in the inode. If the metadata is too large to fit in the inode, a new extent is created, which can cause quite a performance problem. Upping the inode size to 1024 bytes provides enough room to write the default metadata, plus a little -headroom. We do not recommend running Swift on RAID, but if you are using -RAID it is also important to make sure that the proper sunit and swidth -settings get set so that XFS can make most efficient use of the RAID array. +headroom. 
-We also recommend the following example mount options when using XFS:: +The following example mount options are recommended when using XFS:: - mount -t xfs -o noatime,nodiratime,nobarrier,logbufs=8 /dev/sda1 /srv/node/sda + mount -t xfs -o noatime -L D1 /srv/node/d1 -For a standard swift install, all data drives are mounted directly under -/srv/node (as can be seen in the above example of mounting /def/sda1 as -/srv/node/sda). If you choose to mount the drives in another directory, -be sure to set the `devices` config option in all of the server configs to +We do not recommend running Swift on RAID, but if you are using +RAID it is also important to make sure that the proper sunit and swidth +settings get set so that XFS can make most efficient use of the RAID array. + +For a standard Swift install, all data drives are mounted directly under +``/srv/node`` (as can be seen in the above example of mounting label ``D1`` +as ``/srv/node/d1``). If you choose to mount the drives in another directory, +be sure to set the ``devices`` config option in all of the server configs to point to the correct directory. +The mount points for each drive in ``/srv/node/`` should be owned by the root user +almost exclusively (``root:root 755``). This is required to prevent rsync from +syncing files into the root drive in the event a drive is unmounted. + Swift uses system calls to reserve space for new objects being written into -the system. If your filesystem does not support `fallocate()` or -`posix_fallocate()`, be sure to set the `disable_fallocate = true` config +the system. If your filesystem does not support ``fallocate()`` or +``posix_fallocate()``, be sure to set the ``disable_fallocate = true`` config parameter in account, container, and object server configs. +Most current Linux distributions ship with a default installation of updatedb. +This tool runs periodically and updates the file name database that is used by +the GNU locate tool. However, including Swift object and container database +files is most likely not required and the periodic update affects the +performance quite a bit. To disable the inclusion of these files add the path +where Swift stores its data to the setting PRUNEPATHS in ``/etc/updatedb.conf``:: + + PRUNEPATHS="... /tmp ... /var/spool ... /srv/node" + + --------------------- General System Tuning --------------------- -Rackspace currently runs Swift on Ubuntu Server 10.04, and the following -changes have been found to be useful for our use cases. +The following changes have been found to be useful when running Swift on Ubuntu +Server 10.04. -The following settings should be in `/etc/sysctl.conf`:: +The following settings should be in ``/etc/sysctl.conf``:: # disable TIME_WAIT.. wait.. net.ipv4.tcp_tw_recycle=1 @@ -819,9 +668,9 @@ The following settings should be in `/etc/sysctl.conf`:: net.ipv4.tcp_syncookies = 0 # double amount of allowed conntrack - net.ipv4.netfilter.ip_conntrack_max = 262144 + net.netfilter.nf_conntrack_max = 262144 -To load the updated sysctl settings, run ``sudo sysctl -p`` +To load the updated sysctl settings, run ``sudo sysctl -p``. A note about changing the TIME_WAIT values. By default the OS will hold a port open for 60 seconds to ensure that any remaining packets can be @@ -835,7 +684,7 @@ Logging Considerations ---------------------- Swift is set up to log directly to syslog. Every service can be configured -with the `log_facility` option to set the syslog log facility destination. 
+with the ``log_facility`` option to set the syslog log facility destination. We recommended using syslog-ng to route the logs to specific log files locally on the server and also to remote log collecting servers. Additionally, custom log handlers can be used via the custom_log_handlers diff --git a/doc/source/development_auth.rst b/doc/source/development_auth.rst index e913850b5a..53c50b6743 100644 --- a/doc/source/development_auth.rst +++ b/doc/source/development_auth.rst @@ -37,7 +37,7 @@ will be omitted. It is highly recommended that authentication server implementers prefix their tokens and Swift storage accounts they create with a configurable reseller -prefix (`AUTH_` by default with the included TempAuth). This prefix will avoid +prefix (``AUTH_`` by default with the included TempAuth). This prefix will avoid conflicts with other authentication servers that might be using the same Swift cluster. Otherwise, the Swift cluster will have to try all the resellers until one validates a token or all fail. @@ -48,18 +48,18 @@ designations as you'll see later). Example Authentication with TempAuth: - * Token AUTH_tkabcd is given to the TempAuth middleware in a request's - X-Auth-Token header. - * The TempAuth middleware validates the token AUTH_tkabcd and discovers - it matches the "tester" user within the "test" account for the storage - account "AUTH_storage_xyz". - * The TempAuth middleware sets the REMOTE_USER to - "test:tester,test,AUTH_storage_xyz" - * Now this user will have full access (via authorization procedures later) - to the AUTH_storage_xyz Swift storage account and access to containers in - other storage accounts, provided the storage account begins with the same - `AUTH_` reseller prefix and the container has an ACL specifying at least - one of those three groups. +* Token AUTH_tkabcd is given to the TempAuth middleware in a request's + X-Auth-Token header. +* The TempAuth middleware validates the token AUTH_tkabcd and discovers + it matches the "tester" user within the "test" account for the storage + account "AUTH_storage_xyz". +* The TempAuth middleware sets the REMOTE_USER to + "test:tester,test,AUTH_storage_xyz" +* Now this user will have full access (via authorization procedures later) + to the AUTH_storage_xyz Swift storage account and access to containers in + other storage accounts, provided the storage account begins with the same + ``AUTH_`` reseller prefix and the container has an ACL specifying at least + one of those three groups. Authorization is performed through callbacks by the Swift Proxy server to the WSGI environment's swift.authorize value, if one is set. 
The swift.authorize @@ -352,7 +352,7 @@ folks a start on their own code if they want to use repoze.what:: self.ssl = \ conf.get('ssl', 'false').lower() in ('true', 'on', '1', 'yes') self.auth_prefix = conf.get('prefix', '/') - self.timeout = int(conf.get('node_timeout', 10)) + self.timeout = float(conf.get('node_timeout', 10)) def authenticate(self, env, identity): token = identity.get('token') @@ -375,7 +375,7 @@ folks a start on their own code if they want to use repoze.what:: expiration = float(resp.getheader('x-auth-ttl')) user = resp.getheader('x-auth-user') memcache_client.set(key, (time(), expiration, user), - timeout=expiration) + time=expiration) return user return None @@ -487,7 +487,8 @@ folks a start on their own code if they want to use repoze.what:: Allowing CORS with Auth ----------------------- -Cross Origin RequestS require that the auth system allow the OPTIONS method to -pass through without a token. The preflight request will make an OPTIONS call -against the object or container and will not work if the auth system stops it. +Cross Origin Resource Sharing (CORS) require that the auth system allow the +OPTIONS method to pass through without a token. The preflight request will +make an OPTIONS call against the object or container and will not work if +the auth system stops it. See TempAuth for an example of how OPTIONS requests are handled. diff --git a/doc/source/development_guidelines.rst b/doc/source/development_guidelines.rst index 5682233bfc..6af3e49513 100644 --- a/doc/source/development_guidelines.rst +++ b/doc/source/development_guidelines.rst @@ -6,65 +6,301 @@ Development Guidelines Coding Guidelines ----------------- -For the most part we try to follow PEP 8 guidelines which can be viewed +For the most part we try to follow PEP 8 guidelines which can be viewed here: http://www.python.org/dev/peps/pep-0008/ -There is a useful pep8 command line tool for checking files for pep8 -compliance which can be installed with ``easy_install pep8``. - ------------------ Testing Guidelines ------------------ -Swift has a comprehensive suite of tests that are run on all submitted code, -and it is recommended that developers execute the tests themselves to -catch regressions early. Developers are also expected to keep the -test suite up-to-date with any submitted code changes. +Swift has a comprehensive suite of tests and pep8 checks that are run on all +submitted code, and it is recommended that developers execute the tests +themselves to catch regressions early. Developers are also expected to keep +the test suite up-to-date with any submitted code changes. + +Swift's tests and pep8 checks can be executed in an isolated environment +with ``tox``: http://tox.testrun.org/ + +To execute the tests: + +* Ensure ``pip`` and ``virtualenv`` are upgraded to satisfy the version + requirements listed in the OpenStack `global requirements`_:: + + pip install pip -U + pip install virtualenv -U + +.. _`global requirements`: https://github.com/openstack/requirements/blob/master/global-requirements.txt + +* Install ``tox``:: + + pip install tox + +* Generate list of distribution packages to install for testing:: + + tox -e bindep + + Now install these packages using your distribution package manager + like apt-get, dnf, yum, or zypper. + +* Run ``tox`` from the root of the swift repo:: + + tox + +To run a selected subset of unit tests with ``pytest``: + +* Create a virtual environment with ``tox``:: + + tox devenv -e py3 .env + +.. 
note:: + Alternatively, here are the steps of manual preparation of the virtual environment:: + + virtualenv .env + source .env/bin/activate + pip3 install -r requirements.txt -r test-requirements.txt -c py36-constraints.txt + pip3 install -e . + deactivate + +* Activate the virtual environment:: + + source .env/bin/activate + +* Run some unit tests, for example:: + + pytest test/unit/common/middleware/crypto + +* Run all unit tests:: + + pytest test/unit + +.. note:: + If you installed using ``cd ~/swift; sudo python setup.py develop``, you may + need to do ``cd ~/swift; sudo chown -R ${USER}:${USER} swift.egg-info`` prior + to running ``tox``. + +* By default ``tox`` will run **all of the unit test** and pep8 checks listed in + the ``tox.ini`` file ``envlist`` option. A subset of the test environments + can be specified on the ``tox`` command line or by setting the ``TOXENV`` + environment variable. For example, to run only the pep8 checks and python3 + unit tests use:: + + tox -e pep8,py3 + + or:: + + TOXENV=py3,pep8 tox + + To run unit tests with python3.12 specifically:: + + tox -e py312 + +.. note:: + As of ``tox`` version 2.0.0, most environment variables are not automatically + passed to the test environment. Swift's ``tox.ini`` overrides this default + behavior so that variable names matching ``SWIFT_*`` and ``*_proxy`` will be + passed, but you may need to run ``tox --recreate`` for this to take effect + after upgrading from ``tox`` <2.0.0. + + Conversely, if you do not want those environment variables to be passed to + the test environment then you will need to unset them before calling ``tox``. + + Also, if you ever encounter DistributionNotFound, try to use ``tox + --recreate`` or remove the ``.tox`` directory to force ``tox`` to recreate the + dependency list. + + Swift's tests require having an XFS directory available in ``/tmp`` or + in the ``TMPDIR`` environment variable. + +Swift's functional tests may be executed against a :doc:`development_saio` or +other running Swift cluster using the command:: + + tox -e func + +The endpoint and authorization credentials to be used by functional tests +should be configured in the ``test.conf`` file as described in the section +:ref:`setup_scripts`. + +The environment variable ``SWIFT_TEST_POLICY`` may be set to specify a +particular storage policy *name* that will be used for testing. When set, tests +that would otherwise not specify a policy or choose a random policy from +those available will instead use the policy specified. Tests that use more than +one policy will include the specified policy in the set of policies used. The +specified policy must be available on the cluster under test. -Swift's suite of unit tests can be executed in an isolated environment -with Tox: http://tox.testrun.org/ +For example, this command would run the functional tests using policy +'silver':: -To execute the unit tests: + SWIFT_TEST_POLICY=silver tox -e func -* Install Tox: +To run a single functional test, use the ``--no-discover`` option together with +a path to a specific test method, for example:: - - `pip install tox` + tox -e func -- --no-discover test.functional.tests.TestFile.testCopy -* Run Tox from the root of the swift repo: - - `tox` +In-process functional testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Optionally, run only specific tox builds: +If the ``test.conf`` file is not found then the functional test framework will +instantiate a set of Swift servers in the same process that executes the +functional tests. 
This 'in-process test' mode may also be enabled (or disabled) +by setting the environment variable ``SWIFT_TEST_IN_PROCESS`` to a true (or +false) value prior to executing ``tox -e func``. - - `tox -e pep8,py26` +When using the 'in-process test' mode some server configuration options may be +set using environment variables: + +- the optional in-memory object server may be selected by setting the + environment variable ``SWIFT_TEST_IN_MEMORY_OBJ`` to a true value. + +- encryption may be added to the proxy pipeline by setting the + environment variable ``SWIFT_TEST_IN_PROCESS_CONF_LOADER`` to + ``encryption``. + +- a 2+1 EC policy may be installed as the default policy by setting the + environment variable ``SWIFT_TEST_IN_PROCESS_CONF_LOADER`` to + ``ec``. + +- logging to stdout may be enabled by setting ``SWIFT_TEST_DEBUG_LOGS``. + +For example, this command would run the in-process mode functional tests with +encryption enabled in the proxy-server:: + + SWIFT_TEST_IN_PROCESS=1 SWIFT_TEST_IN_PROCESS_CONF_LOADER=encryption \ + tox -e func + +This particular example may also be run using the ``func-encryption`` +tox environment:: + + tox -e func-encryption + +The ``tox.ini`` file also specifies test environments for running other +in-process functional test configurations, e.g.:: + + tox -e func-ec + +To debug the functional tests, use the 'in-process test' mode and pass the +``--pdb`` flag to ``tox``:: + + SWIFT_TEST_IN_PROCESS=1 tox -e func -- --pdb \ + test.functional.tests.TestFile.testCopy + +The 'in-process test' mode searches for ``proxy-server.conf`` and +``swift.conf`` config files from which it copies config options and overrides +some options to suit in process testing. The search will first look for config +files in a ```` that may optionally be specified using +the environment variable:: + + SWIFT_TEST_IN_PROCESS_CONF_DIR= + +If ``SWIFT_TEST_IN_PROCESS_CONF_DIR`` is not set, or if a config file is not +found in ````, the search will then look in the +``etc/`` directory in the source tree. If the config file is still not found, +the corresponding sample config file from ``etc/`` is used (e.g. +``proxy-server.conf-sample`` or ``swift.conf-sample``). + +When using the 'in-process test' mode ``SWIFT_TEST_POLICY`` may be set to +specify a particular storage policy *name* that will be used for testing as +described above. When set, this policy must exist in the ``swift.conf`` file +and its corresponding ring file must exist in ```` (if +specified) or ``etc/``. The test setup will set the specified policy to be the +default and use its ring file properties for constructing the test object ring. +This allows in-process testing to be run against various policy types and ring +files. + +For example, this command would run the in-process mode functional tests +using config files found in ``$HOME/my_tests`` and policy 'silver':: + + SWIFT_TEST_IN_PROCESS=1 SWIFT_TEST_IN_PROCESS_CONF_DIR=$HOME/my_tests \ + SWIFT_TEST_POLICY=silver tox -e func + + +S3 API cross-compatibility tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The cross-compatibility tests in directory `test/s3api` are intended to verify +that the Swift S3 API behaves in the same way as the AWS S3 API. They should +pass when run against either a Swift endpoint (with S3 API enabled) or an AWS +S3 endpoint. + +To run against an AWS S3 endpoint, the `/etc/swift/test.conf` file must be +edited to provide AWS key IDs and secrets. 
Alternatively, an AWS CLI style +credentials file can be loaded by setting the ``SWIFT_TEST_AWS_CONFIG_FILE`` +environment variable, e.g.:: + + SWIFT_TEST_AWS_CONFIG_FILE=~/.aws/credentials pytest ./test/s3api + +.. note:: + When using ``SWIFT_TEST_AWS_CONFIG_FILE``, the region defaults to + ``us-east-1`` and only the default credentials are loaded. + + +------------ +Coding Style +------------ + +Swift uses flake8 with the OpenStack `hacking`_ module to enforce +coding style. + +Install flake8 and hacking with pip or by the packages of your +Operating System. + +It is advised to integrate flake8+hacking with your editor to get it +automated and not get `caught` by Jenkins. + +For example for Vim the `syntastic`_ plugin can do this for you. + +.. _`hacking`: https://pypi.org/project/hacking +.. _`syntastic`: https://github.com/scrooloose/syntastic ------------------------ Documentation Guidelines ------------------------ -The documentation in docstrings should follow the PEP 257 conventions +The documentation in docstrings should follow the PEP 257 conventions (as mentioned in the PEP 8 guidelines). More specifically: - 1. Triple qutes should be used for all docstrings. - 2. If the docstring is simple and fits on one line, then just use - one line. - 3. For docstrings that take multiple lines, there should be a newline - after the opening quotes, and before the closing quotes. - 4. Sphinx is used to build documentation, so use the restructured text - markup to designate parameters, return values, etc. Documentation on - the sphinx specific markup can be found here: - http://sphinx.pocoo.org/markup/index.html +#. Triple quotes should be used for all docstrings. +#. If the docstring is simple and fits on one line, then just use + one line. +#. For docstrings that take multiple lines, there should be a newline + after the opening quotes, and before the closing quotes. +#. Sphinx is used to build documentation, so use the restructured text + markup to designate parameters, return values, etc. Documentation on + the sphinx specific markup can be found here: + https://www.sphinx-doc.org/en/master/ + +To build documentation run:: + + pip install -r requirements.txt -r doc/requirements.txt + sphinx-build -W -b html doc/source doc/build/html + +and then browse to doc/build/html/index.html. These docs are auto-generated +after every commit and available online at +https://docs.openstack.org/swift/latest/. + +-------- +Manpages +-------- + +For sanity check of your change in manpage, use this command in the root +of your Swift repo:: + + ./.manpages --------------------- License and Copyright --------------------- -Every source file should have the following copyright and license statement at -the top:: +You can have the following copyright and license statement at +the top of each source file. Copyright assignment is optional. + +New files should contain the current year. Substantial updates can have +another year added, and date ranges are not needed.:: - # Copyright (c) 2010-2012 OpenStack, LLC. + # Copyright (c) 2013 OpenStack Foundation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
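
As an illustration of the docstring conventions listed above (items 3 and 4
in particular), a hypothetical helper might be documented like this, with a
newline after the opening quotes and Sphinx field markup for parameters,
return values and exceptions::

    def put_object(path, body, timeout=10.0):
        """
        Store an object at the given path.

        :param path: full object path of the form /account/container/object
        :param body: bytes to store
        :param timeout: seconds to wait for the back end to respond
        :returns: the MD5 etag of the stored data
        :raises ValueError: if ``path`` is malformed
        """
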
diff --git a/doc/source/development_middleware.rst b/doc/source/development_middleware.rst new file mode 100644 index 0000000000..41e6ace20c --- /dev/null +++ b/doc/source/development_middleware.rst @@ -0,0 +1,383 @@ +======================= +Middleware and Metadata +======================= + +---------------- +Using Middleware +---------------- + +`Python WSGI Middleware`_ (or just "middleware") can be used to "wrap" +the request and response of a Python WSGI application (i.e. a webapp, +or REST/HTTP API), like Swift's WSGI servers (proxy-server, +account-server, container-server, object-server). Swift uses middleware +to add (sometimes optional) behaviors to the Swift WSGI servers. + +.. _Python WSGI Middleware: http://www.python.org/dev/peps/pep-0333/#middleware-components-that-play-both-sides + +Middleware can be added to the Swift WSGI servers by modifying their +`paste`_ configuration file. The majority of Swift middleware is applied +to the :ref:`proxy-server`. + +.. _paste: https://pypi.org/project/Paste/ + +Given the following basic configuration:: + + [DEFAULT] + log_level = DEBUG + user = + + [pipeline:main] + pipeline = proxy-server + + [app:proxy-server] + use = egg:swift#proxy + +You could add the :ref:`healthcheck` middleware by adding a section for +that filter and adding it to the pipeline:: + + [DEFAULT] + log_level = DEBUG + user = + + [pipeline:main] + pipeline = healthcheck proxy-server + + [filter:healthcheck] + use = egg:swift#healthcheck + + [app:proxy-server] + use = egg:swift#proxy + + +Some middleware is required and will be inserted into your pipeline +automatically by core swift code (e.g. the proxy-server will insert +:ref:`catch_errors` and :ref:`gatekeeper` at the start of the pipeline if they +are not already present). You can see which features are available on a given +Swift endpoint (including middleware) using the :ref:`discoverability` +interface. + + +---------------------------- +Creating Your Own Middleware +---------------------------- + +The best way to see how to write middleware is to look at examples. + +Many optional features in Swift are implemented as +:ref:`common_middleware` and provided in ``swift.common.middleware``, but +Swift middleware may be packaged and distributed as a separate project. +Some examples are listed on the :ref:`associated_projects` page. + +A contrived middleware example that modifies request behavior by +inspecting custom HTTP headers (e.g. 
X-Webhook) and uses :ref:`sysmeta` +to persist data to backend storage as well as common patterns like a +:func:`.get_container_info` cache/query and :func:`.wsgify` decorator is +presented below:: + + from swift.common.http import is_success + from swift.common.swob import wsgify + from swift.common.utils import split_path, get_logger + from swift.common.request_helpers import get_sys_meta_prefix + from swift.proxy.controllers.base import get_container_info + from eventlet import Timeout + from eventlet.green.urllib import urllib_request + + # x-container-sysmeta-webhook + SYSMETA_WEBHOOK = get_sys_meta_prefix('container') + 'webhook' + + + class WebhookMiddleware(object): + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route='webhook') + + @wsgify + def __call__(self, req): + obj = None + try: + (version, account, container, obj) = \ + split_path(req.path_info, 4, 4, True) + except ValueError: + # not an object request + pass + if 'x-webhook' in req.headers: + # translate user's request header to sysmeta + req.headers[SYSMETA_WEBHOOK] = \ + req.headers['x-webhook'] + if 'x-remove-webhook' in req.headers: + # empty value will tombstone sysmeta + req.headers[SYSMETA_WEBHOOK] = '' + # account and object storage will ignore x-container-sysmeta-* + resp = req.get_response(self.app) + if obj and is_success(resp.status_int) and req.method == 'PUT': + container_info = get_container_info(req.environ, self.app) + # container_info may have our new sysmeta key + webhook = container_info['sysmeta'].get('webhook') + if webhook: + # create a POST request with obj name as body + webhook_req = urllib_request.Request(webhook, data=obj) + with Timeout(20): + try: + urllib_request.urlopen(webhook_req).read() + except (Exception, Timeout): + self.logger.exception( + 'failed POST to webhook %s' % webhook) + else: + self.logger.info( + 'successfully called webhook %s' % webhook) + if 'x-container-sysmeta-webhook' in resp.headers: + # translate sysmeta from the backend resp to + # user-visible client resp header + resp.headers['x-webhook'] = resp.headers[SYSMETA_WEBHOOK] + return resp + + + def webhook_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def webhook_filter(app): + return WebhookMiddleware(app, conf) + return webhook_filter + +In practice this middleware will call the URL stored on the container as +X-Webhook on all successful object uploads. + +If this example was at ``/swift/common/middleware/webhook.py`` - +you could add it to your proxy by creating a new filter section and +adding it to the pipeline:: + + [DEFAULT] + log_level = DEBUG + user = + + [pipeline:main] + pipeline = healthcheck webhook proxy-server + + [filter:webhook] + paste.filter_factory = swift.common.middleware.webhook:webhook_factory + + [filter:healthcheck] + use = egg:swift#healthcheck + + [app:proxy-server] + use = egg:swift#proxy + +Most python packages expose middleware as entrypoints. See `PasteDeploy`_ +documentation for more information about the syntax of the ``use`` option. +All middleware included with Swift is installed to support the ``egg:swift`` +syntax. + +.. 
_PasteDeploy: https://pypi.org/project/PasteDeploy/ + +Middleware may advertize its availability and capabilities via Swift's +:ref:`discoverability` support by using +:func:`.register_swift_info`:: + + from swift.common.registry import register_swift_info + def webhook_factory(global_conf, **local_conf): + register_swift_info('webhook') + def webhook_filter(app): + return WebhookMiddleware(app) + return webhook_filter + +If a middleware handles sensitive information in headers or query parameters +that may need redaction when logging, use the :func:`.register_sensitive_header` +and :func:`.register_sensitive_param` functions. This should be done in the +filter factory:: + + from swift.common.registry import register_sensitive_header + def webhook_factory(global_conf, **local_conf): + register_sensitive_header('webhook-api-key') + def webhook_filter(app): + return WebhookMiddleware(app) + return webhook_filter + + +Middlewares can override the status integer that is logged by proxy_logging +middleware by setting ``swift.proxy_logging_status`` in the request WSGI +environment. The value should be an integer. The value will replace the default +status integer in the log message, unless the proxy_logging middleware detects +a client disconnect or exception while handling the request, in which case +``swift.proxy_logging_status`` is overridden by a 499 or 500 respectively. + +-------------- +Swift Metadata +-------------- + +Generally speaking metadata is information about a resource that is +associated with the resource but is not the data contained in the +resource itself - which is set and retrieved via HTTP headers. (e.g. the +"Content-Type" of a Swift object that is returned in HTTP response +headers) + +All user resources in Swift (i.e. account, container, objects) can have +user metadata associated with them. Middleware may also persist custom +metadata to accounts and containers safely using System Metadata. Some +core Swift features which predate sysmeta have added exceptions for +custom non-user metadata headers (e.g. :ref:`acls`, +:ref:`large-objects`) + +.. _usermeta: + +^^^^^^^^^^^^^ +User Metadata +^^^^^^^^^^^^^ + +User metadata takes the form of ``X--Meta-: ``, where +```` depends on the resources type (i.e. Account, Container, Object) +and ```` and ```` are set by the client. + +User metadata should generally be reserved for use by the client or +client applications. A perfect example use-case for user metadata is +`python-swiftclient`_'s ``X-Object-Meta-Mtime`` which it stores on +object it uploads to implement its ``--changed`` option which will only +upload files that have changed since the last upload. + +.. _python-swiftclient: https://opendev.org/openstack/python-swiftclient + +New middleware should avoid storing metadata within the User Metadata +namespace to avoid potential conflict with existing user metadata when +introducing new metadata keys. An example of legacy middleware that +borrows the user metadata namespace is :ref:`tempurl`. An example of +middleware which uses custom non-user metadata to avoid the user +metadata namespace is :ref:`slo-doc`. + +User metadata that is stored by a PUT or POST request to a container or account +resource persists until it is explicitly removed by a subsequent PUT or POST +request that includes a header ``X--Meta-`` with no value or a +header ``X-Remove--Meta-: ``. In the latter case the +```` is not stored. All user metadata stored with an account or +container resource is deleted when the account or container is deleted. 
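
As a concrete illustration of these update and removal semantics, the
following sketch uses python-swiftclient against a hypothetical SAIO-style
endpoint with TempAuth credentials (adjust the auth URL, user and key for
your own cluster)::

    from swiftclient.client import Connection

    conn = Connection(authurl='http://127.0.0.1:8080/auth/v1.0',
                      user='test:tester', key='testing')

    conn.put_container('photos')

    # Set one item of user metadata on the container.
    conn.post_container('photos', {'X-Container-Meta-Owner': 'alice'})
    print(conn.head_container('photos').get('x-container-meta-owner'))

    # Either an empty value or the X-Remove-* form deletes the item;
    # other existing metadata items on the container are left untouched.
    conn.post_container('photos', {'X-Remove-Container-Meta-Owner': 'yes'})
    print(conn.head_container('photos').get('x-container-meta-owner'))
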
+ +User metadata that is stored with an object resource has a different semantic; +object user metadata persists until any subsequent PUT or POST request is made +to the same object, at which point all user metadata stored with that object is +deleted en-masse and replaced with any user metadata included with the PUT or +POST request. As a result, it is not possible to update a subset of the user +metadata items stored with an object while leaving some items unchanged. + +.. _sysmeta: + +^^^^^^^^^^^^^^^^^^^^^^^^^ +System Metadata (Sysmeta) +^^^^^^^^^^^^^^^^^^^^^^^^^ + +System metadata takes the form of ``X--Sysmeta-: ``, +where ```` depends on the resources type (i.e. Account, Container, +Object) and ```` and ```` are set by trusted code running in a +Swift WSGI Server. + +All headers on client requests in the form of ``X--Sysmeta-`` +will be dropped from the request before being processed by any +middleware. All headers on responses from back-end systems in the form +of ``X--Sysmeta-`` will be removed after all middlewares have +processed the response but before the response is sent to the client. +See :ref:`gatekeeper` middleware for more information. + +System metadata provides a means to store potentially private custom +metadata with associated Swift resources in a safe and secure fashion +without actually having to plumb custom metadata through the core swift +servers. The incoming filtering ensures that the namespace can not be +modified directly by client requests, and the outgoing filter ensures +that removing middleware that uses a specific system metadata key +renders it benign. New middleware should take advantage of system +metadata. + +System metadata may be set on accounts and containers by including headers with +a PUT or POST request. Where a header name matches the name of an existing item +of system metadata, the value of the existing item will be updated. Otherwise +existing items are preserved. A system metadata header with an empty value will +cause any existing item with the same name to be deleted. + +System metadata may be set on objects using only PUT requests. All items of +existing system metadata will be deleted and replaced en-masse by any system +metadata headers included with the PUT request. System metadata is neither +updated nor deleted by a POST request: updating individual items of system +metadata with a POST request is not yet supported in the same way that updating +individual items of user metadata is not supported. In cases where middleware +needs to store its own metadata with a POST request, it may use Object Transient +Sysmeta. + + +^^^^^^^^^^^^^^^ +Object Metadata +^^^^^^^^^^^^^^^ + +Objects have other metadata in addition to the user metadata and system +metadata described above. + +****************** +Immutable Metadata +****************** + +Objects have several items of immutable metadata. Like system metadata, these +may only be set using PUT requests. However, they do not follow the general +``X-Object-Sysmeta-`` naming scheme and they are not automatically removed +from client responses. + +Object immutable metadata includes:: + + X-Timestamp + Content-Length + Etag + +``X-Timestamp`` and ``Content-Length`` metadata MUST be included in PUT +requests to object servers. ``Etag`` metadata is generated by object servers +when they handle a PUT request, but checked against any ``Etag`` header sent +with the PUT request. 
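
The ``Etag`` check described above can be exercised from the client side: if
a PUT includes an ETag header, the object server compares it against the MD5
of the bytes it actually received. A minimal sketch with python-swiftclient,
reusing the hypothetical container and credentials from the earlier example::

    from hashlib import md5
    from swiftclient.client import Connection

    conn = Connection(authurl='http://127.0.0.1:8080/auth/v1.0',
                      user='test:tester', key='testing')

    body = b'some object data'
    # Sending the MD5 hex digest lets the object server verify the upload;
    # a mismatch makes the PUT fail (typically 422 Unprocessable Entity).
    etag = conn.put_object('photos', 'obj1', contents=body,
                           etag=md5(body).hexdigest())
    print(etag)  # the Etag that is now stored as immutable metadata
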
+ +Object immutable metadata, along with ``Content-Type``, is the only object +metadata that is stored by container servers and returned in object listings. + +************ +Content-Type +************ + +Object ``Content-Type`` metadata is treated differently from immutable +metadata, system metadata and user metadata. + +``Content-Type`` MUST be included in PUT requests to object servers. Unlike +immutable metadata or system metadata, ``Content-Type`` is mutable and may be +included in POST requests to object servers. However, unlike object user +metadata, existing ``Content-Type`` metadata persists if a POST request does +not include new ``Content-Type`` metadata. This is because an object must have +``Content-Type`` metadata, which is also stored by container servers and +returned in object listings. + +``Content-Type`` is the only item of object metadata that is both mutable and +yet also persists when not specified in a POST request. + +.. _transient_sysmeta: + +************************ +Object Transient-Sysmeta +************************ + +If middleware needs to store object metadata with a POST request it may do so +using headers of the form ``X-Object-Transient-Sysmeta-: ``. + +All headers on client requests in the form of +``X-Object-Transient-Sysmeta-`` will be dropped from the request before +being processed by any middleware. All headers on responses from back-end +systems in the form of ``X-Object-Transient-Sysmeta-`` will be removed +after all middlewares have processed the response but before the response is +sent to the client. See :ref:`gatekeeper` middleware for more information. + +Transient-sysmeta updates on an object have the same semantic as user +metadata updates on an object (see :ref:`usermeta`) i.e. whenever any PUT or +POST request is made to an object, all existing items of transient-sysmeta are +deleted en-masse and replaced with any transient-sysmeta included with the PUT +or POST request. Transient-sysmeta set by a middleware is therefore prone to +deletion by a subsequent client-generated POST request unless the middleware is +careful to include its transient-sysmeta with every POST. Likewise, user +metadata set by a client is prone to deletion by a subsequent +middleware-generated POST request, and for that reason middleware should avoid +generating POST requests that are independent of any client request. + +Transient-sysmeta deliberately uses a different header prefix to user metadata +so that middlewares can avoid potential conflict with user metadata keys. + +Transient-sysmeta deliberately uses a different header prefix to system +metadata to emphasize the fact that the data is only persisted until a +subsequent POST. diff --git a/doc/source/development_ondisk_backends.rst b/doc/source/development_ondisk_backends.rst new file mode 100644 index 0000000000..14934d7b6c --- /dev/null +++ b/doc/source/development_ondisk_backends.rst @@ -0,0 +1,39 @@ +=============================== +Pluggable On-Disk Back-end APIs +=============================== + +The internal REST API used between the proxy server and the account, container +and object server is almost identical to public Swift REST API, but with a few +internal extensions (for example, update an account with a new container). + +The pluggable back-end APIs for the three REST API servers (account, +container, object) abstracts the needs for servicing the various REST APIs +from the details of how data is laid out and stored on-disk. 
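
To make that separation concrete, here is a deliberately simplified sketch of
the idea; it is *not* the real ``swift.obj.diskfile`` interface, only an
illustration of how a REST-facing server can be written against a small
back-end API while the storage layout (here: an in-memory dict) stays hidden
behind it::

    class InMemoryDiskFile(object):
        """Toy back end that keeps object data and metadata in a dict."""

        _store = {}

        def __init__(self, account, container, obj):
            self.key = (account, container, obj)

        def write(self, body, metadata):
            self._store[self.key] = (body, dict(metadata))

        def read(self):
            return self._store[self.key]


    # A server written against this API never needs to know how or where
    # the data is actually kept.
    df = InMemoryDiskFile('AUTH_test', 'photos', 'obj1')
    df.write(b'data', {'Content-Type': 'text/plain'})
    print(df.read())
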
+ +The APIs are documented in the reference implementations for all three +servers. For historical reasons, the object server backend reference +implementation module is named ``diskfile``, while the account and container +server backend reference implementation modules are named appropriately. + +This API is still under development and not yet finalized. + +----------------------------------------- +Back-end API for Account Server REST APIs +----------------------------------------- +.. automodule:: swift.account.backend + :noindex: + :members: + +------------------------------------------- +Back-end API for Container Server REST APIs +------------------------------------------- +.. automodule:: swift.container.backend + :noindex: + :members: + +---------------------------------------- +Back-end API for Object Server REST APIs +---------------------------------------- +.. automodule:: swift.obj.diskfile + :noindex: + :members: diff --git a/doc/source/development_saio.rst b/doc/source/development_saio.rst old mode 100755 new mode 100644 index 18de01c105..a9f1e6fffe --- a/doc/source/development_saio.rst +++ b/doc/source/development_saio.rst @@ -1,56 +1,115 @@ +.. _saio: + ======================= -SAIO - Swift All In One +SAIO (Swift All In One) ======================= +.. note:: + This guide assumes an existing Linux server. A physical machine or VM will + work. We recommend configuring it with at least 2GB of memory and 40GB of + storage space. We recommend using a VM in order to isolate Swift and its + dependencies from other projects you may be working on. + --------------------------------------------- Instructions for setting up a development VM --------------------------------------------- -This section documents setting up a virtual machine for doing Swift development. -The virtual machine will emulate running a four node Swift cluster. +This section documents setting up a virtual machine for doing Swift +development. The virtual machine will emulate running a four node Swift +cluster. To begin: -* Get the *Ubuntu 10.04 LTS (Lucid Lynx)* server image: +* Get a Linux system server image, this guide will cover: - - Ubuntu Server ISO: http://releases.ubuntu.com/lucid/ubuntu-10.04.4-server-amd64.iso (717 MB) - - Ubuntu Live/Install: http://cdimage.ubuntu.com/releases/lucid/release/ubuntu-10.04.4-dvd-amd64.iso (4.2 GB) - - Ubuntu Mirrors: https://launchpad.net/ubuntu/+cdmirrors + * Ubuntu 24.04 LTS + * CentOS Stream 9 + * Fedora + * OpenSuse -* Create guest virtual machine from the Ubuntu image. +- Create guest virtual machine from the image. -Additional information about setting up a Swift development snapshot on other distributions is -available on the wiki at http://wiki.openstack.org/SAIOInstructions. +---------------------------- +What's in a +---------------------------- ------------------------------------------ -Installing dependencies and the core code ------------------------------------------ -* As root on guest (you'll have to log in as you, then `sudo su -`): +Much of the configuration described in this guide requires escalated +administrator (``root``) privileges; however, we assume that administrator logs +in as an unprivileged user and can use ``sudo`` to run privileged commands. + +Swift processes also run under a separate user and group, set by configuration +option, and referenced as ``:``. The default user +is ``swift``, which may not exist on your system. These instructions are +intended to allow a developer to use his/her username for +``:``. + +.. 
note:: + For OpenSuse users, a user's primary group is ``users``, so you have 2 options: + + * Change ``${USER}:${USER}`` to ``${USER}:users`` in all references of this guide; or + * Create a group for your username and add yourself to it:: + + sudo groupadd ${USER} && sudo gpasswd -a ${USER} ${USER} && newgrp ${USER} + +----------------------- +Installing dependencies +----------------------- + +* On ``apt`` based systems:: + + sudo apt-get update + sudo apt-get install curl gcc memcached rsync sqlite3 xfsprogs \ + git-core libffi-dev python3-setuptools \ + liberasurecode-dev libssl-dev + sudo apt-get install python3-coverage python3-dev python3-pytest \ + python3-xattr python3-eventlet \ + python3-greenlet python3-pastedeploy \ + python3-pip python3-dnspython + +* On ``CentOS`` (requires additional repositories):: + + sudo dnf update + sudo dnf install epel-release + sudo dnf config-manager --enable epel extras + sudo dnf install centos-release-openstack-epoxy + sudo dnf install curl gcc memcached rsync-daemon sqlite xfsprogs git-core \ + libffi-devel liberasurecode-devel \ + openssl-devel python3-setuptools \ + python3-coverage python3-devel python3-pytest \ + python3-pyxattr python3-eventlet \ + python3-greenlet python3-paste-deploy \ + python3-pip python3-dns + +* On ``Fedora``:: + + sudo dnf update + sudo dnf install curl gcc memcached rsync-daemon sqlite xfsprogs git-core \ + libffi-devel liberasurecode-devel python3-pyeclib \ + openssl-devel python3-setuptools \ + python3-coverage python3-devel python3-pytest \ + python3-pyxattr python3-eventlet \ + python3-greenlet python3-paste-deploy \ + python3-pip python3-dns - #. `apt-get install python-software-properties` - #. `add-apt-repository ppa:swift-core/release` - #. `apt-get update` - #. `apt-get install curl gcc git-core memcached python-coverage python-dev - python-nose python-setuptools python-simplejson python-xattr sqlite3 - xfsprogs python-eventlet python-greenlet python-pastedeploy - python-netifaces python-pip` - #. `pip install mock` - #. Install anything else you want, like screen, ssh, vim, etc. +* On ``OpenSuse``:: -* On Fedora, log in as root and do: + sudo zypper install curl gcc memcached rsync sqlite3 xfsprogs git-core \ + libffi-devel liberasurecode-devel python3-setuptools \ + libopenssl-devel + sudo zypper install python3-coverage python3-devel python3-nose \ + python3-xattr python3-eventlet python3-greenlet \ + python3-pip python3-dnspython - #. `yum install openstack-swift openstack-swift-proxy - openstack-swift-account openstack-swift-container openstack-swift-object` - #. `yum install xinetd rsync` - #. `yum install memcached` - #. `yum install python-netifaces python-nose python-mock` +.. note:: + This installs necessary system dependencies and *most* of the python + dependencies. Later in the process setuptools/distribute or pip will install + and/or upgrade packages. - This installs all necessary dependencies, and also creates user `swift` - and group `swift`. So, `swift:swift` ought to be used in every place where - this manual calls for `:`. +------------------- +Configuring storage +------------------- - Ensure that you are installing the version of Swift that corresponds to - this document. If not, enable the correct update repositories. +Swift requires some space on XFS filesystems to store data and run tests. -Next, choose either :ref:`partition-section` or :ref:`loopback-section`. +Choose either :ref:`partition-section` or :ref:`loopback-section`. .. 
_partition-section: @@ -58,731 +117,685 @@ Using a partition for storage ============================= If you are going to use a separate partition for Swift data, be sure to add -another device when creating the VM, and follow these instructions. - - #. `fdisk /dev/sdb` (set up a single partition) - #. `mkfs.xfs -i size=1024 /dev/sdb1` - #. Edit `/etc/fstab` and add - `/dev/sdb1 /mnt/sdb1 xfs noatime,nodiratime,nobarrier,logbufs=8 0 0` - #. `mkdir /mnt/sdb1` - #. `mount /mnt/sdb1` - #. `mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4` - #. `chown : /mnt/sdb1/*` - #. `mkdir /srv` - #. `for x in {1..4}; do ln -s /mnt/sdb1/$x /srv/$x; done` - #. `mkdir -p /etc/swift/object-server /etc/swift/container-server /etc/swift/account-server /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4 /var/run/swift` - #. `chown -R : /etc/swift /srv/[1-4]/ /var/run/swift` -- **Make sure to include the trailing slash after /srv/[1-4]/** - #. Add to `/etc/rc.local` (before the `exit 0`):: +another device when creating the VM, and follow these instructions: - mkdir -p /var/cache/swift /var/cache/swift2 /var/cache/swift3 /var/cache/swift4 - chown : /var/cache/swift* - mkdir -p /var/run/swift - chown : /var/run/swift - #. Next, skip to :ref:`rsync-section`. +.. note:: + The disk does not have to be ``/dev/sdb1`` (for example, it could be + ``/dev/vdb1``) however the mount point should still be ``/mnt/sdb1``. + +#. Set up a single partition on the device (this will wipe the drive):: + + sudo parted /dev/sdb mklabel msdos mkpart p xfs 0% 100% + +#. Create an XFS file system on the partition:: + + sudo mkfs.xfs /dev/sdb1 + +#. Find the UUID of the new partition:: + + sudo blkid + +#. Edit ``/etc/fstab`` and add:: + + UUID="" /mnt/sdb1 xfs noatime 0 0 + +#. Create the Swift data mount point and test that mounting works:: + + sudo mkdir /mnt/sdb1 + sudo mount -a +#. Next, skip to :ref:`common-dev-section`. .. _loopback-section: Using a loopback device for storage =================================== -If you want to use a loopback device instead of another partition, follow these instructions. - - #. `mkdir /srv` - #. `dd if=/dev/zero of=/srv/swift-disk bs=1024 count=0 seek=1000000` - (modify seek to make a larger or smaller partition) - #. `mkfs.xfs -i size=1024 /srv/swift-disk` - #. Edit `/etc/fstab` and add - `/srv/swift-disk /mnt/sdb1 xfs loop,noatime,nodiratime,nobarrier,logbufs=8 0 0` - #. `mkdir /mnt/sdb1` - #. `mount /mnt/sdb1` - #. `mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4` - #. `chown : /mnt/sdb1/*` - #. `for x in {1..4}; do ln -s /mnt/sdb1/$x /srv/$x; done` - #. `mkdir -p /etc/swift/object-server /etc/swift/container-server /etc/swift/account-server /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4 /var/run/swift` - #. `chown -R : /etc/swift /srv/[1-4]/ /var/run/swift` -- **Make sure to include the trailing slash after /srv/[1-4]/** - #. Add to `/etc/rc.local` (before the `exit 0`):: +If you want to use a loopback device instead of another partition, follow +these instructions: + +#. Create the file for the loopback device:: + + sudo mkdir -p /srv + sudo truncate -s 1GB /srv/swift-disk + sudo mkfs.xfs /srv/swift-disk + + Modify size specified in the ``truncate`` command to make a larger or + smaller partition as needed. + +#. Edit `/etc/fstab` and add:: + + /srv/swift-disk /mnt/sdb1 xfs loop,noatime 0 0 + +#. Create the Swift data mount point and test that mounting works:: + + sudo mkdir /mnt/sdb1 + sudo mount -a + +.. 
_common-dev-section: + +Common Post-Device Setup +======================== + +#. Create the individualized data links:: + + sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4 + sudo chown ${USER}:${USER} /mnt/sdb1/* + for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done + sudo mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \ + /srv/2/node/sdb2 /srv/2/node/sdb6 \ + /srv/3/node/sdb3 /srv/3/node/sdb7 \ + /srv/4/node/sdb4 /srv/4/node/sdb8 + sudo mkdir -p /var/run/swift + sudo mkdir -p /var/cache/swift /var/cache/swift2 \ + /var/cache/swift3 /var/cache/swift4 + sudo chown -R ${USER}:${USER} /var/run/swift + sudo chown -R ${USER}:${USER} /var/cache/swift* + # **Make sure to include the trailing slash after /srv/$x/** + for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done + + .. note:: + We create the mount points and mount the loopback file under + /mnt/sdb1. This file will contain one directory per simulated Swift node, + each owned by the current Swift user. + + We then create symlinks to these directories under /srv. + If the disk sdb or loopback file is unmounted, files will not be written under + /srv/\*, because the symbolic link destination /mnt/sdb1/* will not + exist. This prevents disk sync operations from writing to the root + partition in the event a drive is unmounted. + +#. Restore appropriate permissions on reboot. + + * On traditional Linux systems, add the following lines to ``/etc/rc.local`` (before the ``exit 0``):: mkdir -p /var/cache/swift /var/cache/swift2 /var/cache/swift3 /var/cache/swift4 chown : /var/cache/swift* mkdir -p /var/run/swift chown : /var/run/swift -.. _rsync-section: + * On CentOS and Fedora we can use systemd (rc.local is deprecated):: ----------------- -Setting up rsync ----------------- + cat << EOF |sudo tee /etc/tmpfiles.d/swift.conf + d /var/cache/swift 0755 ${USER} ${USER} - - + d /var/cache/swift2 0755 ${USER} ${USER} - - + d /var/cache/swift3 0755 ${USER} ${USER} - - + d /var/cache/swift4 0755 ${USER} ${USER} - - + d /var/run/swift 0755 ${USER} ${USER} - - + EOF - #. 
Create /etc/rsyncd.conf:: - - uid = - gid = - log file = /var/log/rsyncd.log - pid file = /var/run/rsyncd.pid - address = 127.0.0.1 - - [account6012] - max connections = 25 - path = /srv/1/node/ - read only = false - lock file = /var/lock/account6012.lock - - [account6022] - max connections = 25 - path = /srv/2/node/ - read only = false - lock file = /var/lock/account6022.lock - - [account6032] - max connections = 25 - path = /srv/3/node/ - read only = false - lock file = /var/lock/account6032.lock - - [account6042] - max connections = 25 - path = /srv/4/node/ - read only = false - lock file = /var/lock/account6042.lock - - - [container6011] - max connections = 25 - path = /srv/1/node/ - read only = false - lock file = /var/lock/container6011.lock - - [container6021] - max connections = 25 - path = /srv/2/node/ - read only = false - lock file = /var/lock/container6021.lock - - [container6031] - max connections = 25 - path = /srv/3/node/ - read only = false - lock file = /var/lock/container6031.lock - - [container6041] - max connections = 25 - path = /srv/4/node/ - read only = false - lock file = /var/lock/container6041.lock - - - [object6010] - max connections = 25 - path = /srv/1/node/ - read only = false - lock file = /var/lock/object6010.lock - - [object6020] - max connections = 25 - path = /srv/2/node/ - read only = false - lock file = /var/lock/object6020.lock - - [object6030] - max connections = 25 - path = /srv/3/node/ - read only = false - lock file = /var/lock/object6030.lock - - [object6040] - max connections = 25 - path = /srv/4/node/ - read only = false - lock file = /var/lock/object6040.lock - - #. On Ubuntu, edit the following line in /etc/default/rsync:: - - RSYNC_ENABLE=true - - On Fedora, edit the following line in /etc/xinetd.d/rsync:: - - disable = no - - #. On Ubuntu `service rsync restart` + * On OpenSuse place the lines in ``/etc/init.d/boot.local``. ------------------- -Starting memcached ------------------- + .. note:: + On some systems the rc file might need to be an executable shell script. -On Fedora, make sure that memcached runs, running this if necessary: +Creating an XFS tmp dir +----------------------- - * `systemctl enable memcached.service` - * `systemctl start memcached.service` +Tests require having a directory available on an XFS filesystem. By default the +tests use ``/tmp``, however this can be pointed elsewhere with the ``TMPDIR`` +environment variable. -If this is not done, tokens of tempauth expire immediately and accessing -Swift with curl becomes impossible. +.. note:: + If your root filesystem is XFS, you can skip this section if ``/tmp`` is + just a directory and not a mounted tmpfs. Or you could simply point to any + existing directory owned by your user by specifying it with the ``TMPDIR`` + environment variable. ---------------------------------------------------- -Optional: Setting up rsyslog for individual logging ---------------------------------------------------- + If your root filesystem is not XFS, you should create a loopback device, + format it with XFS and mount it. You can mount it over ``/tmp`` or to + another location and specify it with the ``TMPDIR`` environment variable. - #. 
Create /etc/rsyslog.d/10-swift.conf:: +* Create the file for the tmp loopback device:: - # Uncomment the following to have a log containing all logs together - #local1,local2,local3,local4,local5.* /var/log/swift/all.log + sudo mkdir -p /srv + sudo truncate -s 1GB /srv/swift-tmp # create 1GB file for XFS in /srv + sudo mkfs.xfs /srv/swift-tmp - # Uncomment the following to have hourly proxy logs for stats processing - #$template HourlyProxyLog,"/var/log/swift/hourly/%$YEAR%%$MONTH%%$DAY%%$HOUR%" - #local1.*;local1.!notice ?HourlyProxyLog +* To mount the tmp loopback device at ``/tmp``, do the following:: - local1.*;local1.!notice /var/log/swift/proxy.log - local1.notice /var/log/swift/proxy.error - local1.* ~ + sudo mount -o loop,noatime /srv/swift-tmp /tmp + sudo chmod -R 1777 /tmp - local2.*;local2.!notice /var/log/swift/storage1.log - local2.notice /var/log/swift/storage1.error - local2.* ~ + * To persist this, edit and add the following to ``/etc/fstab``:: - local3.*;local3.!notice /var/log/swift/storage2.log - local3.notice /var/log/swift/storage2.error - local3.* ~ + /srv/swift-tmp /tmp xfs rw,noatime,attr2,inode64,noquota 0 0 - local4.*;local4.!notice /var/log/swift/storage3.log - local4.notice /var/log/swift/storage3.error - local4.* ~ +* To mount the tmp loopback at an alternate location (for example, ``/mnt/tmp``), + do the following:: - local5.*;local5.!notice /var/log/swift/storage4.log - local5.notice /var/log/swift/storage4.error - local5.* ~ + sudo mkdir -p /mnt/tmp + sudo mount -o loop,noatime /srv/swift-tmp /mnt/tmp + sudo chown ${USER}:${USER} /mnt/tmp - #. Edit /etc/rsyslog.conf and make the following change:: + * To persist this, edit and add the following to ``/etc/fstab``:: - $PrivDropToGroup adm + /srv/swift-tmp /mnt/tmp xfs rw,noatime,attr2,inode64,noquota 0 0 - #. `mkdir -p /var/log/swift/hourly` - #. `chown -R syslog.adm /var/log/swift` - #. `chmod -R g+w /var/log/swift` - #. `service rsyslog restart` + * Set your ``TMPDIR`` environment dir so that Swift looks in the right location:: ------------------------------------------------- -Getting the code and setting up test environment ------------------------------------------------- + export TMPDIR=/mnt/tmp + echo "export TMPDIR=/mnt/tmp" >> $HOME/.bashrc -Sample configuration files are provided with all defaults in line-by-line comments. +---------------- +Getting the code +---------------- -Do these commands as you on guest. +#. Check out the python-swiftclient repo:: - #. `mkdir ~/bin` - #. Check out the swift repo with `git clone https://github.com/openstack/swift.git` - #. Build a development installation of swift, for example: - `cd ~/swift; sudo python setup.py develop` - #. Check out the python-swiftclient repo with `git clone https://github.com/openstack/python-swiftclient.git` - #. Build a development installation of python-swiftclient, for example: - `cd ~/python-swiftclient; sudo python setup.py develop` - #. Edit `~/.bashrc` and add to the end:: + cd $HOME; git clone https://opendev.org/openstack/python-swiftclient.git - export SWIFT_TEST_CONFIG_FILE=/etc/swift/test.conf - export PATH=${PATH}:~/bin +#. Build a development installation of python-swiftclient:: - #. `. ~/.bashrc` + cd $HOME/python-swiftclient; sudo python3 setup.py develop; cd - ---------------------- -Configuring each node ---------------------- +#. Check out the Swift repo:: -Sample configuration files are provided with all defaults in line-by-line comments. + git clone https://github.com/openstack/swift.git - #. 
Create `/etc/swift/proxy-server.conf`:: +#. Build a development installation of Swift:: - [DEFAULT] - bind_port = 8080 - user = - log_facility = LOG_LOCAL1 - eventlet_debug = true + cd $HOME/swift; sudo pip install --no-binary cryptography -r requirements.txt; sudo python setup.py develop; cd - - [pipeline:main] - pipeline = healthcheck cache tempauth proxy-logging proxy-server + .. note:: + Due to a difference in how ``libssl.so`` is named in OpenSuse vs. other Linux distros the + wheel/binary won't work; thus we use ``--no-binary cryptography`` to build ``cryptography`` + locally. - [app:proxy-server] - use = egg:swift#proxy - allow_account_management = true - account_autocreate = true + Fedora users might have to perform the following if development + installation of Swift fails:: - [filter:tempauth] - use = egg:swift#tempauth - user_admin_admin = admin .admin .reseller_admin - user_test_tester = testing .admin - user_test2_tester2 = testing2 .admin - user_test_tester3 = testing3 + sudo pip install -U xattr - [filter:healthcheck] - use = egg:swift#healthcheck +#. Install Swift's test dependencies:: - [filter:cache] - use = egg:swift#memcache + cd $HOME/swift; sudo pip install -r test-requirements.txt - [filter:proxy-logging] - use = egg:swift#proxy_logging +---------------- +Setting up rsync +---------------- - #. Create `/etc/swift/swift.conf`: +#. Create ``/etc/rsyncd.conf``:: - .. code-block:: none + sudo cp $HOME/swift/doc/saio/rsyncd.conf /etc/ + sudo sed -i "s//${USER}/" /etc/rsyncd.conf - [swift-hash] - # random unique string that can never change (DO NOT LOSE) - swift_hash_path_suffix = changeme + Here is the default ``rsyncd.conf`` file contents maintained in the repo + that is copied and fixed up above: - #. Create `/etc/swift/account-server/1.conf`:: + .. literalinclude:: /../saio/rsyncd.conf + :language: ini - [DEFAULT] - devices = /srv/1/node - mount_check = false - disable_fallocate = true - bind_port = 6012 - user = - log_facility = LOG_LOCAL2 - recon_cache_path = /var/cache/swift - eventlet_debug = true +#. Enable rsync daemon - [pipeline:main] - pipeline = recon account-server + * On Ubuntu, edit the following line in ``/etc/default/rsync``:: - [app:account-server] - use = egg:swift#account + RSYNC_ENABLE=true - [filter:recon] - use = egg:swift#recon + .. note:: + You might have to create the file to perform the edits. - [account-replicator] - vm_test_mode = yes + * On CentOS and Fedora, enable the systemd service:: - [account-auditor] + sudo systemctl enable rsyncd - [account-reaper] + * On OpenSuse, nothing needs to happen here. - #. Create `/etc/swift/account-server/2.conf`:: - [DEFAULT] - devices = /srv/2/node - mount_check = false - disable_fallocate = true - bind_port = 6022 - user = - log_facility = LOG_LOCAL3 - recon_cache_path = /var/cache/swift2 - eventlet_debug = true +#. On platforms with SELinux in ``Enforcing`` mode, either set to ``Permissive``:: - [pipeline:main] - pipeline = recon account-server + sudo setenforce Permissive + sudo sed -i 's/^SELINUX=.*/SELINUX=permissive/g' /etc/selinux/config - [app:account-server] - use = egg:swift#account + Or just allow rsync full access:: - [filter:recon] - use = egg:swift#recon + sudo setsebool -P rsync_full_access 1 - [account-replicator] - vm_test_mode = yes +#. Start the rsync daemon - [account-auditor] + * On Ubuntu 14.04, run:: - [account-reaper] + sudo service rsync restart - #. 
Create `/etc/swift/account-server/3.conf`:: + * On Ubuntu 16.04, run:: - [DEFAULT] - devices = /srv/3/node - mount_check = false - disable_fallocate = true - bind_port = 6032 - user = - log_facility = LOG_LOCAL4 - recon_cache_path = /var/cache/swift3 - eventlet_debug = true + sudo systemctl enable rsync + sudo systemctl start rsync - [pipeline:main] - pipeline = recon account-server + * On CentOS, Fedora and OpenSuse, run:: - [app:account-server] - use = egg:swift#account + sudo systemctl start rsyncd - [filter:recon] - use = egg:swift#recon + * On other xinetd based systems simply run:: - [account-replicator] - vm_test_mode = yes + sudo service xinetd restart - [account-auditor] +#. Verify rsync is accepting connections for all servers:: - [account-reaper] + rsync rsync://pub@localhost/ - #. Create `/etc/swift/account-server/4.conf`:: + You should see the following output from the above command:: - [DEFAULT] - devices = /srv/4/node - mount_check = false - disable_fallocate = true - bind_port = 6042 - user = - log_facility = LOG_LOCAL5 - recon_cache_path = /var/cache/swift4 - eventlet_debug = true + account6212 + account6222 + account6232 + account6242 + container6211 + container6221 + container6231 + container6241 + object6210 + object6220 + object6230 + object6240 - [pipeline:main] - pipeline = recon account-server +------------------ +Starting memcached +------------------ - [app:account-server] - use = egg:swift#account +On non-Ubuntu distros you need to ensure memcached is running:: - [filter:recon] - use = egg:swift#recon + sudo service memcached start + sudo chkconfig memcached on - [account-replicator] - vm_test_mode = yes +or:: - [account-auditor] + sudo systemctl enable memcached + sudo systemctl start memcached - [account-reaper] +The tempauth middleware stores tokens in memcached. If memcached is not +running, tokens cannot be validated, and accessing Swift becomes impossible. - #. Create `/etc/swift/container-server/1.conf`:: +--------------------------------------------------- +Optional: Setting up rsyslog for individual logging +--------------------------------------------------- - [DEFAULT] - devices = /srv/1/node - mount_check = false - disable_fallocate = true - bind_port = 6011 - user = - log_facility = LOG_LOCAL2 - recon_cache_path = /var/cache/swift - eventlet_debug = true +Fedora and OpenSuse may not have rsyslog installed, in which case you will need +to install it if you want to use individual logging. - [pipeline:main] - pipeline = recon container-server +#. Install rsyslogd - [app:container-server] - use = egg:swift#container - [filter:recon] - use = egg:swift#recon + * On Fedora:: - [container-replicator] - vm_test_mode = yes + sudo dnf install rsyslog - [container-updater] + * On OpenSuse:: - [container-auditor] + sudo zypper install rsyslog - [container-sync] +#. Install the Swift rsyslogd configuration:: - #. Create `/etc/swift/container-server/2.conf`:: + sudo cp $HOME/swift/doc/saio/rsyslog.d/10-swift.conf /etc/rsyslog.d/ - [DEFAULT] - devices = /srv/2/node - mount_check = false - disable_fallocate = true - bind_port = 6021 - user = - log_facility = LOG_LOCAL3 - recon_cache_path = /var/cache/swift2 - eventlet_debug = true + Be sure to review that conf file to determine if you want all the logs + in one file vs. all the logs separated out, and if you want hourly logs + for stats processing. For convenience, we provide its default contents + below: - [pipeline:main] - pipeline = recon container-server + .. 
literalinclude:: /../saio/rsyslog.d/10-swift.conf + :language: ini - [app:container-server] - use = egg:swift#container +#. Edit ``/etc/rsyslog.conf`` and make the following change (usually in the + "GLOBAL DIRECTIVES" section):: - [filter:recon] - use = egg:swift#recon + $PrivDropToGroup adm - [container-replicator] - vm_test_mode = yes +#. If using hourly logs (see above) perform:: - [container-updater] + sudo mkdir -p /var/log/swift/hourly - [container-auditor] + Otherwise perform:: - [container-sync] + sudo mkdir -p /var/log/swift - #. Create `/etc/swift/container-server/3.conf`:: +#. Setup the logging directory and start syslog: - [DEFAULT] - devices = /srv/3/node - mount_check = false - disable_fallocate = true - bind_port = 6031 - user = - log_facility = LOG_LOCAL4 - recon_cache_path = /var/cache/swift3 - eventlet_debug = true + * On Ubuntu:: - [pipeline:main] - pipeline = recon container-server + sudo chown -R syslog.adm /var/log/swift + sudo chmod -R g+w /var/log/swift + sudo service rsyslog restart - [app:container-server] - use = egg:swift#container + * On CentOS, Fedora and OpenSuse:: - [filter:recon] - use = egg:swift#recon + sudo chown -R root:adm /var/log/swift + sudo chmod -R g+w /var/log/swift + sudo systemctl restart rsyslog + sudo systemctl enable rsyslog - [container-replicator] - vm_test_mode = yes +--------------------- +Configuring each node +--------------------- - [container-updater] +After performing the following steps, be sure to verify that Swift has access +to resulting configuration files (sample configuration files are provided with +all defaults in line-by-line comments). - [container-auditor] +#. Optionally remove an existing swift directory:: - [container-sync] + sudo rm -rf /etc/swift - #. Create `/etc/swift/container-server/4.conf`:: +#. Populate the ``/etc/swift`` directory itself:: - [DEFAULT] - devices = /srv/4/node - mount_check = false - disable_fallocate = true - bind_port = 6041 - user = - log_facility = LOG_LOCAL5 - recon_cache_path = /var/cache/swift4 - eventlet_debug = true + cd $HOME/swift/doc; sudo cp -r saio/swift /etc/swift; cd - + sudo chown -R ${USER}:${USER} /etc/swift - [pipeline:main] - pipeline = recon container-server +#. Update ```` references in the Swift config files:: - [app:container-server] - use = egg:swift#container + find /etc/swift/ -name \*.conf | xargs sudo sed -i "s//${USER}/" - [filter:recon] - use = egg:swift#recon +The contents of the configuration files provided by executing the above +commands are as follows: - [container-replicator] - vm_test_mode = yes +#. ``/etc/swift/swift.conf`` - [container-updater] + .. literalinclude:: /../saio/swift/swift.conf + :language: ini - [container-auditor] +#. ``/etc/swift/proxy-server.conf`` - [container-sync] + .. literalinclude:: /../saio/swift/proxy-server.conf + :language: ini +#. ``/etc/swift/object-expirer.conf`` - #. Create `/etc/swift/object-server/1.conf`:: + .. literalinclude:: /../saio/swift/object-expirer.conf + :language: ini - [DEFAULT] - devices = /srv/1/node - mount_check = false - disable_fallocate = true - bind_port = 6010 - user = - log_facility = LOG_LOCAL2 - recon_cache_path = /var/cache/swift - eventlet_debug = true +#. ``/etc/swift/container-sync-realms.conf`` - [pipeline:main] - pipeline = recon object-server + .. literalinclude:: /../saio/swift/container-sync-realms.conf + :language: ini - [app:object-server] - use = egg:swift#object +#. ``/etc/swift/account-server/1.conf`` - [filter:recon] - use = egg:swift#recon + .. 
literalinclude:: /../saio/swift/account-server/1.conf + :language: ini - [object-replicator] - vm_test_mode = yes +#. ``/etc/swift/container-server/1.conf`` - [object-updater] + .. literalinclude:: /../saio/swift/container-server/1.conf + :language: ini - [object-auditor] +#. ``/etc/swift/container-reconciler/1.conf`` - #. Create `/etc/swift/object-server/2.conf`:: + .. literalinclude:: /../saio/swift/container-reconciler/1.conf + :language: ini - [DEFAULT] - devices = /srv/2/node - mount_check = false - disable_fallocate = true - bind_port = 6020 - user = - log_facility = LOG_LOCAL3 - recon_cache_path = /var/cache/swift2 - eventlet_debug = true +#. ``/etc/swift/object-server/1.conf`` - [pipeline:main] - pipeline = recon object-server + .. literalinclude:: /../saio/swift/object-server/1.conf + :language: ini - [app:object-server] - use = egg:swift#object +#. ``/etc/swift/account-server/2.conf`` - [filter:recon] - use = egg:swift#recon + .. literalinclude:: /../saio/swift/account-server/2.conf + :language: ini - [object-replicator] - vm_test_mode = yes +#. ``/etc/swift/container-server/2.conf`` - [object-updater] + .. literalinclude:: /../saio/swift/container-server/2.conf + :language: ini - [object-auditor] +#. ``/etc/swift/container-reconciler/2.conf`` - #. Create `/etc/swift/object-server/3.conf`:: + .. literalinclude:: /../saio/swift/container-reconciler/2.conf + :language: ini - [DEFAULT] - devices = /srv/3/node - mount_check = false - disable_fallocate = true - bind_port = 6030 - user = - log_facility = LOG_LOCAL4 - recon_cache_path = /var/cache/swift3 - eventlet_debug = true +#. ``/etc/swift/object-server/2.conf`` - [pipeline:main] - pipeline = recon object-server + .. literalinclude:: /../saio/swift/object-server/2.conf + :language: ini - [app:object-server] - use = egg:swift#object +#. ``/etc/swift/account-server/3.conf`` - [filter:recon] - use = egg:swift#recon + .. literalinclude:: /../saio/swift/account-server/3.conf + :language: ini - [object-replicator] - vm_test_mode = yes +#. ``/etc/swift/container-server/3.conf`` - [object-updater] + .. literalinclude:: /../saio/swift/container-server/3.conf + :language: ini - [object-auditor] +#. ``/etc/swift/container-reconciler/3.conf`` - #. Create `/etc/swift/object-server/4.conf`:: + .. literalinclude:: /../saio/swift/container-reconciler/3.conf + :language: ini - [DEFAULT] - devices = /srv/4/node - mount_check = false - disable_fallocate = true - bind_port = 6040 - user = - log_facility = LOG_LOCAL5 - recon_cache_path = /var/cache/swift4 - eventlet_debug = true +#. ``/etc/swift/object-server/3.conf`` - [pipeline:main] - pipeline = recon object-server + .. literalinclude:: /../saio/swift/object-server/3.conf + :language: ini - [app:object-server] - use = egg:swift#object +#. ``/etc/swift/account-server/4.conf`` - [filter:recon] - use = egg:swift#recon + .. literalinclude:: /../saio/swift/account-server/4.conf + :language: ini - [object-replicator] - vm_test_mode = yes +#. ``/etc/swift/container-server/4.conf`` - [object-updater] + .. literalinclude:: /../saio/swift/container-server/4.conf + :language: ini - [object-auditor] +#. ``/etc/swift/container-reconciler/4.conf`` + + .. literalinclude:: /../saio/swift/container-reconciler/4.conf + :language: ini + +#. ``/etc/swift/object-server/4.conf`` + + .. literalinclude:: /../saio/swift/object-server/4.conf + :language: ini + +.. _setup_scripts: ------------------------------------ Setting up scripts for running Swift ------------------------------------ - #. Create `~/bin/resetswift.` +#. 
Copy the SAIO scripts for resetting the environment:: + + mkdir -p $HOME/bin + cd $HOME/swift/doc; cp saio/bin/* $HOME/bin; cd - + chmod +x $HOME/bin/* + +#. Edit the ``$HOME/bin/resetswift`` script + + The template ``resetswift`` script looks like the following: + + .. literalinclude:: /../saio/bin/resetswift + :language: bash + + If you did not set up rsyslog for individual logging, remove the ``find + /var/log/swift...`` line:: + + sed -i "/find \/var\/log\/swift/d" $HOME/bin/resetswift + + +#. Install the sample configuration file for running tests:: + + cp $HOME/swift/test/sample.conf /etc/swift/test.conf + + The template ``test.conf`` looks like the following: + + .. literalinclude:: /../../test/sample.conf + :language: ini + +----------------------------------------- +Configure environment variables for Swift +----------------------------------------- + +#. Add an environment variable for running tests below:: + + echo "export SWIFT_TEST_CONFIG_FILE=/etc/swift/test.conf" >> $HOME/.bashrc + +#. Be sure that your ``PATH`` includes the ``bin`` directory:: + + echo "export PATH=${PATH}:$HOME/bin" >> $HOME/.bashrc + +#. If you are using a loopback device for Swift Storage, add an environment var + to substitute ``/dev/sdb1`` with ``/srv/swift-disk``:: + + echo "export SAIO_BLOCK_DEVICE=/srv/swift-disk" >> $HOME/.bashrc + +#. If you are using a device other than ``/dev/sdb1`` for Swift storage (for + example, ``/dev/vdb1``), add an environment var to substitute it:: + + echo "export SAIO_BLOCK_DEVICE=/dev/vdb1" >> $HOME/.bashrc + +#. If you are using a location other than ``/tmp`` for Swift tmp data (for + example, ``/mnt/tmp``), add ``TMPDIR`` environment var to set it:: + + export TMPDIR=/mnt/tmp + echo "export TMPDIR=/mnt/tmp" >> $HOME/.bashrc + +#. Source the above environment variables into your current environment:: + + . $HOME/.bashrc + +-------------------------- +Constructing initial rings +-------------------------- + +#. Construct the initial rings using the provided script:: + + remakerings + + The ``remakerings`` script looks like the following: + + .. literalinclude:: /../saio/bin/remakerings + :language: bash + + You can expect the output from this command to produce the following. Note + that 3 object rings are created in order to test storage policies and EC in + the SAIO environment. The EC ring is the only one with all 8 devices. + There are also two replication rings, one for 3x replication and another + for 2x replication, but those rings only use 4 devices: + + + .. code-block:: console + + Device d0r1z1-127.0.0.1:6210R127.0.0.1:6210/sdb1_"" with 1.0 weight got id 0 + Device d1r1z2-127.0.0.2:6220R127.0.0.2:6220/sdb2_"" with 1.0 weight got id 1 + Device d2r1z3-127.0.0.3:6230R127.0.0.3:6230/sdb3_"" with 1.0 weight got id 2 + Device d3r1z4-127.0.0.4:6240R127.0.0.4:6240/sdb4_"" with 1.0 weight got id 3 + Reassigned 3072 (300.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + Device d0r1z1-127.0.0.1:6210R127.0.0.1:6210/sdb1_"" with 1.0 weight got id 0 + Device d1r1z2-127.0.0.2:6220R127.0.0.2:6220/sdb2_"" with 1.0 weight got id 1 + Device d2r1z3-127.0.0.3:6230R127.0.0.3:6230/sdb3_"" with 1.0 weight got id 2 + Device d3r1z4-127.0.0.4:6240R127.0.0.4:6240/sdb4_"" with 1.0 weight got id 3 + Reassigned 2048 (200.00%) partitions. Balance is now 0.00. 
Dispersion is now 0.00 + Device d0r1z1-127.0.0.1:6210R127.0.0.1:6210/sdb1_"" with 1.0 weight got id 0 + Device d1r1z1-127.0.0.1:6210R127.0.0.1:6210/sdb5_"" with 1.0 weight got id 1 + Device d2r1z2-127.0.0.2:6220R127.0.0.2:6220/sdb2_"" with 1.0 weight got id 2 + Device d3r1z2-127.0.0.2:6220R127.0.0.2:6220/sdb6_"" with 1.0 weight got id 3 + Device d4r1z3-127.0.0.3:6230R127.0.0.3:6230/sdb3_"" with 1.0 weight got id 4 + Device d5r1z3-127.0.0.3:6230R127.0.0.3:6230/sdb7_"" with 1.0 weight got id 5 + Device d6r1z4-127.0.0.4:6240R127.0.0.4:6240/sdb4_"" with 1.0 weight got id 6 + Device d7r1z4-127.0.0.4:6240R127.0.0.4:6240/sdb8_"" with 1.0 weight got id 7 + Reassigned 6144 (600.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + Device d0r1z1-127.0.0.1:6211R127.0.0.1:6211/sdb1_"" with 1.0 weight got id 0 + Device d1r1z2-127.0.0.2:6221R127.0.0.2:6221/sdb2_"" with 1.0 weight got id 1 + Device d2r1z3-127.0.0.3:6231R127.0.0.3:6231/sdb3_"" with 1.0 weight got id 2 + Device d3r1z4-127.0.0.4:6241R127.0.0.4:6241/sdb4_"" with 1.0 weight got id 3 + Reassigned 3072 (300.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + Device d0r1z1-127.0.0.1:6212R127.0.0.1:6212/sdb1_"" with 1.0 weight got id 0 + Device d1r1z2-127.0.0.2:6222R127.0.0.2:6222/sdb2_"" with 1.0 weight got id 1 + Device d2r1z3-127.0.0.3:6232R127.0.0.3:6232/sdb3_"" with 1.0 weight got id 2 + Device d3r1z4-127.0.0.4:6242R127.0.0.4:6242/sdb4_"" with 1.0 weight got id 3 + Reassigned 3072 (300.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + + +#. Read more about Storage Policies and your SAIO :doc:`policies_saio` + +------------- +Testing Swift +------------- - If you are using a loopback device substitute `/dev/sdb1` with `/srv/swift-disk`. +#. Verify the unit tests run:: - If you did not set up rsyslog for individual logging, remove the `find /var/log/swift...` line:: + $HOME/swift/.unittests - #!/bin/bash + Note that the unit tests do not require any Swift daemons running. - swift-init all stop - find /var/log/swift -type f -exec rm -f {} \; - sudo umount /mnt/sdb1 - sudo mkfs.xfs -f -i size=1024 /dev/sdb1 - sudo mount /mnt/sdb1 - sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4 - sudo chown : /mnt/sdb1/* - mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4 - sudo rm -f /var/log/debug /var/log/messages /var/log/rsyncd.log /var/log/syslog - find /var/cache/swift* -type f -name *.recon -exec rm -f {} \; - sudo service rsyslog restart - sudo service memcached restart +#. Start the "main" Swift daemon processes (proxy, account, container, and + object):: - #. Create `~/bin/remakerings`:: + startmain - #!/bin/bash + (The "``Unable to increase file descriptor limit. Running as non-root?``" + warnings are expected and ok.) - cd /etc/swift + The ``startmain`` script looks like the following: - rm -f *.builder *.ring.gz backups/*.builder backups/*.ring.gz + .. 
literalinclude:: /../saio/bin/startmain + :language: bash - swift-ring-builder object.builder create 18 3 1 - swift-ring-builder object.builder add z1-127.0.0.1:6010/sdb1 1 - swift-ring-builder object.builder add z2-127.0.0.1:6020/sdb2 1 - swift-ring-builder object.builder add z3-127.0.0.1:6030/sdb3 1 - swift-ring-builder object.builder add z4-127.0.0.1:6040/sdb4 1 - swift-ring-builder object.builder rebalance - swift-ring-builder container.builder create 18 3 1 - swift-ring-builder container.builder add z1-127.0.0.1:6011/sdb1 1 - swift-ring-builder container.builder add z2-127.0.0.1:6021/sdb2 1 - swift-ring-builder container.builder add z3-127.0.0.1:6031/sdb3 1 - swift-ring-builder container.builder add z4-127.0.0.1:6041/sdb4 1 - swift-ring-builder container.builder rebalance - swift-ring-builder account.builder create 18 3 1 - swift-ring-builder account.builder add z1-127.0.0.1:6012/sdb1 1 - swift-ring-builder account.builder add z2-127.0.0.1:6022/sdb2 1 - swift-ring-builder account.builder add z3-127.0.0.1:6032/sdb3 1 - swift-ring-builder account.builder add z4-127.0.0.1:6042/sdb4 1 - swift-ring-builder account.builder rebalance +#. Get an ``X-Storage-Url`` and ``X-Auth-Token``:: - #. Create `~/bin/startmain`:: + curl -v -H 'X-Storage-User: test:tester' -H 'X-Storage-Pass: testing' http://127.0.0.1:8080/auth/v1.0 - #!/bin/bash +#. Check that you can ``GET`` account:: - swift-init main start + curl -v -H 'X-Auth-Token: ' - #. Create `~/bin/startrest`:: +#. Check that the ``swift`` command provided by python-swiftclient works:: - #!/bin/bash + swift -A http://127.0.0.1:8080/auth/v1.0 -U test:tester -K testing stat - swift-init rest start +#. Verify the functional tests run:: - #. `chmod +x ~/bin/*` - #. `remakerings` - #. `cd ~/swift; ./.unittests` - #. `startmain` (The ``Unable to increase file descriptor limit. Running as non-root?`` warnings are expected and ok.) - #. Get an `X-Storage-Url` and `X-Auth-Token`: ``curl -v -H 'X-Storage-User: test:tester' -H 'X-Storage-Pass: testing' http://127.0.0.1:8080/auth/v1.0`` - #. Check that you can GET account: ``curl -v -H 'X-Auth-Token: ' `` - #. Check that `swift` works: `swift -A http://127.0.0.1:8080/auth/v1.0 -U test:tester -K testing stat` - #. `cp ~/swift/test/sample.conf /etc/swift/test.conf` - #. `cd ~/swift; ./.functests` (Note: functional tests will first delete - everything in the configured accounts.) - #. `cd ~/swift; ./.probetests` (Note: probe tests will reset your - environment as they call `resetswift` for each test.) + $HOME/swift/.functests -If you plan to work on documentation (and who doesn't?!) you must -install Sphinx and then you can build the documentation: + (Note: functional tests will first delete everything in the configured + accounts.) -On Ubuntu: - #. `sudo apt-get install python-sphinx` - #. `python setup.py build_sphinx` +#. Verify the probe tests run:: -On MacOS: - #. `sudo easy_install -U sphinx` - #. `python setup.py build_sphinx` + $HOME/swift/.probetests -Install tox so you find Py26 and PEP8 problems before Jenkins does: - #. `sudo apt-get install python2.6-dev python-pip` - #. `sudo pip install tox` + (Note: probe tests will reset your environment as they call ``resetswift`` + for each test.) 
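If you would rather not copy the token and storage URL by hand while experimenting, the two ``curl`` steps above can be combined in a small shell snippet. This is only a convenience sketch: it assumes the default ``test:tester`` / ``testing`` tempauth user configured earlier in this guide and a proxy listening on ``http://127.0.0.1:8080``::

    # Authenticate once and keep the headers tempauth returns
    auth=$(curl -si -H 'X-Storage-User: test:tester' \
                    -H 'X-Storage-Pass: testing' \
                    http://127.0.0.1:8080/auth/v1.0)

    # Pull X-Storage-Url and X-Auth-Token out of the response headers
    url=$(echo "$auth" | tr -d '\r' | awk 'tolower($1) == "x-storage-url:" {print $2}')
    token=$(echo "$auth" | tr -d '\r' | awk 'tolower($1) == "x-auth-token:" {print $2}')

    # GET the account using the captured values
    curl -v -H "X-Auth-Token: $token" "$url"

The account ``GET`` should return the same container listing that the ``swift`` CLI reports, which is a quick way to confirm that the raw API and the client are talking to the same cluster.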
---------------- Debugging Issues ---------------- -If all doesn't go as planned, and tests fail, or you can't auth, or something doesn't work, here are some good starting places to look for issues: +If all doesn't go as planned, and tests fail, or you can't auth, or something +doesn't work, here are some good starting places to look for issues: -#. Everything is logged in /var/log/syslog, so that is a good first place to - look for errors (most likely python tracebacks). +#. Everything is logged using system facilities -- usually in ``/var/log/syslog``, + but possibly in ``/var/log/messages`` on e.g. Fedora -- so that is a good first + place to look for errors (most likely python tracebacks). #. Make sure all of the server processes are running. For the base functionality, the Proxy, Account, Container, and Object servers should be running. #. If one of the servers are not running, and no errors are logged to syslog, it may be useful to try to start the server manually, for example: - `swift-object-server /etc/swift/object-server/1.conf` will start the + ``swift-object-server /etc/swift/object-server/1.conf`` will start the object server. If there are problems not showing up in syslog, then you will likely see the traceback on startup. #. If you need to, you can turn off syslog for unit tests. This can be - useful for environments where /dev/log is unavailable, or which + useful for environments where ``/dev/log`` is unavailable, or which cannot rate limit (unit tests generate a lot of logs very quickly). - Open the file SWIFT_TEST_CONFIG_FILE points to, and change the - value of fake_syslog to True. + Open the file ``SWIFT_TEST_CONFIG_FILE`` points to, and change the + value of ``fake_syslog`` to ``True``. +#. If you encounter a ``401 Unauthorized`` when following Step 12 where + you check that you can ``GET`` account, use ``sudo service memcached status`` + and check if memcache is running. If memcache is not running, start it using + ``sudo service memcached start``. Once memcache is running, rerun ``GET`` account. + +------------ +Known Issues +------------ + +Listed here are some "gotcha's" that you may run into when using or testing your SAIO: + +#. fallocate_reserve - in most cases a SAIO doesn't have a very large XFS partition + so having fallocate enabled and fallocate_reserve set can cause issues, specifically + when trying to run the functional tests. For this reason fallocate has been turned + off on the object-servers in the SAIO. If you want to play with the fallocate_reserve + settings then know that functional tests will fail unless you change the max_file_size + constraint to something more reasonable then the default (5G). Ideally you'd make + it 1/4 of your XFS file system size so the tests can pass. diff --git a/doc/source/development_watchers.rst b/doc/source/development_watchers.rst new file mode 100644 index 0000000000..6aee254394 --- /dev/null +++ b/doc/source/development_watchers.rst @@ -0,0 +1,112 @@ +================ +Auditor Watchers +================ + +-------- +Overview +-------- + +The duty of auditors is to guard Swift against corruption in the +storage media. But because auditors crawl all objects, they can be +used to program Swift to operate on every object. It is done through +an API known as "watcher". + +Watchers do not have any private view into the cluster. +An operator can write a standalone program that walks the +directories and performs any desired inspection or maintenance. 
+What watcher brings to the table is a framework to do the same +job easily, under resource restrictions already in place +for the auditor. + +Operations performed by watchers are often site-specific, or else +they would be incorporated into Swift already. However, the code in +the tree provides a reference implementation for convenience. +It is located in swift/obj/watchers/dark_data.py and implements +so-called "Dark Data Watcher". + +Currently, only object auditor supports the watchers. + +------------- +The API class +------------- + +The implementation of a watcher is a Python class that may look like this:: + + class MyWatcher(object): + + def __init__(self, conf, logger, **kwargs): + pass + + def start(self, audit_type, **kwargs): + pass + + def see_object(self, object_metadata, policy_index, partition, + data_file_path, **kwargs): + pass + + def end(self, **kwargs): + pass + +Arguments to watcher methods are passed as keyword arguments, +and methods are expected to consume new, unknown arguments. + +The method __init__() is used to save configuration and logger +at the start of the plug-in. + +The method start() is invoked when auditor starts a pass. +It usually resets counters. The argument `auditor_type` is string of +`"ALL"` or `"ZBF"`, according to the type of the auditor running +the watcher. Watchers that talk to the network tend to hang off the +ALL-type auditor, the lightweight ones are okay with the ZBF-type. + +The method end() is the closing bracket for start(). It is typically +used to log something, or dump some statistics. + +The method see_object() is called when auditor completed an audit +of an object. This is where most of the work is done. + +The protocol for see_object() allows it to raise a special exception, +QuarantienRequested. Auditor catches it and quarantines the object. +In general, it's okay for watcher methods to throw exceptions, so +an author of a watcher plugin does not have to catch them explicitly +with a try:; they can be just permitted to bubble up naturally. + +------------------- +Loading the plugins +------------------- + +Swift auditor loads watcher classes from eggs, so it is necessary +to wrap the class and provide it an entry point:: + + $ cat /usr/lib/python3.8/site-p*/mywatcher*egg-info/entry_points.txt + [mywatcher.mysection] + mywatcherentry = mywatcher:MyWatcher + +Operator tells Swift auditor what plugins to load by adding them +to object-server.conf in the section [object-auditor]. It is also +possible to pass parameters, arriving in the argument conf{} of +method start():: + + [object-auditor] + watchers = mywatcher#mywatcherentry,swift#dark_data + + [object-auditor:watcher:mywatcher#mywatcherentry] + myparam=testing2020 + +Do not forget to remove the watcher from auditors when done. +Although the API itself is very lightweight, it is common for watchers +to incur a significant performance penalty: they can talk to networked +services or access additional objects. + +----------------- +Dark Data Watcher +----------------- + +The watcher API is assumed to be under development. Operators who +need extensions are welcome to report any needs for more arguments +to see_object(). + +The :ref:`dark_data` watcher has been provided as an example. If an +operator wants to create their own watcher, start by copying +the provided example template ``swift/obj/watchers/dark_data.py`` and see +if it is sufficient. 
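On a SAIO-style install it is easy to exercise the watcher hooks without packaging an egg of your own, because Swift already ships the ``swift#dark_data`` entry point used in the configuration example above. A minimal sketch, assuming the per-node object server configuration files described elsewhere in these docs: add the ``watchers`` line to the existing ``[object-auditor]`` section of each ``/etc/swift/object-server/*.conf``::

    [object-auditor]
    watchers = swift#dark_data

then trigger a single audit pass and look for the watcher's output in the object auditor log::

    swift-init object-auditor once

Removing the ``watchers`` line afterwards returns the auditor to its normal, watcher-free behaviour.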
diff --git a/doc/source/first_contribution_swift.rst b/doc/source/first_contribution_swift.rst new file mode 100644 index 0000000000..8d986c3187 --- /dev/null +++ b/doc/source/first_contribution_swift.rst @@ -0,0 +1,208 @@ +=========================== +First Contribution to Swift +=========================== + +------------- +Getting Swift +------------- + +.. highlight: none + +Swift's source code is hosted on github and managed with git. The current +trunk can be checked out like this:: + + git clone https://github.com/openstack/swift.git + +This will clone the Swift repository under your account. + +A source tarball for the latest release of Swift is available on the +`launchpad project page `_. + +Prebuilt packages for Ubuntu and RHEL variants are available. + +* `Swift Ubuntu Packages `_ +* `Swift RDO Packages `_ + +-------------------- +Source Control Setup +-------------------- + +Swift uses ``git`` for source control. The OpenStack +`Developer's Guide `_ +describes the steps for setting up Git and all the necessary accounts for +contributing code to Swift. + +---------------- +Changes to Swift +---------------- + +Once you have the source code and source control set up, you can make your +changes to Swift. + +------- +Testing +------- + +The :doc:`Development Guidelines ` describe the testing +requirements before submitting Swift code. + +In summary, you can execute tox from the swift home directory (where you +checked out the source code):: + + tox + +Tox will present tests results. Notice that in the beginning, it is very common +to break many coding style guidelines. + +-------------------------- +Proposing changes to Swift +-------------------------- + +The OpenStack +`Developer's Guide `_ +describes the most common ``git`` commands that you will need. + +Following is a list of the commands that you need to know for your first +contribution to Swift: + +To clone a copy of Swift:: + + git clone https://github.com/openstack/swift.git + +Under the swift directory, set up the Gerrit repository. The following command +configures the repository to know about Gerrit and installs the ``Change-Id`` +commit hook. You only need to do this once:: + + git review -s + +To create your development branch (substitute branch_name for a name of your +choice:: + + git checkout -b + +To check the files that have been updated in your branch:: + + git status + +To check the differences between your branch and the repository:: + + git diff + +Assuming you have not added new files, you commit all your changes using:: + + git commit -a + +Read the `Summary of Git commit message structure `_ +for best practices on writing the commit message. When you are ready to send +your changes for review use:: + + git review + +If successful, Git response message will contain a URL you can use to track your +changes. + +If you need to make further changes to the same review, you can commit them +using:: + + git commit -a --amend + +This will commit the changes under the same set of changes you issued earlier. +Notice that in order to send your latest version for review, you will still +need to call:: + + git review + +--------------------- +Tracking your changes +--------------------- + +After proposing changes to Swift, you can track them at +https://review.opendev.org. After logging in, you will see a dashboard of +"Outgoing reviews" for changes you have proposed, "Incoming reviews" for +changes you are reviewing, and "Recently closed" changes for which you were +either a reviewer or owner. + +.. 
_post-rebase-instructions: + +------------------------ +Post rebase instructions +------------------------ + +After rebasing, the following steps should be performed to rebuild the swift +installation. Note that these commands should be performed from the root of the +swift repo directory (e.g. ``$HOME/swift/``):: + + sudo python setup.py develop + sudo pip install -r test-requirements.txt + +If using TOX, depending on the changes made during the rebase, you may need to +rebuild the TOX environment (generally this will be the case if +test-requirements.txt was updated such that a new version of a package is +required), this can be accomplished using the ``-r`` argument to the TOX cli:: + + tox -r + +You can include any of the other TOX arguments as well, for example, to run the +pep8 suite and rebuild the TOX environment the following can be used:: + + tox -r -e pep8 + +The rebuild option only needs to be specified once for a particular build (e.g. +pep8), that is further invocations of the same build will not require this +until the next rebase. + +--------------- +Troubleshooting +--------------- + +You may run into the following errors when starting Swift if you rebase +your commit using:: + + git rebase + +.. code-block:: python + + Traceback (most recent call last): + File "/usr/local/bin/swift-init", line 5, in + from pkg_resources import require + File "/usr/lib/python2.7/dist-packages/pkg_resources.py", line 2749, in + working_set = WorkingSet._build_master() + File "/usr/lib/python2.7/dist-packages/pkg_resources.py", line 446, in _build_master + return cls._build_from_requirements(__requires__) + File "/usr/lib/python2.7/dist-packages/pkg_resources.py", line 459, in _build_from_requirements + dists = ws.resolve(reqs, Environment()) + File "/usr/lib/python2.7/dist-packages/pkg_resources.py", line 628, in resolve + raise DistributionNotFound(req) + pkg_resources.DistributionNotFound: swift==2.3.1.devXXX + +(where XXX represents a dev version of Swift). + +.. 
code-block:: python + + Traceback (most recent call last): + File "/usr/local/bin/swift-proxy-server", line 10, in + execfile(__file__) + File "/home/swift/swift/bin/swift-proxy-server", line 23, in + sys.exit(run_wsgi(conf_file, 'proxy-server', **options)) + File "/home/swift/swift/swift/common/wsgi.py", line 888, in run_wsgi + loadapp(conf_path, global_conf=global_conf) + File "/home/swift/swift/swift/common/wsgi.py", line 390, in loadapp + func(PipelineWrapper(ctx)) + File "/home/swift/swift/swift/proxy/server.py", line 602, in modify_wsgi_pipeline + ctx = pipe.create_filter(filter_name) + File "/home/swift/swift/swift/common/wsgi.py", line 329, in create_filter + global_conf=self.context.global_conf) + File "/usr/lib/python2.7/dist-packages/paste/deploy/loadwsgi.py", line 296, in loadcontext + global_conf=global_conf) + File "/usr/lib/python2.7/dist-packages/paste/deploy/loadwsgi.py", line 328, in _loadegg + return loader.get_context(object_type, name, global_conf) + File "/usr/lib/python2.7/dist-packages/paste/deploy/loadwsgi.py", line 620, in get_context + object_type, name=name) + File "/usr/lib/python2.7/dist-packages/paste/deploy/loadwsgi.py", line 659, in find_egg_entry_point + for prot in protocol_options] or '(no entry points)')))) + LookupError: Entry point 'versioned_writes' not found in egg 'swift' (dir: /home/swift/swift; protocols: paste.filter_factory, paste.filter_app_factory; entry_points: ) + +This happens because ``git rebase`` will retrieve code for a different version +of Swift in the development stream, but the start scripts under +``/usr/local/bin`` have not been updated. The solution is to follow the steps +described in the :ref:`post-rebase-instructions` section. diff --git a/doc/source/getting_started.rst b/doc/source/getting_started.rst index dc6ee973d2..790ccd82dd 100644 --- a/doc/source/getting_started.rst +++ b/doc/source/getting_started.rst @@ -6,36 +6,17 @@ Getting Started System Requirements ------------------- -Swift development currently targets Ubuntu Server 10.04, but should work on -most Linux platforms with the following software: +Swift development currently targets Ubuntu Server 22.04, but should work on +most Linux platforms. -* Python 2.6 -* rsync 3.0 +Swift is written in Python and has these dependencies: -And the following python libraries: - -* Eventlet 0.9.8 -* Setuptools -* Simplejson -* Xattr -* Nose -* Sphinx -* netifaces - -------------- -Getting Swift -------------- - -Swift's source code is hosted on github and managed with git. The current trunk can be checked out like this: - - ``git clone https://github.com/openstack/swift.git`` - -A source tarball for the latest release of Swift is available on the `launchpad project page `_. - -Prebuilt packages for Ubuntu are available starting with Natty, or from PPAs for earlier releases. - -* `Swift Ubuntu Packages `_ -* `Swift PPA Archive `_ +* Python (3.6-3.12) +* rsync 3.x +* `liberasurecode `__ +* The Python packages listed in `the requirements file `__ +* Testing additionally requires `the test dependencies `__ +* Testing requires `these distribution packages `__ ----------- Development @@ -46,11 +27,24 @@ following docs will be useful: * :doc:`Swift All in One ` - Set up a VM with Swift installed * :doc:`Development Guidelines ` +* :doc:`First Contribution to Swift ` +* :doc:`Associated Projects ` + +-------------------------- +CLI client and SDK library +-------------------------- + +There are many clients in the :ref:`ecosystem `. The official CLI +and SDK is python-swiftclient. 
+ +* `Source code `__ +* `Python Package Index `__ ---------- Production ---------- -If you want to set up and configure Swift for a production cluster, the following doc should be useful: +If you want to set up and configure Swift for a production cluster, the +following doc should be useful: -* :doc:`Multiple Server Swift Installation ` +* :doc:`install/index` diff --git a/doc/source/howto_installmultinode.rst b/doc/source/howto_installmultinode.rst deleted file mode 100644 index 61d208191d..0000000000 --- a/doc/source/howto_installmultinode.rst +++ /dev/null @@ -1,443 +0,0 @@ -============================================================== -Instructions for a Multiple Server Swift Installation (Ubuntu) -============================================================== - -Prerequisites -------------- -* Ubuntu Server 10.04 LTS installation media - -.. note: - Swift can run with other distros, but for this document we will focus - on installing on Ubuntu Server, ypmv (your packaging may vary). - -Basic architecture and terms ----------------------------- -- *node* - a host machine running one or more Swift services -- *Proxy node* - node that runs Proxy services; also runs TempAuth -- *Storage node* - node that runs Account, Container, and Object services -- *ring* - a set of mappings of Swift data to physical devices - -This document shows a cluster using the following types of nodes: - -- one Proxy node - - - Runs the swift-proxy-server processes which proxy requests to the - appropriate Storage nodes. The proxy server will also contain - the TempAuth service as WSGI middleware. - -- five Storage nodes - - - Runs the swift-account-server, swift-container-server, and - swift-object-server processes which control storage of the account - databases, the container databases, as well as the actual stored - objects. - -.. note:: - Fewer Storage nodes can be used initially, but a minimum of 5 is - recommended for a production cluster. - -This document describes each Storage node as a separate zone in the ring. -It is recommended to have a minimum of 5 zones. A zone is a group of nodes -that is as isolated as possible from other nodes (separate servers, network, -power, even geography). The ring guarantees that every replica is stored -in a separate zone. For more information about the ring and zones, see: :doc:`The Rings `. - -To increase reliability, you may want to add additional Proxy servers for performance which is described in :ref:`add-proxy-server`. - -Network Setup Notes -------------------- - -This document refers to two networks. An external network for connecting to the Proxy server, and a storage network that is not accessibile from outside the cluster, to which all of the nodes are connected. All of the Swift services, as well as the rsync daemon on the Storage nodes are configured to listen on their STORAGE_LOCAL_NET IP addresses. - -.. note:: - Run all commands as the root user - -General OS configuration and partitioning for each node -------------------------------------------------------- - -#. Install the baseline Ubuntu Server 10.04 LTS on all nodes. - -#. Install common Swift software prereqs:: - - apt-get install python-software-properties - add-apt-repository ppa:swift-core/release - apt-get update - apt-get install swift python-swiftclient openssh-server - -#. Create and populate configuration directories:: - - mkdir -p /etc/swift - chown -R swift:swift /etc/swift/ - -#. 
On the first node only, create /etc/swift/swift.conf:: - - cat >/etc/swift/swift.conf </etc/swift/proxy-server.conf <> /etc/fstab - mkdir -p /srv/node/sdb1 - mount /srv/node/sdb1 - chown -R swift:swift /srv/node - -#. Create /etc/rsyncd.conf:: - - cat >/etc/rsyncd.conf </etc/swift/account-server.conf </etc/swift/container-server.conf </etc/swift/object-server.conf <' - -#. Check that ``swift`` works (at this point, expect zero containers, zero objects, and zero bytes):: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass stat - -#. Use ``swift`` to upload a few files named 'bigfile[1-2].tgz' to a container named 'myfiles':: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass upload myfiles bigfile1.tgz - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass upload myfiles bigfile2.tgz - -#. Use ``swift`` to download all files from the 'myfiles' container:: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass download myfiles - -#. Use ``swift`` to save a backup of your builder files to a container named 'builders'. Very important not to lose your builders!:: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass upload builders /etc/swift/*.builder - -#. Use ``swift`` to list your containers:: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass list - -#. Use ``swift`` to list the contents of your 'builders' container:: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass list builders - -#. Use ``swift`` to download all files from the 'builders' container:: - - swift -A https://$PROXY_LOCAL_NET_IP:8080/auth/v1.0 -U system:root -K testpass download builders - -.. _add-proxy-server: - -Adding a Proxy Server ---------------------- - -For reliability's sake you may want to have more than one proxy server. You can set up the additional proxy node in the same manner that you set up the first proxy node but with additional configuration steps. - -Once you have more than two proxies, you also want to load balance between the two, which means your storage endpoint also changes. You can select from different strategies for load balancing. For example, you could use round robin dns, or an actual load balancer (like pound) in front of the two proxies, and point your storage url to the load balancer. - -See :ref:`config-proxy` for the initial setup, and then follow these additional steps. - -#. Update the list of memcache servers in /etc/swift/proxy-server.conf for all the added proxy servers. If you run multiple memcache servers, use this pattern for the multiple IP:port listings: `10.1.2.3:11211,10.1.2.4:11211` in each proxy server's conf file.:: - - [filter:cache] - use = egg:swift#memcache - memcache_servers = $PROXY_LOCAL_NET_IP:11211 - -#. Change the storage url for any users to point to the load balanced url, rather than the first proxy server you created in /etc/swift/proxy-server.conf:: - - [filter:tempauth] - use = egg:swift#tempauth - user_system_root = testpass .admin http[s]://:/v1/AUTH_system - -#. Next, copy all the ring information to all the nodes, including your new proxy nodes, and ensure the ring info gets to all the storage nodes as well. - -#. After you sync all the nodes, make sure the admin has the keys in /etc/swift and the ownership for the ring file is correct. 
- -Troubleshooting Notes ---------------------- -If you see problems, look in var/log/syslog (or messages on some distros). - -Also, at Rackspace we have seen hints at drive failures by looking at error messages in /var/log/kern.log. - -There are more debugging hints and tips in the :doc:`admin_guide`. diff --git a/doc/source/images/ec_overview.png b/doc/source/images/ec_overview.png new file mode 100644 index 0000000000..d44a103177 Binary files /dev/null and b/doc/source/images/ec_overview.png differ diff --git a/doc/source/images/reload_process_tree_1.svg b/doc/source/images/reload_process_tree_1.svg new file mode 100644 index 0000000000..fe026aac4d --- /dev/null +++ b/doc/source/images/reload_process_tree_1.svg @@ -0,0 +1,35 @@ + + + + + + Manager Process + + + + + + Workers + + + + diff --git a/doc/source/images/reload_process_tree_2.svg b/doc/source/images/reload_process_tree_2.svg new file mode 100644 index 0000000000..ef071e2480 --- /dev/null +++ b/doc/source/images/reload_process_tree_2.svg @@ -0,0 +1,41 @@ + + + + + + Manager Process + + + + + + Workers + + + + Socket Closer + + + + + diff --git a/doc/source/images/reload_process_tree_3.svg b/doc/source/images/reload_process_tree_3.svg new file mode 100644 index 0000000000..93918a72e9 --- /dev/null +++ b/doc/source/images/reload_process_tree_3.svg @@ -0,0 +1,41 @@ + + + + + + Re-exec'ed Manager + + + + + + Old Workers + + + + Socket Closer + + + + + diff --git a/doc/source/images/reload_process_tree_4.svg b/doc/source/images/reload_process_tree_4.svg new file mode 100644 index 0000000000..fd04d0a30f --- /dev/null +++ b/doc/source/images/reload_process_tree_4.svg @@ -0,0 +1,49 @@ + + + + + + Re-exec'ed Manager + + + + + + Old Workers + + + + Socket Closer + + + + + + New Workers + + + + + + diff --git a/doc/source/images/reload_process_tree_5.svg b/doc/source/images/reload_process_tree_5.svg new file mode 100644 index 0000000000..baac19884b --- /dev/null +++ b/doc/source/images/reload_process_tree_5.svg @@ -0,0 +1,43 @@ + + + + + + Re-exec'ed Manager + + + + + + Old Workers + + + + + + New Workers + + + + + diff --git a/doc/source/images/reload_process_tree_6.svg b/doc/source/images/reload_process_tree_6.svg new file mode 100644 index 0000000000..730a8ac6ba --- /dev/null +++ b/doc/source/images/reload_process_tree_6.svg @@ -0,0 +1,35 @@ + + + + + + Re-exec'ed Manager + + + + + + New Workers + + + + diff --git a/doc/source/images/sharded_GET.svg b/doc/source/images/sharded_GET.svg new file mode 100644 index 0000000000..03c271b5cc --- /dev/null +++ b/doc/source/images/sharded_GET.svg @@ -0,0 +1,2019 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cont (fresh db) + /.shards_acct + /acct + cont-568d8e-<ts>-0 + + + cont-750ed3-<ts>-1 + cont-4ec28d-<ts>-2 + + cont-aef34f-<ts>-3 + "" - "cat" + "cat" - "giraffe" + "giraffe" - "igloo" + "igloo" - "linux" + + cont-4837ad-<ts>-4 + "linux" - "" + + proxy + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + diff --git 
a/doc/source/images/sharding_GET.svg b/doc/source/images/sharding_GET.svg new file mode 100644 index 0000000000..5e9240feeb
[New SVG diagrams for the container-sharding documentation: sharding_GET.svg, sharding_cleave1_load.svg, sharding_cleave2_load.svg, sharding_cleave_basic.svg, sharding_db_states.svg, sharding_scan_basic.svg, sharding_scan_load.svg and sharding_sharded_load.svg. Only the diagram labels are recoverable here: the /acct and /.shards_acct paths, shard containers such as cont-568d8e-<ts>-0, shard ranges from "" - "cat" through "linux" - "", and the UNSHARDED / SHARDING / SHARDED DB states.]
"giraffe" - "igloo" + "igloo" - "linux" + + + + cont-4837ad-<ts>-4 + "linux" - "" + diff --git a/doc/source/images/sharding_unsharded.svg b/doc/source/images/sharding_unsharded.svg new file mode 100644 index 0000000000..4241b0de13 --- /dev/null +++ b/doc/source/images/sharding_unsharded.svg @@ -0,0 +1,199 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + /acct + cont + diff --git a/doc/source/images/sharding_unsharded_load.svg b/doc/source/images/sharding_unsharded_load.svg new file mode 100644 index 0000000000..e613e8cbbd --- /dev/null +++ b/doc/source/images/sharding_unsharded_load.svg @@ -0,0 +1,219 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cont + /acct + diff --git a/doc/source/index.rst b/doc/source/index.rst index 2d8b97ba5a..6332d396e5 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,5 +1,5 @@ .. - Copyright 2010-2012 OpenStack LLC + Copyright 2010-2012 OpenStack Foundation All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -13,16 +13,17 @@ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + +================================= Welcome to Swift's documentation! ================================= -Swift is a highly available, distributed, eventually consistent object/blob +Swift is a highly available, distributed, eventually consistent object/blob store. Organizations can use Swift to store lots of data efficiently, safely, and cheaply. This documentation is generated by the Sphinx toolkit and lives in the source tree. Additional documentation on Swift and other components of OpenStack can -be found on the `OpenStack wiki`_ and at http://docs.openstack.org. +be found on the `OpenStack wiki`_ and at http://docs.openstack.org. .. _`OpenStack wiki`: http://wiki.openstack.org @@ -32,8 +33,8 @@ be found on the `OpenStack wiki`_ and at http://docs.openstack.org. .. toctree:: - :maxdepth: 1 - + :maxdepth: 2 + getting_started Overview and Concepts @@ -42,18 +43,40 @@ Overview and Concepts .. toctree:: :maxdepth: 1 + api/object_api_v1_overview overview_architecture + overview_wsgi_management overview_ring + overview_ring_format + overview_policies overview_reaper overview_auth + overview_acl overview_replication ratelimit overview_large_objects - overview_object_versioning + overview_global_cluster overview_container_sync overview_expiring_objects + cors + crossdomain + overview_erasure_code + overview_encryption + overview_backing_store + overview_container_sharding + ring_background + ring_partpower associated_projects +Contributor Documentation +========================= + +.. toctree:: + :maxdepth: 2 + + contributor/contributing + contributor/review_guidelines + Developer Documentation ======================= @@ -62,17 +85,73 @@ Developer Documentation development_guidelines development_saio + first_contribution_swift + policies_saio development_auth + development_middleware + development_ondisk_backends + development_watchers Administrator Documentation =========================== .. 
toctree:: :maxdepth: 1 - - howto_installmultinode + deployment_guide + apache_deployment_guide admin_guide + replication_network + logs + ops_runbook/index + admin/index + install/index + config/index + + +Object Storage v1 REST API Documentation +======================================== + +See `Complete Reference for the Object Storage REST API `_ + +The following provides supporting information for the REST API: + +.. toctree:: + :maxdepth: 1 + + api/object_api_v1_overview.rst + api/discoverability.rst + api/authentication.rst + api/container_quotas.rst + api/object_versioning.rst + api/large_objects.rst + api/temporary_url_middleware.rst + api/form_post_middleware.rst + api/use_content-encoding_metadata.rst + api/use_the_content-disposition_metadata.rst + api/pseudo-hierarchical-folders-directories.rst + api/pagination.rst + api/serialized-response-formats.rst + api/static-website.rst + api/object-expiration.rst + api/bulk-delete.rst + +S3 Compatibility Info +===================== + +.. toctree:: + :maxdepth: 1 + + s3_compat + +OpenStack End User Guide +======================== + +The `OpenStack End User Guide `_ +has additional information on using Swift. +See the `Manage objects and containers `_ +section. + Source Documentation ==================== @@ -87,6 +166,8 @@ Source Documentation db object misc + middleware + audit_watchers Indices and tables @@ -95,4 +176,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/doc/source/install/controller-common_prerequisites.txt b/doc/source/install/controller-common_prerequisites.txt new file mode 100644 index 0000000000..0ba7c3d5f1 --- /dev/null +++ b/doc/source/install/controller-common_prerequisites.txt @@ -0,0 +1,116 @@ +Prerequisites +------------- + +The proxy service relies on an authentication and authorization mechanism such +as the Identity service. However, unlike other services, it also offers an +internal mechanism that allows it to operate without any other OpenStack +services. Before you configure the Object Storage service, you must +create service credentials and an API endpoint. + +.. note:: + + The Object Storage service does not use an SQL database on the controller + node. Instead, it uses distributed SQLite databases on each storage node. + +#. Source the ``admin`` credentials to gain access to admin-only CLI commands: + + .. code-block:: console + + $ . admin-openrc + +#. To create the Identity service credentials, complete these steps: + + * Create the ``swift`` user: + + .. code-block:: console + + $ openstack user create --domain default --password-prompt swift + User Password: + Repeat User Password: + +-----------+----------------------------------+ + | Field | Value | + +-----------+----------------------------------+ + | domain_id | default | + | enabled | True | + | id | d535e5cbd2b74ac7bfb97db9cced3ed6 | + | name | swift | + +-----------+----------------------------------+ + + * Add the ``admin`` role to the ``swift`` user: + + .. code-block:: console + + $ openstack role add --project service --user swift admin + + .. note:: + + This command provides no output. + + * Create the ``swift`` service entity: + + .. 
code-block:: console + + $ openstack service create --name swift \ + --description "OpenStack Object Storage" object-store + +-------------+----------------------------------+ + | Field | Value | + +-------------+----------------------------------+ + | description | OpenStack Object Storage | + | enabled | True | + | id | 75ef509da2c340499d454ae96a2c5c34 | + | name | swift | + | type | object-store | + +-------------+----------------------------------+ + +#. Create the Object Storage service API endpoints: + + .. code-block:: console + + $ openstack endpoint create --region RegionOne \ + object-store public http://controller:8080/v1/AUTH_%\(project_id\)s + +--------------+----------------------------------------------+ + | Field | Value | + +--------------+----------------------------------------------+ + | enabled | True | + | id | 12bfd36f26694c97813f665707114e0d | + | interface | public | + | region | RegionOne | + | region_id | RegionOne | + | service_id | 75ef509da2c340499d454ae96a2c5c34 | + | service_name | swift | + | service_type | object-store | + | url | http://controller:8080/v1/AUTH_%(project_id)s | + +--------------+----------------------------------------------+ + + $ openstack endpoint create --region RegionOne \ + object-store internal http://controller:8080/v1/AUTH_%\(project_id\)s + +--------------+----------------------------------------------+ + | Field | Value | + +--------------+----------------------------------------------+ + | enabled | True | + | id | 7a36bee6733a4b5590d74d3080ee6789 | + | interface | internal | + | region | RegionOne | + | region_id | RegionOne | + | service_id | 75ef509da2c340499d454ae96a2c5c34 | + | service_name | swift | + | service_type | object-store | + | url | http://controller:8080/v1/AUTH_%(project_id)s | + +--------------+----------------------------------------------+ + + $ openstack endpoint create --region RegionOne \ + object-store admin http://controller:8080/v1 + +--------------+----------------------------------+ + | Field | Value | + +--------------+----------------------------------+ + | enabled | True | + | id | ebb72cd6851d4defabc0b9d71cdca69b | + | interface | admin | + | region | RegionOne | + | region_id | RegionOne | + | service_id | 75ef509da2c340499d454ae96a2c5c34 | + | service_name | swift | + | service_type | object-store | + | url | http://controller:8080/v1 | + +--------------+----------------------------------+ + diff --git a/doc/source/install/controller-include.txt b/doc/source/install/controller-include.txt new file mode 100644 index 0000000000..ec6b37a270 --- /dev/null +++ b/doc/source/install/controller-include.txt @@ -0,0 +1,84 @@ +Edit the ``/etc/swift/proxy-server.conf`` file and complete the +following actions: + +* In the ``[DEFAULT]`` section, configure the bind port, user, and + configuration directory: + + .. code-block:: none + + [DEFAULT] + ... + bind_port = 8080 + user = swift + swift_dir = /etc/swift + +* In the ``[pipeline:main]`` section, remove the ``tempurl`` and + ``tempauth`` modules and add the ``authtoken`` and ``keystoneauth`` + modules: + + .. code-block:: none + + [pipeline:main] + pipeline = catch_errors gatekeeper healthcheck proxy-logging cache container_sync bulk ratelimit authtoken keystoneauth container-quotas account-quotas slo dlo versioned_writes proxy-logging proxy-server + + .. note:: + + Do not change the order of the modules. + + .. note:: + + For more information on other modules that enable additional features, + see the `Deployment Guide `__. 
+ +* In the ``[app:proxy-server]`` section, enable automatic account creation: + + .. code-block:: console + + [app:proxy-server] + use = egg:swift#proxy + ... + account_autocreate = True + +* In the ``[filter:keystoneauth]`` section, configure the operator roles: + + .. code-block:: console + + [filter:keystoneauth] + use = egg:swift#keystoneauth + ... + operator_roles = admin,user + +* In the ``[filter:authtoken]`` section, configure Identity service access: + + .. code-block:: none + + [filter:authtoken] + paste.filter_factory = keystonemiddleware.auth_token:filter_factory + ... + www_authenticate_uri = http://controller:5000 + auth_url = http://controller:5000 + memcached_servers = controller:11211 + auth_type = password + project_domain_id = default + user_domain_id = default + project_name = service + username = swift + password = SWIFT_PASS + delay_auth_decision = True + + Replace ``SWIFT_PASS`` with the password you chose for the ``swift`` user + in the Identity service. + + .. note:: + + Comment out or remove any other options in the ``[filter:authtoken]`` + section. + +* In the ``[filter:cache]`` section, configure the ``memcached`` location: + + .. code-block:: none + + [filter:cache] + use = egg:swift#memcache + ... + memcache_servers = controller:11211 diff --git a/doc/source/install/controller-install-debian.rst b/doc/source/install/controller-install-debian.rst new file mode 100644 index 0000000000..abcd65ff77 --- /dev/null +++ b/doc/source/install/controller-install-debian.rst @@ -0,0 +1,52 @@ +.. _controller-debian: + +Install and configure the controller node for Debian +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure the proxy service that +handles requests for the account, container, and object services operating +on the storage nodes. For simplicity, this guide installs and configures +the proxy service on the controller node. However, you can run the proxy +service on any node with network connectivity to the storage nodes. +Additionally, you can install and configure the proxy service on multiple +nodes to increase performance and redundancy. For more information, see the +`Deployment Guide `__. + +This section applies to Debian. + +.. include:: controller-common_prerequisites.txt + +Install and configure components +-------------------------------- + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +#. Install the packages: + + .. code-block:: console + + # apt-get install swift swift-proxy python3-swiftclient \ + python3-keystoneclient python3-keystonemiddleware \ + memcached + + .. note:: + + Complete OpenStack environments already include some of these + packages. + + 2. Create the ``/etc/swift`` directory. + + 3. Obtain the proxy service configuration file from the Object Storage + source repository: + + .. code-block:: console + + # curl -o /etc/swift/proxy-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/proxy-server.conf-sample + + 4. .. include:: controller-include.txt diff --git a/doc/source/install/controller-install-rdo.rst b/doc/source/install/controller-install-rdo.rst new file mode 100644 index 0000000000..076ecaf200 --- /dev/null +++ b/doc/source/install/controller-install-rdo.rst @@ -0,0 +1,50 @@ +.. 
_controller-rdo: + +Install and configure the controller node for Red Hat Enterprise Linux and CentOS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure the proxy service that +handles requests for the account, container, and object services operating +on the storage nodes. For simplicity, this guide installs and configures +the proxy service on the controller node. However, you can run the proxy +service on any node with network connectivity to the storage nodes. +Additionally, you can install and configure the proxy service on multiple +nodes to increase performance and redundancy. For more information, see the +`Deployment Guide `__. + +This section applies to Red Hat Enterprise Linux 9 and CentOS stream9. + +.. include:: controller-common_prerequisites.txt + +Install and configure components +-------------------------------- + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +#. Install the packages: + + .. code-block:: console + + # dnf install openstack-swift-proxy python3-swiftclient \ + python3-keystoneclient python3-keystonemiddleware \ + memcached + + .. note:: + + Complete OpenStack environments already include some of these + packages. + + 2. Obtain the proxy service configuration file from the Object Storage + source repository: + + .. code-block:: console + + # curl -o /etc/swift/proxy-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/proxy-server.conf-sample + + 3. .. include:: controller-include.txt diff --git a/doc/source/install/controller-install-ubuntu.rst b/doc/source/install/controller-install-ubuntu.rst new file mode 100644 index 0000000000..e60ba8e872 --- /dev/null +++ b/doc/source/install/controller-install-ubuntu.rst @@ -0,0 +1,52 @@ +.. _controller-ubuntu: + +Install and configure the controller node for Ubuntu +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure the proxy service that +handles requests for the account, container, and object services operating +on the storage nodes. For simplicity, this guide installs and configures +the proxy service on the controller node. However, you can run the proxy +service on any node with network connectivity to the storage nodes. +Additionally, you can install and configure the proxy service on multiple +nodes to increase performance and redundancy. For more information, see the +`Deployment Guide `__. + +This section applies to Ubuntu 14.04 (LTS). + +.. include:: controller-common_prerequisites.txt + +Install and configure components +-------------------------------- + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +#. Install the packages: + + .. code-block:: console + + # apt-get install swift swift-proxy python-swiftclient \ + python-keystoneclient python-keystonemiddleware \ + memcached + + .. note:: + + Complete OpenStack environments already include some of these + packages. + + 2. Create the ``/etc/swift`` directory. + + 3. 
Obtain the proxy service configuration file from the Object Storage + source repository: + + .. code-block:: console + + # curl -o /etc/swift/proxy-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/proxy-server.conf-sample + + 4. .. include:: controller-include.txt diff --git a/doc/source/install/controller-install.rst b/doc/source/install/controller-install.rst new file mode 100644 index 0000000000..294c2396b3 --- /dev/null +++ b/doc/source/install/controller-install.rst @@ -0,0 +1,17 @@ +.. _controller: + +Install and configure the controller node +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure the proxy service that +handles requests for the account, container, and object services operating +on the storage nodes. + +Note that installation and configuration vary by distribution. + +.. toctree:: + :maxdepth: 1 + + controller-install-rdo.rst + controller-install-ubuntu.rst + controller-install-debian.rst diff --git a/doc/source/install/edit_hosts_file.txt b/doc/source/install/edit_hosts_file.txt new file mode 100644 index 0000000000..2e9bc9659f --- /dev/null +++ b/doc/source/install/edit_hosts_file.txt @@ -0,0 +1,19 @@ +Edit the ``/etc/hosts`` file to contain the following: + +.. code-block:: none + + # controller + 10.0.0.11 controller + + # compute1 + 10.0.0.31 compute1 + + # block1 + 10.0.0.41 block1 + + # object1 + 10.0.0.51 object1 + + # object2 + 10.0.0.52 object2 + diff --git a/doc/source/install/environment-networking.rst b/doc/source/install/environment-networking.rst new file mode 100644 index 0000000000..befb309fa7 --- /dev/null +++ b/doc/source/install/environment-networking.rst @@ -0,0 +1,66 @@ +.. _networking: + +Configure networking +~~~~~~~~~~~~~~~~~~~~ + +Before you start deploying the Object Storage service in your OpenStack +environment, configure networking for two additional storage nodes. + +First node +---------- + +Configure network interfaces +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Configure the management interface: + + * IP address: ``10.0.0.51`` + + * Network mask: ``255.255.255.0`` (or ``/24``) + + * Default gateway: ``10.0.0.1`` + +Configure name resolution +^^^^^^^^^^^^^^^^^^^^^^^^^ + +#. Set the hostname of the node to ``object1``. + +#. .. include:: edit_hosts_file.txt + +#. Reboot the system to activate the changes. + +Second node +----------- + +Configure network interfaces +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Configure the management interface: + + * IP address: ``10.0.0.52`` + + * Network mask: ``255.255.255.0`` (or ``/24``) + + * Default gateway: ``10.0.0.1`` + +Configure name resolution +^^^^^^^^^^^^^^^^^^^^^^^^^ + +#. Set the hostname of the node to ``object2``. + +#. .. include:: edit_hosts_file.txt + +#. Reboot the system to activate the changes. + +.. warning:: + + Some distributions add an extraneous entry in the ``/etc/hosts`` + file that resolves the actual hostname to another loopback IP + address such as ``127.0.1.1``. You must comment out or remove this + entry to prevent name resolution problems. **Do not remove the + 127.0.0.1 entry.** + +.. note:: + + To reduce complexity of this guide, we add host entries for optional + services regardless of whether you choose to deploy them. diff --git a/doc/source/install/finalize-installation-rdo.rst b/doc/source/install/finalize-installation-rdo.rst new file mode 100644 index 0000000000..ec6b115061 --- /dev/null +++ b/doc/source/install/finalize-installation-rdo.rst @@ -0,0 +1,89 @@ +.. 
_finalize-rdo: + +Finalize installation for Red Hat Enterprise Linux and CentOS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +This section applies to Red Hat Enterprise Linux 9 and CentOS stream9. + +#. Obtain the ``/etc/swift/swift.conf`` file from the Object + Storage source repository: + + .. code-block:: console + + # curl -o /etc/swift/swift.conf \ + https://opendev.org/openstack/swift/raw/branch/master/etc/swift.conf-sample + +#. Edit the ``/etc/swift/swift.conf`` file and complete the following + actions: + + * In the ``[swift-hash]`` section, configure the hash path prefix and + suffix for your environment. + + .. code-block:: none + + [swift-hash] + ... + swift_hash_path_suffix = HASH_PATH_SUFFIX + swift_hash_path_prefix = HASH_PATH_PREFIX + + Replace HASH_PATH_PREFIX and HASH_PATH_SUFFIX with unique values. + + .. warning:: + + Keep these values secret and do not change or lose them. + + * In the ``[storage-policy:0]`` section, configure the default + storage policy: + + .. code-block:: none + + [storage-policy:0] + ... + name = Policy-0 + default = yes + +#. Copy the ``swift.conf`` file to the ``/etc/swift`` directory on + each storage node and any additional nodes running the proxy service. + +4. On all nodes, ensure proper ownership of the configuration directory: + + .. code-block:: console + + # chown -R root:swift /etc/swift + +5. On the controller node and any other nodes running the proxy service, + start the Object Storage proxy service including its dependencies and + configure them to start when the system boots: + + .. code-block:: console + + # systemctl enable openstack-swift-proxy.service memcached.service + # systemctl start openstack-swift-proxy.service memcached.service + +6. On the storage nodes, start the Object Storage services and configure + them to start when the system boots: + + .. 
code-block:: console + + # systemctl enable openstack-swift-account.service openstack-swift-account-auditor.service \ + openstack-swift-account-reaper.service openstack-swift-account-replicator.service + # systemctl start openstack-swift-account.service openstack-swift-account-auditor.service \ + openstack-swift-account-reaper.service openstack-swift-account-replicator.service + # systemctl enable openstack-swift-container.service \ + openstack-swift-container-auditor.service openstack-swift-container-replicator.service \ + openstack-swift-container-updater.service + # systemctl start openstack-swift-container.service \ + openstack-swift-container-auditor.service openstack-swift-container-replicator.service \ + openstack-swift-container-updater.service + # systemctl enable openstack-swift-object.service openstack-swift-object-auditor.service \ + openstack-swift-object-replicator.service openstack-swift-object-updater.service + # systemctl start openstack-swift-object.service openstack-swift-object-auditor.service \ + openstack-swift-object-replicator.service openstack-swift-object-updater.service diff --git a/doc/source/install/finalize-installation-ubuntu-debian.rst b/doc/source/install/finalize-installation-ubuntu-debian.rst new file mode 100644 index 0000000000..ccde2fd568 --- /dev/null +++ b/doc/source/install/finalize-installation-ubuntu-debian.rst @@ -0,0 +1,80 @@ +.. _finalize-ubuntu-debian: + +Finalize installation for Ubuntu and Debian +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +This section applies to Ubuntu 14.04 (LTS) and Debian. + +#. Obtain the ``/etc/swift/swift.conf`` file from the Object + Storage source repository: + + .. code-block:: console + + # curl -o /etc/swift/swift.conf \ + https://opendev.org/openstack/swift/raw/branch/master/etc/swift.conf-sample + +#. Edit the ``/etc/swift/swift.conf`` file and complete the following + actions: + + * In the ``[swift-hash]`` section, configure the hash path prefix and + suffix for your environment. + + .. code-block:: none + + [swift-hash] + ... + swift_hash_path_suffix = HASH_PATH_SUFFIX + swift_hash_path_prefix = HASH_PATH_PREFIX + + Replace HASH_PATH_PREFIX and HASH_PATH_SUFFIX with unique values. + + .. warning:: + + Keep these values secret and do not change or lose them. + + * In the ``[storage-policy:0]`` section, configure the default + storage policy: + + .. code-block:: none + + [storage-policy:0] + ... + name = Policy-0 + default = yes + +#. Copy the ``swift.conf`` file to the ``/etc/swift`` directory on + each storage node and any additional nodes running the proxy service. + +4. On all nodes, ensure proper ownership of the configuration directory: + + .. code-block:: console + + # chown -R root:swift /etc/swift + +5. On the controller node and any other nodes running the proxy service, + restart the Object Storage proxy service including its dependencies: + + .. code-block:: console + + # service memcached restart + # service swift-proxy restart + +6. On the storage nodes, start the Object Storage services: + + .. code-block:: console + + # swift-init all start + + .. note:: + + The storage node runs many Object Storage services and the + :command:`swift-init` command makes them easier to manage. 
+ You can ignore errors from services not running on the storage node. diff --git a/doc/source/install/finalize-installation.rst b/doc/source/install/finalize-installation.rst new file mode 100644 index 0000000000..447198bcae --- /dev/null +++ b/doc/source/install/finalize-installation.rst @@ -0,0 +1,12 @@ +.. _finalize: + +Finalize installation +~~~~~~~~~~~~~~~~~~~~~ + +Finalizing installation varies by distribution. + +.. toctree:: + :maxdepth: 1 + + finalize-installation-rdo.rst + finalize-installation-ubuntu-debian.rst diff --git a/doc/source/install/get_started.rst b/doc/source/install/get_started.rst new file mode 100644 index 0000000000..1cf963010f --- /dev/null +++ b/doc/source/install/get_started.rst @@ -0,0 +1,51 @@ +=============================== +Object Storage service overview +=============================== + +The OpenStack Object Storage is a multi-tenant object storage system. It +is highly scalable and can manage large amounts of unstructured data at +low cost through a RESTful HTTP API. + +It includes the following components: + +Proxy servers (swift-proxy-server) + Accepts OpenStack Object Storage API and raw HTTP requests to upload + files, modify metadata, and create containers. It also serves file + or container listings to web browsers. To improve performance, the + proxy server can use an optional cache that is usually deployed with + memcache. + +Account servers (swift-account-server) + Manages accounts defined with Object Storage. + +Container servers (swift-container-server) + Manages the mapping of containers or folders, within Object Storage. + +Object servers (swift-object-server) + Manages actual objects, such as files, on the storage nodes. + +Various periodic processes + Performs housekeeping tasks on the large data store. The replication + services ensure consistency and availability through the cluster. + Other periodic processes include auditors, updaters, and reapers. + +WSGI middleware + Handles authentication and is usually OpenStack Identity. + +swift client + Enables users to submit commands to the REST API through a + command-line client authorized as either a admin user, reseller + user, or swift user. + +swift-init + Script that initializes the building of the ring file, takes daemon + names as parameter and offers commands. Documented in + https://docs.openstack.org/swift/latest/admin_guide.html#managing-services. + +swift-recon + A cli tool used to retrieve various metrics and telemetry information + about a cluster that has been collected by the swift-recon middleware. + +swift-ring-builder + Storage ring build and rebalance utility. Documented in + https://docs.openstack.org/swift/latest/admin_guide.html#managing-the-rings. diff --git a/doc/source/install/index.rst b/doc/source/install/index.rst new file mode 100644 index 0000000000..62d062356e --- /dev/null +++ b/doc/source/install/index.rst @@ -0,0 +1,24 @@ +============================ +Object Storage Install Guide +============================ + +.. toctree:: + :maxdepth: 2 + + get_started.rst + environment-networking.rst + controller-install.rst + storage-install.rst + initial-rings.rst + finalize-installation.rst + verify.rst + next-steps.rst + +The Object Storage services (swift) work together to provide +object storage and retrieval through a REST API. + +This chapter assumes a working setup of OpenStack following the +`OpenStack Installation Tutorial `_. + +Your environment must at least include the Identity service (keystone) +prior to deploying Object Storage. 
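Once the services in this chapter are installed and the rings are built, the same REST API that the ``swift`` command-line client wraps can also be exercised from Python. The sketch below is an illustration only and not part of the official installation steps; it assumes the ``python3-swiftclient`` package installed earlier alongside the proxy packages, the Identity endpoint ``http://controller:5000`` used throughout this guide, and placeholder ``demo`` project credentials (``DEMO_PASS`` is hypothetical).

.. code-block:: python

   from swiftclient.client import Connection

   # Authenticate against the Identity service (v3) and obtain a Swift
   # connection. All credential values below are placeholders.
   conn = Connection(
       authurl='http://controller:5000/v3',
       user='demo',
       key='DEMO_PASS',
       os_options={
           'project_name': 'demo',
           'project_domain_name': 'Default',
           'user_domain_name': 'Default',
       },
       auth_version='3')

   conn.put_container('container1')                     # create a container
   conn.put_object('container1', 'hello.txt',
                   contents=b'Hello, Object Storage!')  # upload an object
   headers, body = conn.get_object('container1', 'hello.txt')
   print(body.decode())                                 # Hello, Object Storage!

The ``openstack container create`` and ``openstack object create`` commands used in the verification section later in this guide perform the equivalent operations from the shell.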
diff --git a/doc/source/install/initial-rings.rst b/doc/source/install/initial-rings.rst new file mode 100644 index 0000000000..e09dfd4ed2 --- /dev/null +++ b/doc/source/install/initial-rings.rst @@ -0,0 +1,253 @@ +Create and distribute initial rings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Before starting the Object Storage services, you must create the initial +account, container, and object rings. The ring builder creates configuration +files that each node uses to determine and deploy the storage architecture. +For simplicity, this guide uses one region and two zones with 2^10 (1024) +maximum partitions, 3 replicas of each object, and 1 hour minimum time between +moving a partition more than once. For Object Storage, a partition indicates a +directory on a storage device rather than a conventional partition table. +For more information, see the +`Deployment Guide `__. + +.. note:: + Perform these steps on the controller node. + +Create account ring +------------------- + +The account server uses the account ring to maintain lists of containers. + +#. Change to the ``/etc/swift`` directory. + +#. Create the base ``account.builder`` file: + + .. code-block:: console + + # swift-ring-builder account.builder create 10 3 1 + + .. note:: + + This command provides no output. + +#. Add each storage node to the ring: + + .. code-block:: console + + # swift-ring-builder account.builder \ + add --region 1 --zone 1 --ip STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS --port 6202 \ + --device DEVICE_NAME --weight DEVICE_WEIGHT + + Replace ``STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address + of the management network on the storage node. Replace ``DEVICE_NAME`` with a + storage device name on the same storage node. For example, using the first + storage node in :ref:`storage` with the ``/dev/sdb`` storage + device and weight of 100: + + .. code-block:: console + + # swift-ring-builder account.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6202 --device sdb --weight 100 + + Repeat this command for each storage device on each storage node. In the + example architecture, use the command in four variations: + + .. code-block:: console + + # swift-ring-builder account.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6202 --device sdb --weight 100 + Device d0r1z1-10.0.0.51:6202R10.0.0.51:6202/sdb_"" with 100.0 weight got id 0 + # swift-ring-builder account.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6202 --device sdc --weight 100 + Device d1r1z2-10.0.0.51:6202R10.0.0.51:6202/sdc_"" with 100.0 weight got id 1 + # swift-ring-builder account.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6202 --device sdb --weight 100 + Device d2r1z3-10.0.0.52:6202R10.0.0.52:6202/sdb_"" with 100.0 weight got id 2 + # swift-ring-builder account.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6202 --device sdc --weight 100 + Device d3r1z4-10.0.0.52:6202R10.0.0.52:6202/sdc_"" with 100.0 weight got id 3 + +#. Verify the ring contents: + + .. 
code-block:: console + + # swift-ring-builder account.builder + account.builder, build version 4 + 1024 partitions, 3.000000 replicas, 1 regions, 2 zones, 4 devices, 100.00 balance, 0.00 dispersion + The minimum number of hours before a partition can be reassigned is 1 + The overload factor is 0.00% (0.000000) + Devices: id region zone ip address port replication ip replication port name weight partitions balance meta + 0 1 1 10.0.0.51 6202 10.0.0.51 6202 sdb 100.00 0 -100.00 + 1 1 1 10.0.0.51 6202 10.0.0.51 6202 sdc 100.00 0 -100.00 + 2 1 2 10.0.0.52 6202 10.0.0.52 6202 sdb 100.00 0 -100.00 + 3 1 2 10.0.0.52 6202 10.0.0.52 6202 sdc 100.00 0 -100.00 + +#. Rebalance the ring: + + .. code-block:: console + + # swift-ring-builder account.builder rebalance + Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + +Create container ring +--------------------- + +The container server uses the container ring to maintain lists of objects. +However, it does not track object locations. + +#. Change to the ``/etc/swift`` directory. + +#. Create the base ``container.builder`` file: + + .. code-block:: console + + # swift-ring-builder container.builder create 10 3 1 + + .. note:: + + This command provides no output. + +#. Add each storage node to the ring: + + .. code-block:: console + + # swift-ring-builder container.builder \ + add --region 1 --zone 1 --ip STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS --port 6201 \ + --device DEVICE_NAME --weight DEVICE_WEIGHT + + Replace ``STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address + of the management network on the storage node. Replace ``DEVICE_NAME`` with a + storage device name on the same storage node. For example, using the first + storage node in :ref:`storage` with the ``/dev/sdb`` + storage device and weight of 100: + + .. code-block:: console + + # swift-ring-builder container.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6201 --device sdb --weight 100 + + Repeat this command for each storage device on each storage node. In the + example architecture, use the command in four variations: + + .. code-block:: console + + # swift-ring-builder container.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6201 --device sdb --weight 100 + Device d0r1z1-10.0.0.51:6201R10.0.0.51:6201/sdb_"" with 100.0 weight got id 0 + # swift-ring-builder container.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6201 --device sdc --weight 100 + Device d1r1z2-10.0.0.51:6201R10.0.0.51:6201/sdc_"" with 100.0 weight got id 1 + # swift-ring-builder container.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6201 --device sdb --weight 100 + Device d2r1z3-10.0.0.52:6201R10.0.0.52:6201/sdb_"" with 100.0 weight got id 2 + # swift-ring-builder container.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6201 --device sdc --weight 100 + Device d3r1z4-10.0.0.52:6201R10.0.0.52:6201/sdc_"" with 100.0 weight got id 3 + +#. Verify the ring contents: + + .. 
code-block:: console + + # swift-ring-builder container.builder + container.builder, build version 4 + 1024 partitions, 3.000000 replicas, 1 regions, 2 zones, 4 devices, 100.00 balance, 0.00 dispersion + The minimum number of hours before a partition can be reassigned is 1 + The overload factor is 0.00% (0.000000) + Devices: id region zone ip address port replication ip replication port name weight partitions balance meta + 0 1 1 10.0.0.51 6201 10.0.0.51 6201 sdb 100.00 0 -100.00 + 1 1 1 10.0.0.51 6201 10.0.0.51 6201 sdc 100.00 0 -100.00 + 2 1 2 10.0.0.52 6201 10.0.0.52 6201 sdb 100.00 0 -100.00 + 3 1 2 10.0.0.52 6201 10.0.0.52 6201 sdc 100.00 0 -100.00 + +#. Rebalance the ring: + + .. code-block:: console + + # swift-ring-builder container.builder rebalance + Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + +Create object ring +------------------ + +The object server uses the object ring to maintain lists of object locations +on local devices. + +#. Change to the ``/etc/swift`` directory. + +#. Create the base ``object.builder`` file: + + .. code-block:: console + + # swift-ring-builder object.builder create 10 3 1 + + .. note:: + + This command provides no output. + +#. Add each storage node to the ring: + + .. code-block:: console + + # swift-ring-builder object.builder \ + add --region 1 --zone 1 --ip STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS --port 6200 \ + --device DEVICE_NAME --weight DEVICE_WEIGHT + + Replace ``STORAGE_NODE_MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address + of the management network on the storage node. Replace ``DEVICE_NAME`` with + a storage device name on the same storage node. For example, using the first + storage node in :ref:`storage` with the ``/dev/sdb`` storage + device and weight of 100: + + .. code-block:: console + + # swift-ring-builder object.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6200 --device sdb --weight 100 + + Repeat this command for each storage device on each storage node. In the + example architecture, use the command in four variations: + + .. code-block:: console + + # swift-ring-builder object.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6200 --device sdb --weight 100 + Device d0r1z1-10.0.0.51:6200R10.0.0.51:6200/sdb_"" with 100.0 weight got id 0 + # swift-ring-builder object.builder add \ + --region 1 --zone 1 --ip 10.0.0.51 --port 6200 --device sdc --weight 100 + Device d1r1z2-10.0.0.51:6200R10.0.0.51:6200/sdc_"" with 100.0 weight got id 1 + # swift-ring-builder object.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6200 --device sdb --weight 100 + Device d2r1z3-10.0.0.52:6200R10.0.0.52:6200/sdb_"" with 100.0 weight got id 2 + # swift-ring-builder object.builder add \ + --region 1 --zone 2 --ip 10.0.0.52 --port 6200 --device sdc --weight 100 + Device d3r1z4-10.0.0.52:6200R10.0.0.52:6200/sdc_"" with 100.0 weight got id 3 + +#. Verify the ring contents: + + .. 
code-block:: console + + # swift-ring-builder object.builder + object.builder, build version 4 + 1024 partitions, 3.000000 replicas, 1 regions, 2 zones, 4 devices, 100.00 balance, 0.00 dispersion + The minimum number of hours before a partition can be reassigned is 1 + The overload factor is 0.00% (0.000000) + Devices: id region zone ip address port replication ip replication port name weight partitions balance meta + 0 1 1 10.0.0.51 6200 10.0.0.51 6200 sdb 100.00 0 -100.00 + 1 1 1 10.0.0.51 6200 10.0.0.51 6200 sdc 100.00 0 -100.00 + 2 1 2 10.0.0.52 6200 10.0.0.52 6200 sdb 100.00 0 -100.00 + 3 1 2 10.0.0.52 6200 10.0.0.52 6200 sdc 100.00 0 -100.00 + +#. Rebalance the ring: + + .. code-block:: console + + # swift-ring-builder object.builder rebalance + Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00 + +Distribute ring configuration files +----------------------------------- + +* Copy the ``account.ring.gz``, ``container.ring.gz``, and + ``object.ring.gz`` files to the ``/etc/swift`` directory + on each storage node and any additional nodes running the + proxy service. diff --git a/doc/source/install/next-steps.rst b/doc/source/install/next-steps.rst new file mode 100644 index 0000000000..27585ff818 --- /dev/null +++ b/doc/source/install/next-steps.rst @@ -0,0 +1,10 @@ +.. _next-steps: + +========== +Next steps +========== + +Your OpenStack environment now includes Object Storage. + +To add more services, see the +`additional documentation on installing OpenStack `_ . diff --git a/doc/source/install/storage-include1.txt b/doc/source/install/storage-include1.txt new file mode 100644 index 0000000000..711782300a --- /dev/null +++ b/doc/source/install/storage-include1.txt @@ -0,0 +1,41 @@ +Edit the ``/etc/swift/account-server.conf`` file and complete the +following actions: + +* In the ``[DEFAULT]`` section, configure the bind IP address, bind port, + user, configuration directory, and mount point directory: + + .. code-block:: none + + [DEFAULT] + ... + bind_ip = MANAGEMENT_INTERFACE_IP_ADDRESS + bind_port = 6202 + user = swift + swift_dir = /etc/swift + devices = /srv/node + mount_check = True + + Replace ``MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address of the + management network on the storage node. + +* In the ``[pipeline:main]`` section, enable the appropriate modules: + + .. code-block:: none + + [pipeline:main] + pipeline = healthcheck recon account-server + + .. note:: + + For more information on other modules that enable additional features, + see the `Deployment Guide `__. + +* In the ``[filter:recon]`` section, configure the recon (meters) cache + directory: + + .. code-block:: none + + [filter:recon] + use = egg:swift#recon + ... + recon_cache_path = /var/cache/swift diff --git a/doc/source/install/storage-include2.txt b/doc/source/install/storage-include2.txt new file mode 100644 index 0000000000..cb320d9a1b --- /dev/null +++ b/doc/source/install/storage-include2.txt @@ -0,0 +1,41 @@ +Edit the ``/etc/swift/container-server.conf`` file and complete the +following actions: + +* In the ``[DEFAULT]`` section, configure the bind IP address, bind port, + user, configuration directory, and mount point directory: + + .. code-block:: none + + [DEFAULT] + ... + bind_ip = MANAGEMENT_INTERFACE_IP_ADDRESS + bind_port = 6201 + user = swift + swift_dir = /etc/swift + devices = /srv/node + mount_check = True + + Replace ``MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address of the + management network on the storage node. 
+ +* In the ``[pipeline:main]`` section, enable the appropriate modules: + + .. code-block:: none + + [pipeline:main] + pipeline = healthcheck recon container-server + + .. note:: + + For more information on other modules that enable additional features, + see the `Deployment Guide `__. + +* In the ``[filter:recon]`` section, configure the recon (meters) cache + directory: + + .. code-block:: none + + [filter:recon] + use = egg:swift#recon + ... + recon_cache_path = /var/cache/swift diff --git a/doc/source/install/storage-include3.txt b/doc/source/install/storage-include3.txt new file mode 100644 index 0000000000..2cc9e2d235 --- /dev/null +++ b/doc/source/install/storage-include3.txt @@ -0,0 +1,42 @@ +Edit the ``/etc/swift/object-server.conf`` file and complete the +following actions: + +* In the ``[DEFAULT]`` section, configure the bind IP address, bind port, + user, configuration directory, and mount point directory: + + .. code-block:: none + + [DEFAULT] + ... + bind_ip = MANAGEMENT_INTERFACE_IP_ADDRESS + bind_port = 6200 + user = swift + swift_dir = /etc/swift + devices = /srv/node + mount_check = True + + Replace ``MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address of the + management network on the storage node. + +* In the ``[pipeline:main]`` section, enable the appropriate modules: + + .. code-block:: none + + [pipeline:main] + pipeline = healthcheck recon object-server + + .. note:: + + For more information on other modules that enable additional features, + see the `Deployment Guide `__. + +* In the ``[filter:recon]`` section, configure the recon (meters) cache + and lock directories: + + .. code-block:: none + + [filter:recon] + use = egg:swift#recon + ... + recon_cache_path = /var/cache/swift + recon_lock_path = /var/lock diff --git a/doc/source/install/storage-install-rdo.rst b/doc/source/install/storage-install-rdo.rst new file mode 100644 index 0000000000..c032d8e35c --- /dev/null +++ b/doc/source/install/storage-install-rdo.rst @@ -0,0 +1,172 @@ +.. _storage-rdo: + +Install and configure the storage nodes for Red Hat Enterprise Linux and CentOS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure storage nodes +that operate the account, container, and object services. For +simplicity, this configuration references two storage nodes, each +containing two empty local block storage devices. The instructions +use ``/dev/sdb`` and ``/dev/sdc``, but you can substitute different +values for your particular nodes. + +Although Object Storage supports any file system with +extended attributes (xattr), testing and benchmarking +indicate the best performance and reliability on XFS. For +more information on horizontally scaling your environment, see the +`Deployment Guide `_. + +This section applies to Red Hat Enterprise Linux 9 and CentOS stream9. + +Prerequisites +------------- + +Before you install and configure the Object Storage service on the +storage nodes, you must prepare the storage devices. + +.. note:: + + Perform these steps on each storage node. + +#. Install the supporting utility packages: + + .. code-block:: console + + # dnf install xfsprogs rsync + +#. Format the ``/dev/sdb`` and ``/dev/sdc`` devices as XFS: + + .. code-block:: console + + # mkfs.xfs /dev/sdb + # mkfs.xfs /dev/sdc + +#. Create the mount point directory structure: + + .. code-block:: console + + # mkdir -p /srv/node/sdb + # mkdir -p /srv/node/sdc + +#. Find the UUID of the new partitions: + + .. 
code-block:: console + + # blkid + +#. Edit the ``/etc/fstab`` file and add the following to it: + + .. code-block:: none + + UUID="" /srv/node/sdb xfs noatime 0 2 + UUID="" /srv/node/sdc xfs noatime 0 2 + +#. Mount the devices: + + .. code-block:: console + + # mount /srv/node/sdb + # mount /srv/node/sdc + +#. Create or edit the ``/etc/rsyncd.conf`` file to contain the following: + + .. code-block:: none + + uid = swift + gid = swift + log file = /var/log/rsyncd.log + pid file = /var/run/rsyncd.pid + address = MANAGEMENT_INTERFACE_IP_ADDRESS + + [account] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/account.lock + + [container] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/container.lock + + [object] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/object.lock + + Replace ``MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address of the + management network on the storage node. + + .. note:: + + The ``rsync`` service requires no authentication, so consider running + it on a private network in production environments. + +7. Start the ``rsyncd`` service and configure it to start when the + system boots: + + .. code-block:: console + + # systemctl enable rsyncd.service + # systemctl start rsyncd.service + +Install and configure components +-------------------------------- + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +.. note:: + + Perform these steps on each storage node. + +#. Install the packages: + + .. code-block:: console + + # dnf install openstack-swift-account openstack-swift-container \ + openstack-swift-object + +2. Obtain the accounting, container, and object service configuration + files from the Object Storage source repository: + + .. code-block:: console + + # curl -o /etc/swift/account-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/account-server.conf-sample + # curl -o /etc/swift/container-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/container-server.conf-sample + # curl -o /etc/swift/object-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/object-server.conf-sample + +3. .. include:: storage-include1.txt +4. .. include:: storage-include2.txt +5. .. include:: storage-include3.txt +6. Ensure proper ownership of the mount point directory structure: + + .. code-block:: console + + # chown -R swift:swift /srv/node + +7. Create the ``recon`` directory and ensure proper ownership of it: + + .. code-block:: console + + # mkdir -p /var/cache/swift + # chown -R root:swift /var/cache/swift + # chmod -R 775 /var/cache/swift + +8. Enable necessary access in the firewall + + .. code-block:: console + + # firewall-cmd --permanent --add-port=6200/tcp + # firewall-cmd --permanent --add-port=6201/tcp + # firewall-cmd --permanent --add-port=6202/tcp + + The rsync service includes its own firewall configuration. + Connect from one node to another to ensure that access is allowed. diff --git a/doc/source/install/storage-install-ubuntu-debian.rst b/doc/source/install/storage-install-ubuntu-debian.rst new file mode 100644 index 0000000000..2464844370 --- /dev/null +++ b/doc/source/install/storage-install-ubuntu-debian.rst @@ -0,0 +1,165 @@ +.. 
_storage-ubuntu-debian: + +Install and configure the storage nodes for Ubuntu and Debian +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure storage nodes +that operate the account, container, and object services. For +simplicity, this configuration references two storage nodes, each +containing two empty local block storage devices. The instructions +use ``/dev/sdb`` and ``/dev/sdc``, but you can substitute different +values for your particular nodes. + +Although Object Storage supports any file system with +extended attributes (xattr), testing and benchmarking +indicate the best performance and reliability on XFS. For +more information on horizontally scaling your environment, see the +`Deployment Guide `_. + +This section applies to Ubuntu 14.04 (LTS) and Debian. + +Prerequisites +------------- + +Before you install and configure the Object Storage service on the +storage nodes, you must prepare the storage devices. + +.. note:: + + Perform these steps on each storage node. + +#. Install the supporting utility packages: + + .. code-block:: console + + # apt-get install xfsprogs rsync + +#. Format the ``/dev/sdb`` and ``/dev/sdc`` devices as XFS: + + .. code-block:: console + + # mkfs.xfs /dev/sdb + # mkfs.xfs /dev/sdc + +#. Create the mount point directory structure: + + .. code-block:: console + + # mkdir -p /srv/node/sdb + # mkdir -p /srv/node/sdc + +#. Find the UUID of the new partitions: + + .. code-block:: console + + # blkid + +#. Edit the ``/etc/fstab`` file and add the following to it: + + .. code-block:: none + + UUID="" /srv/node/sdb xfs noatime 0 2 + UUID="" /srv/node/sdc xfs noatime 0 2 + +#. Mount the devices: + + .. code-block:: console + + # mount /srv/node/sdb + # mount /srv/node/sdc + +#. Create or edit the ``/etc/rsyncd.conf`` file to contain the following: + + .. code-block:: none + + uid = swift + gid = swift + log file = /var/log/rsyncd.log + pid file = /var/run/rsyncd.pid + address = MANAGEMENT_INTERFACE_IP_ADDRESS + + [account] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/account.lock + + [container] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/container.lock + + [object] + max connections = 2 + path = /srv/node/ + read only = False + lock file = /var/lock/object.lock + + Replace ``MANAGEMENT_INTERFACE_IP_ADDRESS`` with the IP address of the + management network on the storage node. + + .. note:: + + The ``rsync`` service requires no authentication, so consider running + it on a private network in production environments. + +7. Edit the ``/etc/default/rsync`` file and enable the ``rsync`` + service: + + .. code-block:: none + + RSYNC_ENABLE=true + +8. Start the ``rsync`` service: + + .. code-block:: console + + # service rsync start + +Install and configure components +-------------------------------- + +.. note:: + + Default configuration files vary by distribution. You might need + to add these sections and options rather than modifying existing + sections and options. Also, an ellipsis (``...``) in the configuration + snippets indicates potential default configuration options that you + should retain. + +.. note:: + + Perform these steps on each storage node. + +#. Install the packages: + + .. code-block:: console + + # apt-get install swift swift-account swift-container swift-object + +2. Obtain the accounting, container, and object service configuration + files from the Object Storage source repository: + + .. 
code-block:: console + + # curl -o /etc/swift/account-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/account-server.conf-sample + # curl -o /etc/swift/container-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/container-server.conf-sample + # curl -o /etc/swift/object-server.conf https://opendev.org/openstack/swift/raw/branch/master/etc/object-server.conf-sample + +3. .. include:: storage-include1.txt +4. .. include:: storage-include2.txt +5. .. include:: storage-include3.txt +6. Ensure proper ownership of the mount point directory structure: + + .. code-block:: console + + # chown -R swift:swift /srv/node + +7. Create the ``recon`` directory and ensure proper ownership of it: + + .. code-block:: console + + # mkdir -p /var/cache/swift + # chown -R root:swift /var/cache/swift + # chmod -R 775 /var/cache/swift diff --git a/doc/source/install/storage-install.rst b/doc/source/install/storage-install.rst new file mode 100644 index 0000000000..7c1d4f9169 --- /dev/null +++ b/doc/source/install/storage-install.rst @@ -0,0 +1,15 @@ +.. _storage: + +Install and configure the storage nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section describes how to install and configure storage nodes +that operate the account, container, and object services. + +Note that installation and configuration vary by distribution. + +.. toctree:: + :maxdepth: 1 + + storage-install-rdo.rst + storage-install-ubuntu-debian.rst diff --git a/doc/source/install/verify.rst b/doc/source/install/verify.rst new file mode 100644 index 0000000000..2580cdd7cd --- /dev/null +++ b/doc/source/install/verify.rst @@ -0,0 +1,94 @@ +.. _verify: + +Verify operation +~~~~~~~~~~~~~~~~ + +Verify operation of the Object Storage service. + +.. note:: + + Perform these steps on the controller node. + +.. warning:: + + If you are using Red Hat Enterprise Linux 7 or CentOS 7 and one or more of + these steps do not work, check the ``/var/log/audit/audit.log`` file for + SELinux messages indicating denial of actions for the ``swift`` processes. + If present, change the security context of the ``/srv/node`` directory to + the lowest security level (s0) for the ``swift_data_t`` type, ``object_r`` + role and the ``system_u`` user: + + .. code-block:: console + + # chcon -R system_u:object_r:swift_data_t:s0 /srv/node + +#. Source the ``demo`` credentials: + + .. code-block:: console + + $ . demo-openrc + +#. Show the service status: + + .. code-block:: console + + $ swift stat + Account: AUTH_ed0b60bf607743088218b0a533d5943f + Containers: 0 + Objects: 0 + Bytes: 0 + X-Account-Project-Domain-Id: default + X-Timestamp: 1444143887.71539 + X-Trans-Id: tx1396aeaf17254e94beb34-0056143bde + X-Openstack-Request-Id: tx1396aeaf17254e94beb34-0056143bde + Content-Type: text/plain; charset=utf-8 + Accept-Ranges: bytes + +#. Create ``container1`` container: + + .. code-block:: console + + $ openstack container create container1 + +---------------------------------------+------------+------------------------------------+ + | account | container | x-trans-id | + +---------------------------------------+------------+------------------------------------+ + | AUTH_ed0b60bf607743088218b0a533d5943f | container1 | tx8c4034dc306c44dd8cd68-0056f00a4a | + +---------------------------------------+------------+------------------------------------+ + +#. Upload a test file to the ``container1`` container: + + .. 
code-block:: console + + $ openstack object create container1 FILE + +--------+------------+----------------------------------+ + | object | container | etag | + +--------+------------+----------------------------------+ + | FILE | container1 | ee1eca47dc88f4879d8a229cc70a07c6 | + +--------+------------+----------------------------------+ + + Replace ``FILE`` with the name of a local file to upload to the + ``container1`` container. + +#. List files in the ``container1`` container: + + .. code-block:: console + + $ openstack object list container1 + +------+ + | Name | + +------+ + | FILE | + +------+ + +#. Download a test file from the ``container1`` container: + + .. code-block:: console + + $ openstack object save container1 FILE + + Replace ``FILE`` with the name of the file uploaded to the + ``container1`` container. + + .. note:: + + This command provides no output. diff --git a/doc/source/logs.rst b/doc/source/logs.rst new file mode 100644 index 0000000000..16bc6f0da7 --- /dev/null +++ b/doc/source/logs.rst @@ -0,0 +1,198 @@ +==== +Logs +==== + +Swift has quite verbose logging, and the generated logs can be used for +cluster monitoring, utilization calculations, audit records, and more. As an +overview, Swift's logs are sent to syslog and organized by log level and +syslog facility. All log lines related to the same request have the same +transaction id. This page documents the log formats used in the system. + +.. note:: + + By default, Swift will log full log lines. However, with the + ``log_max_line_length`` setting and depending on your logging server + software, lines may be truncated or shortened. With ``log_max_line_length < + 7``, the log line will be truncated. With ``log_max_line_length >= 7``, the + log line will be "shortened": about half the max length followed by " ... " + followed by the other half the max length. Unless you use exceptionally + short values, you are unlikely to run across this with the following + documented log lines, but you may see it with debugging and error log + lines. + +---------- +Proxy Logs +---------- + +The proxy logs contain the record of all external API requests made to the +proxy server. Swift's proxy servers log requests using a custom format +designed to provide robust information and simple processing. It is possible +to change this format with the ``log_msg_template`` config parameter. +The default log format is:: + + {client_ip} {remote_addr} {end_time.datetime} {method} {path} {protocol} + {status_int} {referer} {user_agent} {auth_token} {bytes_recvd} + {bytes_sent} {client_etag} {transaction_id} {headers} {request_time} + {source} {log_info} {start_time} {end_time} {policy_index} + {access_user_id} + +Some keywords, signaled by the (anonymizable) flag, can be anonymized by +using the transformer 'anonymized'. The data are applied the hashing method of +``log_anonymization_method`` and an optional salt ``log_anonymization_salt``. + +Some keywords, signaled by the (timestamp) flag, can be converted to standard +dates formats using the matching transformers: 'datetime', 'asctime' or +'iso8601'. Other transformers for timestamps are 's', 'ms', 'us' and 'ns' for +seconds, milliseconds, microseconds and nanoseconds. Python's strftime +directives can also be used as tranformers (a, A, b, B, c, d, H, I, j, m, M, p, +S, U, w, W, x, X, y, Y, Z). 
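The strftime-style transformers amount to applying the corresponding directive to the time carried by the raw timestamp. The following standalone sketch (an illustration only, not Swift's implementation) shows the idea:

.. code-block:: python

   import time

   end_time = 1444143887.71539        # an example high-resolution Unix timestamp
   gm = time.gmtime(end_time)         # rendered in UTC for this illustration

   # {end_time.H}:{end_time.M} -- single strftime directives, as in the
   # example template below
   print(time.strftime('%H', gm) + ':' + time.strftime('%M', gm))   # 15:04

   # {end_time.iso8601} -- roughly an ISO 8601 rendering of the same instant
   print(time.strftime('%Y-%m-%dT%H:%M:%S', gm))                    # 2015-10-06T15:04:47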
+ +Example:: + + {client_ip.anonymized} {remote_addr.anonymized} {start_time.iso8601} + {end_time.H}:{end_time.M} {method} acc:{account} cnt:{container} + obj:{object.anonymized} + +=================== ========================================================== +**Log Field** **Value** +------------------- ---------------------------------------------------------- +client_ip Swift's guess at the end-client IP, taken from various + headers in the request. (anonymizable) +remote_addr The IP address of the other end of the TCP connection. + (anonymizable) +end_time Timestamp of the request. (timestamp) +method The HTTP verb in the request. +domain The domain in the request. (anonymizable) +path The path portion of the request. (anonymizable) +protocol The transport protocol used (currently one of http or + https). +status_int The response code for the request. +referer The value of the HTTP Referer header. (anonymizable) +user_agent The value of the HTTP User-Agent header. (anonymizable) +auth_token The value of the auth token. This may be truncated or + otherwise obscured. +bytes_recvd The number of bytes read from the client for this request. +bytes_sent The number of bytes sent to the client in the body of the + response. This is how many bytes were yielded to the WSGI + server. +client_etag The etag header value given by the client. (anonymizable) +transaction_id The transaction id of the request. +headers The headers given in the request. (anonymizable) +request_time The duration of the request. +source The "source" of the request. This may be set for requests + that are generated in order to fulfill client requests, + e.g. bulk uploads. +log_info Various info that may be useful for diagnostics, e.g. the + value of any x-delete-at header. +start_time High-resolution timestamp from the start of the request. + (timestamp) +end_time High-resolution timestamp from the end of the request. + (timestamp) +ttfb Duration between the request and the first bytes is sent. +policy_index The value of the storage policy index. +account The account part extracted from the path of the request. + (anonymizable) +container The container part extracted from the path of the request. + (anonymizable) +object The object part extracted from the path of the request. + (anonymizable) +pid PID of the process emitting the log line. +wire_status_int The status sent to the client, which may be different than + the logged response code if there was an error during the + body of the request or a disconnect. +access_user_id The user ID for logging. Middlewares should set + environ['swift.access_logging']['user_id'] to identify the user + for logging purposes. For S3 API requests, this contains the S3 + access key ID. Other auth middlewares should set user-specific + identifiers. For requests without auth middleware support, this + field will be "-". +=================== ========================================================== + +In one log line, all of the above fields are space-separated and url-encoded. +If any value is empty, it will be logged as a "-". This allows for simple +parsing by splitting each line on whitespace. New values may be placed at the +end of the log line from time to time, but the order of the existing values +will not change. Swift log processing utilities should look for the first N +fields they require (e.g. in Python using something like +``log_line.split()[:14]`` to get up through the transaction id). + +.. 
note:: + + Some log fields (like the request path) are already url quoted, so the + logged value will be double-quoted. For example, if a client uploads an + object name with a ``:`` in it, it will be url-quoted as ``%3A``. The log + module will then quote this value as ``%253A``. + +Swift Source +============ + +The ``source`` value in the proxy logs is used to identify the originator of a +request in the system. For example, if the client initiates a bulk upload, the +proxy server may end up doing many requests. The initial bulk upload request +will be logged as normal, but all of the internal "child requests" will have a +source value indicating they came from the bulk functionality. + +======================= ============================= +**Logged Source Value** **Originator of the Request** +----------------------- ----------------------------- +FP :ref:`formpost` +SLO :ref:`static-large-objects` +SW :ref:`staticweb` +TU :ref:`tempurl` +BD :ref:`bulk` (delete) +EA :ref:`bulk` (extract) +AQ :ref:`account-quotas` +CQ :ref:`container-quotas` +CS :ref:`container-sync` +TA :ref:`common_tempauth` +DLO :ref:`dynamic-large-objects` +LE :ref:`list_endpoints` +KS :ref:`keystoneauth` +RL :ref:`ratelimit` +RO :ref:`read_only` +VW :ref:`versioned_writes` +SSC :ref:`copy` +SYM :ref:`symlink` +SH :ref:`sharding_doc` +S3 :ref:`s3api` +OV :ref:`object_versioning` +EQ :ref:`etag_quoter` +======================= ============================= + + +----------------- +Storage Node Logs +----------------- + +Swift's account, container, and object server processes each log requests +that they receive, if they have been configured to do so with the +``log_requests`` config parameter (which defaults to true). The format for +these log lines is:: + + remote_addr - - [datetime] "request_method request_path" status_int + content_length "referer" "transaction_id" "user_agent" request_time + additional_info server_pid policy_index + +=================== ========================================================== +**Log Field** **Value** +------------------- ---------------------------------------------------------- +remote_addr The IP address of the other end of the TCP connection. +datetime Timestamp of the request, in + "day/month/year:hour:minute:second +0000" format. +request_method The HTTP verb in the request. +request_path The path portion of the request. +status_int The response code for the request. +content_length The value of the Content-Length header in the response. +referer The value of the HTTP Referer header. +transaction_id The transaction id of the request. +user_agent The value of the HTTP User-Agent header. Swift services + report a user-agent string of the service name followed by + the process ID, such as ``"proxy-server "`` or ``"object-updater "``. +request_time The time between request received and response started. + **Note**: This includes transfer time on PUT, but not GET. +additional_info Additional useful information. +server_pid The process id of the server +policy_index The value of the storage policy index. 
+=================== ========================================================== diff --git a/doc/source/metrics/account_auditor.rst b/doc/source/metrics/account_auditor.rst new file mode 100644 index 0000000000..83fbd74408 --- /dev/null +++ b/doc/source/metrics/account_auditor.rst @@ -0,0 +1,12 @@ +``account-auditor`` Metrics +=========================== + +============================ ========================================================= +Metric Name Description +---------------------------- --------------------------------------------------------- +``account-auditor.errors`` Count of audit runs (across all account databases) which + caught an Exception. +``account-auditor.passes`` Count of individual account databases which passed audit. +``account-auditor.failures`` Count of individual account databases which failed audit. +``account-auditor.timing`` Timing data for individual account database audits. +============================ ========================================================= diff --git a/doc/source/metrics/account_reaper.rst b/doc/source/metrics/account_reaper.rst new file mode 100644 index 0000000000..a95f8b2c12 --- /dev/null +++ b/doc/source/metrics/account_reaper.rst @@ -0,0 +1,25 @@ +``account-reaper`` Metrics +========================== + +================================================ ==================================================== +Metric Name Description +------------------------------------------------ ---------------------------------------------------- +``account-reaper.errors`` Count of devices failing the mount check. +``account-reaper.timing`` Timing data for each reap_account() call. +``account-reaper.return_codes.X`` Count of HTTP return codes from various operations + (e.g. object listing, container deletion, etc.). The + value for X is the first digit of the return code + (2 for 201, 4 for 404, etc.). +``account-reaper.containers_failures`` Count of failures to delete a container. +``account-reaper.containers_deleted`` Count of containers successfully deleted. +``account-reaper.containers_remaining`` Count of containers which failed to delete with + zero successes. +``account-reaper.containers_possibly_remaining`` Count of containers which failed to delete with + at least one success. +``account-reaper.objects_failures`` Count of failures to delete an object. +``account-reaper.objects_deleted`` Count of objects successfully deleted. +``account-reaper.objects_remaining`` Count of objects which failed to delete with zero + successes. +``account-reaper.objects_possibly_remaining`` Count of objects which failed to delete with at + least one success. +================================================ ==================================================== diff --git a/doc/source/metrics/account_replicator.rst b/doc/source/metrics/account_replicator.rst new file mode 100644 index 0000000000..19127953b0 --- /dev/null +++ b/doc/source/metrics/account_replicator.rst @@ -0,0 +1,31 @@ +``account-replicator`` Metrics +============================== + +======================================= ==================================================== +Metric Name Description +--------------------------------------- ---------------------------------------------------- +``account-replicator.diffs`` Count of syncs handled by sending differing rows. +``account-replicator.diff_caps`` Count of "diffs" operations which failed because + "max_diffs" was hit. +``account-replicator.no_changes`` Count of accounts found to be in sync. 
+``account-replicator.hashmatches`` Count of accounts found to be in sync via hash + comparison (``broker.merge_syncs`` was called). +``account-replicator.rsyncs`` Count of completely missing accounts which were sent + via rsync. +``account-replicator.remote_merges`` Count of syncs handled by sending entire database + via rsync. +``account-replicator.attempts`` Count of database replication attempts. +``account-replicator.failures`` Count of database replication attempts which failed + due to corruption (quarantined) or inability to read + as well as attempts to individual nodes which + failed. +``account-replicator.removes.`` Count of databases on deleted because the + delete_timestamp was greater than the put_timestamp + and the database had no rows or because it was + successfully sync'ed to other locations and doesn't + belong here anymore. +``account-replicator.successes`` Count of replication attempts to an individual node + which were successful. +``account-replicator.timing`` Timing data for each database replication attempt + not resulting in a failure. +======================================= ==================================================== diff --git a/doc/source/metrics/account_server.rst b/doc/source/metrics/account_server.rst new file mode 100644 index 0000000000..6bf75f0558 --- /dev/null +++ b/doc/source/metrics/account_server.rst @@ -0,0 +1,37 @@ +``account-server`` Metrics +========================== + +..note:: + "Not Found" is not considered an error and requests + which increment ``errors`` are not included in the timing data. + +========================================== ======================================================= +Metric Name Description +------------------------------------------ ------------------------------------------------------- +``account-server.DELETE.errors.timing`` Timing data for each DELETE request resulting in an + error: bad request, not mounted, missing timestamp. +``account-server.DELETE.timing`` Timing data for each DELETE request not resulting in + an error. +``account-server.PUT.errors.timing`` Timing data for each PUT request resulting in an error: + bad request, not mounted, conflict, recently-deleted. +``account-server.PUT.timing`` Timing data for each PUT request not resulting in an + error. +``account-server.HEAD.errors.timing`` Timing data for each HEAD request resulting in an + error: bad request, not mounted. +``account-server.HEAD.timing`` Timing data for each HEAD request not resulting in + an error. +``account-server.GET.errors.timing`` Timing data for each GET request resulting in an + error: bad request, not mounted, bad delimiter, + account listing limit too high, bad accept header. +``account-server.GET.timing`` Timing data for each GET request not resulting in + an error. +``account-server.REPLICATE.errors.timing`` Timing data for each REPLICATE request resulting in an + error: bad request, not mounted. +``account-server.REPLICATE.timing`` Timing data for each REPLICATE request not resulting + in an error. +``account-server.POST.errors.timing`` Timing data for each POST request resulting in an + error: bad request, bad or missing timestamp, not + mounted. +``account-server.POST.timing`` Timing data for each POST request not resulting in + an error. 
+========================================== ======================================================= diff --git a/doc/source/metrics/all.rst b/doc/source/metrics/all.rst new file mode 100644 index 0000000000..bca1c10870 --- /dev/null +++ b/doc/source/metrics/all.rst @@ -0,0 +1,24 @@ +:orphan: + +All Statsd Metrics +================== + +.. include:: account_auditor.rst +.. include:: account_reaper.rst +.. include:: account_server.rst +.. include:: account_replicator.rst + +.. include:: container_auditor.rst +.. include:: container_replicator.rst +.. include:: container_server.rst +.. include:: container_sync.rst +.. include:: container_updater.rst + +.. include:: object_auditor.rst +.. include:: object_expirer.rst +.. include:: object_reconstructor.rst +.. include:: object_replicator.rst +.. include:: object_server.rst +.. include:: object_updater.rst + +.. include:: proxy_server.rst diff --git a/doc/source/metrics/container_auditor.rst b/doc/source/metrics/container_auditor.rst new file mode 100644 index 0000000000..4054337098 --- /dev/null +++ b/doc/source/metrics/container_auditor.rst @@ -0,0 +1,12 @@ +``container-auditor`` Metrics +============================= + +============================== ==================================================== +Metric Name Description +------------------------------ ---------------------------------------------------- +``container-auditor.errors`` Incremented when an Exception is caught in an audit + pass (only once per pass, max). +``container-auditor.passes`` Count of individual containers passing an audit. +``container-auditor.failures`` Count of individual containers failing an audit. +``container-auditor.timing`` Timing data for each container audit. +============================== ==================================================== diff --git a/doc/source/metrics/container_replicator.rst b/doc/source/metrics/container_replicator.rst new file mode 100644 index 0000000000..c575190638 --- /dev/null +++ b/doc/source/metrics/container_replicator.rst @@ -0,0 +1,31 @@ +``container-replicator`` Metrics +================================ + +========================================= ==================================================== +Metric Name Description +----------------------------------------- ---------------------------------------------------- +``container-replicator.diffs`` Count of syncs handled by sending differing rows. +``container-replicator.diff_caps`` Count of "diffs" operations which failed because + "max_diffs" was hit. +``container-replicator.no_changes`` Count of containers found to be in sync. +``container-replicator.hashmatches`` Count of containers found to be in sync via hash + comparison (``broker.merge_syncs`` was called). +``container-replicator.rsyncs`` Count of completely missing containers where were sent + via rsync. +``container-replicator.remote_merges`` Count of syncs handled by sending entire database + via rsync. +``container-replicator.attempts`` Count of database replication attempts. +``container-replicator.failures`` Count of database replication attempts which failed + due to corruption (quarantined) or inability to read + as well as attempts to individual nodes which + failed. +``container-replicator.removes.`` Count of databases deleted on because the + delete_timestamp was greater than the put_timestamp + and the database had no rows or because it was + successfully sync'ed to other locations and doesn't + belong here anymore. 
+``container-replicator.successes`` Count of replication attempts to an individual node + which were successful. +``container-replicator.timing`` Timing data for each database replication attempt + not resulting in a failure. +========================================= ==================================================== diff --git a/doc/source/metrics/container_server.rst b/doc/source/metrics/container_server.rst new file mode 100644 index 0000000000..a5c41daf87 --- /dev/null +++ b/doc/source/metrics/container_server.rst @@ -0,0 +1,35 @@ +``container-server`` Metrics +============================ + +.. note:: + "Not Found" is not considered an error and requests + which increment ``errors`` are not included in the timing data. + +============================================ ==================================================== +Metric Name Description +-------------------------------------------- ---------------------------------------------------- +``container-server.DELETE.errors.timing`` Timing data for DELETE request errors: bad request, + not mounted, missing timestamp, conflict. +``container-server.DELETE.timing`` Timing data for each DELETE request not resulting in + an error. +``container-server.PUT.errors.timing`` Timing data for PUT request errors: bad request, + missing timestamp, not mounted, conflict. +``container-server.PUT.timing`` Timing data for each PUT request not resulting in an + error. +``container-server.HEAD.errors.timing`` Timing data for HEAD request errors: bad request, + not mounted. +``container-server.HEAD.timing`` Timing data for each HEAD request not resulting in + an error. +``container-server.GET.errors.timing`` Timing data for GET request errors: bad request, + not mounted, parameters not utf8, bad accept header. +``container-server.GET.timing`` Timing data for each GET request not resulting in + an error. +``container-server.REPLICATE.errors.timing`` Timing data for REPLICATE request errors: bad + request, not mounted. +``container-server.REPLICATE.timing`` Timing data for each REPLICATE request not resulting + in an error. +``container-server.POST.errors.timing`` Timing data for POST request errors: bad request, + bad x-container-sync-to, not mounted. +``container-server.POST.timing`` Timing data for each POST request not resulting in + an error. +============================================ ==================================================== diff --git a/doc/source/metrics/container_sync.rst b/doc/source/metrics/container_sync.rst new file mode 100644 index 0000000000..e36a15fef1 --- /dev/null +++ b/doc/source/metrics/container_sync.rst @@ -0,0 +1,18 @@ +``container-sync`` Metrics +========================== + +================================= ==================================================== +Metric Name Description +--------------------------------- ---------------------------------------------------- +``container-sync.skips`` Count of containers skipped because they don't have + sync'ing enabled. +``container-sync.failures`` Count of failures sync'ing of individual containers. +``container-sync.syncs`` Count of individual containers sync'ed successfully. +``container-sync.deletes`` Count of container database rows sync'ed by + deletion. +``container-sync.deletes.timing`` Timing data for each container database row + synchronization via deletion. +``container-sync.puts`` Count of container database rows sync'ed by Putting. +``container-sync.puts.timing`` Timing data for each container database row + synchronization via Putting. 
+================================= ==================================================== diff --git a/doc/source/metrics/container_updater.rst b/doc/source/metrics/container_updater.rst new file mode 100644 index 0000000000..d498266f21 --- /dev/null +++ b/doc/source/metrics/container_updater.rst @@ -0,0 +1,17 @@ +``container-updater`` Metrics +============================= + +================================ ==================================================== +Metric Name Description +-------------------------------- ---------------------------------------------------- +``container-updater.successes`` Count of containers which successfully updated their + account. +``container-updater.failures`` Count of containers which failed to update their + account. +``container-updater.no_changes`` Count of containers which didn't need to update + their account. +``container-updater.timing`` Timing data for processing a container; only + includes timing for containers which needed to + update their accounts (i.e. "successes" and + "failures" but not "no_changes"). +================================ ==================================================== diff --git a/doc/source/metrics/labels.rst b/doc/source/metrics/labels.rst new file mode 100644 index 0000000000..6e85254167 --- /dev/null +++ b/doc/source/metrics/labels.rst @@ -0,0 +1,68 @@ +:orphan: + +Labeled Metrics +=============== + +.. note:: + Labeled metrics are still an experimental feature. This document contains + forward looking statements that anticipate future development of labeled + metrics support. In particular, metric names and labels may be subject to + change as we explore the space. + +.. warning:: + Enabling labeled metrics will likely cause a dramatic increase in the number + of distinct metrics time series. Ensure your metrics pipeline is prepared. + +Recent versions of Swift emit StatsD metrics with explicit application-defined +labels, rather than relying on consumers knowing how to unpack the legacy label +names. A variety of StatsD extension formats are available, many of which are +parsed by `statsd_exporter `__: + +- ``librato`` +- ``influxdb`` +- ``dogstatsd`` +- ``graphite`` + +See the ``proxy-server.conf-sample`` file for more information on configuring +labeled metrics. + +Labeled metrics are emitted in addition to legacy StatsD metrics. However, +legacy StatsD metrics can be disabled by setting the ``statsd_emit_legacy`` +option to ``False``. This is not recommended until more legacy metrics have +been supplemented with equivalent labeled metrics. + +As various Swift middlewares, services and daemons are upgraded to emit labeled +metrics, they will be documented in the relevant section of the :doc:`all` +page. + +Common Labels +------------- + +Each labeled metric may have its own unique labels, but many labeled metrics +will use some or all of a common set of labels. The common labels are +documented here for information purposes, but the authoritative set of labels +for each metric can be found in the sections of the :doc:`all` page. + +.. table:: + :align: left + + ================ ========================================================== + Label Name Value + ---------------- ---------------------------------------------------------- + ``resource`` The type of resource associated with the metric + i.e. ``account``, ``container`` or ``object``. + ``account`` The quoted account name associated with the metric. + ``container`` The quoted container name associated with the metric. 
+ ``policy`` The storage policy index associated with the metric. + ``status`` The status int of an HTTP response associated with the + metric. + ``method`` The method of an HTTP request associated with the metric. + ================ ========================================================== + + +.. note:: + Note that metrics will *not* have labels that would likely have a very high + cardinality of values, such as object names, as this is expected to be + problematic for metrics collectors. Nevertheless, some operators may still + need to drop labels such as ``container`` in order to keep metric + cardinalities reasonable. diff --git a/doc/source/metrics/object_auditor.rst b/doc/source/metrics/object_auditor.rst new file mode 100644 index 0000000000..4baa040075 --- /dev/null +++ b/doc/source/metrics/object_auditor.rst @@ -0,0 +1,13 @@ +``object-auditor`` Metrics +========================== + +============================== ==================================================== +Metric Name Description +------------------------------ ---------------------------------------------------- +``object-auditor.quarantines`` Count of objects failing audit and quarantined. +``object-auditor.errors`` Count of errors encountered while auditing objects. +``object-auditor.timing`` Timing data for each object audit (does not include + any rate-limiting sleep time for + max_files_per_second, but does include rate-limiting + sleep time for max_bytes_per_second). +============================== ==================================================== diff --git a/doc/source/metrics/object_expirer.rst b/doc/source/metrics/object_expirer.rst new file mode 100644 index 0000000000..71e725ec8f --- /dev/null +++ b/doc/source/metrics/object_expirer.rst @@ -0,0 +1,12 @@ +``object-expirer`` Metrics +========================== + +========================== ==================================================== +Metric Name Description +-------------------------- ---------------------------------------------------- +``object-expirer.objects`` Count of objects expired. +``object-expirer.errors`` Count of errors encountered while attempting to + expire an object. +``object-expirer.timing`` Timing data for each object expiration attempt, + including ones resulting in an error. +========================== ==================================================== diff --git a/doc/source/metrics/object_reconstructor.rst b/doc/source/metrics/object_reconstructor.rst new file mode 100644 index 0000000000..66ec0bec8b --- /dev/null +++ b/doc/source/metrics/object_reconstructor.rst @@ -0,0 +1,25 @@ +``object-reconstructor`` Metrics +================================ + +======================================================== ====================================================== +Metric Name Description +-------------------------------------------------------- ------------------------------------------------------ +``object-reconstructor.partition.delete.count.`` A count of partitions on which were + reconstructed and synced to another node because they + didn't belong on this node. This metric is tracked + per-device to allow for "quiescence detection" for + object reconstruction activity on each device. +``object-reconstructor.partition.delete.timing`` Timing data for partitions reconstructed and synced to + another node because they didn't belong on this node. + This metric is not tracked per device. 
+``object-reconstructor.partition.update.count.`` A count of partitions on which were + reconstructed and synced to another node, but also + belong on this node. As with delete.count, this metric + is tracked per-device. +``object-reconstructor.partition.update.timing`` Timing data for partitions reconstructed which also + belong on this node. This metric is not tracked + per-device. +``object-reconstructor.suffix.hashes`` Count of suffix directories whose hash (of filenames) + was recalculated. +``object-reconstructor.suffix.syncs`` Count of suffix directories reconstructed with ssync. +======================================================== ====================================================== diff --git a/doc/source/metrics/object_replicator.rst b/doc/source/metrics/object_replicator.rst new file mode 100644 index 0000000000..934c1c3f5a --- /dev/null +++ b/doc/source/metrics/object_replicator.rst @@ -0,0 +1,25 @@ +``object-replicator`` Metrics +============================= + +===================================================== ==================================================== +Metric Name Description +----------------------------------------------------- ---------------------------------------------------- +``object-replicator.partition.delete.count.`` A count of partitions on which were + replicated to another node because they didn't + belong on this node. This metric is tracked + per-device to allow for "quiescence detection" for + object replication activity on each device. +``object-replicator.partition.delete.timing`` Timing data for partitions replicated to another + node because they didn't belong on this node. This + metric is not tracked per device. +``object-replicator.partition.update.count.`` A count of partitions on which were + replicated to another node, but also belong on this + node. As with delete.count, this metric is tracked + per-device. +``object-replicator.partition.update.timing`` Timing data for partitions replicated which also + belong on this node. This metric is not tracked + per-device. +``object-replicator.suffix.hashes`` Count of suffix directories whose hash (of filenames) + was recalculated. +``object-replicator.suffix.syncs`` Count of suffix directories replicated with rsync. +===================================================== ==================================================== diff --git a/doc/source/metrics/object_server.rst b/doc/source/metrics/object_server.rst new file mode 100644 index 0000000000..ef4c5e07ad --- /dev/null +++ b/doc/source/metrics/object_server.rst @@ -0,0 +1,49 @@ +``object-server`` Metrics +========================= + +========================================= ==================================================== +Metric Name Description +----------------------------------------- ---------------------------------------------------- +``object-server.quarantines`` Count of objects (files) found bad and moved to + quarantine. +``object-server.async_pendings`` Count of container updates saved as async_pendings + (may result from PUT or DELETE requests). +``object-server.POST.errors.timing`` Timing data for POST request errors: bad request, + missing timestamp, delete-at in past, not mounted. +``object-server.POST.timing`` Timing data for each POST request not resulting in + an error. +``object-server.PUT.errors.timing`` Timing data for PUT request errors: bad request, + not mounted, missing timestamp, object creation + constraint violation, delete-at in past. 
+``object-server.PUT.timeouts`` Count of object PUTs which exceeded max_upload_time. +``object-server.PUT.timing`` Timing data for each PUT request not resulting in an + error. +``object-server.PUT..timing`` Timing data per kB transferred (ms/kB) for each + non-zero-byte PUT request on each device. + Monitoring problematic devices, higher is bad. +``object-server.GET.errors.timing`` Timing data for GET request errors: bad request, + not mounted, header timestamps before the epoch, + precondition failed. + File errors resulting in a quarantine are not + counted here. +``object-server.GET.timing`` Timing data for each GET request not resulting in an + error. Includes requests which couldn't find the + object (including disk errors resulting in file + quarantine). +``object-server.HEAD.errors.timing`` Timing data for HEAD request errors: bad request, + not mounted. +``object-server.HEAD.timing`` Timing data for each HEAD request not resulting in + an error. Includes requests which couldn't find the + object (including disk errors resulting in file + quarantine). +``object-server.DELETE.errors.timing`` Timing data for DELETE request errors: bad request, + missing timestamp, not mounted, precondition + failed. Includes requests which couldn't find or + match the object. +``object-server.DELETE.timing`` Timing data for each DELETE request not resulting + in an error. +``object-server.REPLICATE.errors.timing`` Timing data for REPLICATE request errors: bad + request, not mounted. +``object-server.REPLICATE.timing`` Timing data for each REPLICATE request not resulting + in an error. +========================================= ==================================================== diff --git a/doc/source/metrics/object_updater.rst b/doc/source/metrics/object_updater.rst new file mode 100644 index 0000000000..ca223e4994 --- /dev/null +++ b/doc/source/metrics/object_updater.rst @@ -0,0 +1,22 @@ +``object-updater`` Metrics +========================== + +============================== ==================================================== +Metric Name Description +------------------------------ ---------------------------------------------------- +``object-updater.errors`` Count of drives not mounted or async_pending files + with an unexpected name. +``object-updater.timing`` Timing data for object sweeps to flush async_pending + container updates. Does not include object sweeps + which did not find an existing async_pending storage + directory. +``object-updater.quarantines`` Count of async_pending container updates which were + corrupted and moved to quarantine. +``object-updater.successes`` Count of successful container updates. +``object-updater.failures`` Count of failed container updates. +``object-updater.unlinks`` Count of async_pending files unlinked. An + async_pending file is unlinked either when it is + successfully processed or when the replicator sees + that there is a newer async_pending file for the + same object. +============================== ==================================================== diff --git a/doc/source/metrics/proxy_server.rst b/doc/source/metrics/proxy_server.rst new file mode 100644 index 0000000000..3e72f426f7 --- /dev/null +++ b/doc/source/metrics/proxy_server.rst @@ -0,0 +1,113 @@ +``proxy-server`` Metrics +======================== + +In the table, ```` is the proxy-server controller responsible for the +request and will be one of ``account``, ``container``, or ``object``. 
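+
+Consumers of these legacy metrics have to know how to unpack the
+dot-delimited names themselves (see the labeled-metrics documentation
+above). As an illustration only (the helper below is hypothetical and not
+part of Swift), such a consumer might split a legacy timing metric name
+roughly as follows, assuming the naming conventions documented in the
+tables below:
+
+.. code:: python
+
+   def parse_proxy_timing_metric(name):
+       """Split e.g. 'proxy-server.object.GET.200.timing' into its parts."""
+       parts = name.split('.')
+       if (len(parts) != 5 or parts[0] != 'proxy-server'
+               or parts[-1] != 'timing'):
+           raise ValueError('not a proxy-server timing metric: %r' % name)
+       _, controller, verb, status, _ = parts
+       return {'controller': controller, 'verb': verb, 'status': int(status)}
+
+   # e.g. {'controller': 'object', 'verb': 'GET', 'status': 200}
+   print(parse_proxy_timing_metric('proxy-server.object.GET.200.timing'))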
+ +========================================== ==================================================== +Metric Name Description +------------------------------------------ ---------------------------------------------------- +``proxy-server.errors`` Count of errors encountered while serving requests + before the controller type is determined. Includes + invalid Content-Length, errors finding the internal + controller to handle the request, invalid utf8, and + bad URLs. +``proxy-server..handoff_count`` Count of node hand-offs; only tracked if log_handoffs + is set in the proxy-server config. +``proxy-server..handoff_all_count`` Count of times *only* hand-off locations were + utilized; only tracked if log_handoffs is set in the + proxy-server config. +``proxy-server..client_timeouts`` Count of client timeouts (client did not read within + ``client_timeout`` seconds during a GET or did not + supply data within ``client_timeout`` seconds during + a PUT). +``proxy-server..client_disconnects`` Count of detected client disconnects during PUT + operations (does NOT include caught Exceptions in + the proxy-server which caused a client disconnect). +========================================== ==================================================== + +Additionally, middleware often emit their own metrics + +``proxy-logging`` Middleware +---------------------------- + +In the table, ```` is either the proxy-server controller responsible +for the request: ``account``, ``container``, ``object``, or the string +``SOS`` if the request came from the `Swift Origin Server`_ middleware. +The ```` portion will be one of ``GET``, ``HEAD``, ``POST``, ``PUT``, +``DELETE``, ``COPY``, ``OPTIONS``, or ``BAD_METHOD``. The list of valid +HTTP methods is configurable via the ``log_statsd_valid_http_methods`` +config variable and the default setting yields the above behavior. + +.. _Swift Origin Server: https://github.com/dpgoetz/sos + +====================================================== ============================================ +Metric Name Description +------------------------------------------------------ -------------------------------------------- +``proxy-server....timing`` Timing data for requests, start to finish. + The portion is the numeric HTTP + status code for the request (e.g. "200" or + "404"). +``proxy-server..GET..first-byte.timing`` Timing data up to completion of sending the + response headers (only for GET requests). + and are as for the main + timing metric. +``proxy-server....xfer`` This counter metric is the sum of bytes + transferred in (from clients) and out (to + clients) for requests. The , , + and portions of the metric are just + like the main timing metric. +====================================================== ============================================ + +The ``proxy-logging`` middleware also groups these metrics by policy. The +```` portion represents a policy index: + +============================================================================ ===================================== +Metric Name Description +---------------------------------------------------------------------------- ------------------------------------- +``proxy-server.object.policy....timing`` Timing data for requests, aggregated + by policy index. +``proxy-server.object.policy..GET..first-byte.timing`` Timing data up to completion of + sending the response headers, + aggregated by policy index. +``proxy-server.object.policy....xfer`` Sum of bytes transferred in and out, + aggregated by policy index. 
+============================================================================ ===================================== + +``tempauth`` Middleware +----------------------- +In the table, ```` represents the actual configured +reseller_prefix or ``NONE`` if the reseller_prefix is the empty string: + +=========================================== ==================================================== +Metric Name Description +------------------------------------------- ---------------------------------------------------- +``tempauth..unauthorized`` Count of regular requests which were denied with + HTTPUnauthorized. +``tempauth..forbidden`` Count of regular requests which were denied with + HTTPForbidden. +``tempauth..token_denied`` Count of token requests which were denied. +``tempauth..errors`` Count of errors. +=========================================== ==================================================== + +``tempurl`` Middleware +---------------------- + +========================================== ==================================================== +Metric Name Description +------------------------------------------ ---------------------------------------------------- +``proxy-server.tempurl.digests.`` Count of requests authorized using the specified + ````; may be one of ``sha1``, ``sha256``, + or ``sha512``. +========================================== ==================================================== + +``formpost`` Middleware +----------------------- + +========================================== ==================================================== +Metric Name Description +------------------------------------------ ---------------------------------------------------- +``proxy-server.formpost.digests.`` Count of requests authorized using the specified + ````; may be one of ``sha1``, ``sha256``, + or ``sha512``. +========================================== ==================================================== diff --git a/doc/source/middleware.rst b/doc/source/middleware.rst new file mode 100644 index 0000000000..ee2e3dc674 --- /dev/null +++ b/doc/source/middleware.rst @@ -0,0 +1,404 @@ +.. _common_middleware: + +********** +Middleware +********** + +.. _account-quotas: + +Account Quotas +============== + +.. automodule:: swift.common.middleware.account_quotas + :members: + :show-inheritance: + +.. _s3api: + +AWS S3 Api +========== + +.. automodule:: swift.common.middleware.s3api.s3api + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.s3token + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.s3request + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.s3response + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.exception + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.etree + :members: _Element + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.utils + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.subresource + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.acl_handlers + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.acl_utils + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.base + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.service + :members: + :show-inheritance: + +.. 
automodule:: swift.common.middleware.s3api.controllers.bucket + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.obj + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.acl + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.s3_acl + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.multi_upload + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.multi_delete + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.versioning + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.location + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.s3api.controllers.logging + :members: + :show-inheritance: + +Backend Ratelimit +================= + +.. automodule:: swift.common.middleware.backend_ratelimit + :members: + :show-inheritance: + +.. _bulk: + +Bulk Operations (Delete and Archive Auto Extraction) +==================================================== + +.. automodule:: swift.common.middleware.bulk + :members: + :show-inheritance: + +.. _catch_errors: + +CatchErrors +============= + +.. automodule:: swift.common.middleware.catch_errors + :members: + :show-inheritance: + +CNAME Lookup +============ + +.. automodule:: swift.common.middleware.cname_lookup + :members: + :show-inheritance: + +.. _container-quotas: + +Container Quotas +================ + +.. automodule:: swift.common.middleware.container_quotas + :members: + :show-inheritance: + +.. _container-sync: + +Container Sync Middleware +========================= + +.. automodule:: swift.common.middleware.container_sync + :members: + :show-inheritance: + +Cross Domain Policies +===================== + +.. automodule:: swift.common.middleware.crossdomain + :members: + :show-inheritance: + +.. _discoverability: + +Discoverability +=============== + +Swift will by default provide clients with an interface providing details +about the installation. Unless disabled (i.e ``expose_info=false`` in +:ref:`proxy-server-config`), a GET request to ``/info`` will return configuration +data in JSON format. An example response:: + + {"swift": {"version": "1.11.0"}, "staticweb": {}, "tempurl": {}} + +This would signify to the client that swift version 1.11.0 is running and that +staticweb and tempurl are available in this installation. + +There may be administrator-only information available via ``/info``. To +retrieve it, one must use an HMAC-signed request, similar to TempURL. +The signature may be produced like so:: + + swift tempurl GET 3600 /info secret 2>/dev/null | sed s/temp_url/swiftinfo/g + +Domain Remap +============ + +.. automodule:: swift.common.middleware.domain_remap + :members: + :show-inheritance: + +Dynamic Large Objects +===================== + +DLO support centers around a user specified filter that matches +segments and concatenates them together in object listing order. Please see +the DLO docs for :ref:`dlo-doc` further details. + +.. _encryption: + +Encryption +========== + +Encryption middleware should be deployed in conjunction with the +:ref:`keymaster` middleware. + +.. automodule:: swift.common.middleware.crypto + :members: + :show-inheritance: + +.. automodule:: swift.common.middleware.crypto.encrypter + :members: + :show-inheritance: + +.. 
automodule:: swift.common.middleware.crypto.decrypter + :members: + :show-inheritance: + +.. _etag_quoter: + +Etag Quoter +=========== + +.. automodule:: swift.common.middleware.etag_quoter + :members: + :show-inheritance: + +.. _formpost: + +FormPost +======== + +.. automodule:: swift.common.middleware.formpost + :members: + :show-inheritance: + +.. _gatekeeper: + +GateKeeper +========== + +.. automodule:: swift.common.middleware.gatekeeper + :members: + :show-inheritance: + +.. _healthcheck: + +Healthcheck +=========== + +.. automodule:: swift.common.middleware.healthcheck + :members: + :show-inheritance: + +.. _keymaster: + +Keymaster +========= + +Keymaster middleware should be deployed in conjunction with the +:ref:`encryption` middleware. + +.. automodule:: swift.common.middleware.crypto.keymaster + :members: + :show-inheritance: + +.. _keystoneauth: + +KeystoneAuth +============ + +.. automodule:: swift.common.middleware.keystoneauth + :members: + :show-inheritance: + +.. _list_endpoints: + +List Endpoints +============== + +.. automodule:: swift.common.middleware.list_endpoints + :members: + :show-inheritance: + +Memcache +======== + +.. automodule:: swift.common.middleware.memcache + :members: + :show-inheritance: + +Name Check (Forbidden Character Filter) +======================================= + +.. automodule:: swift.common.middleware.name_check + :members: + :show-inheritance: + +.. _object_versioning: + +Object Versioning +================= + +.. automodule:: swift.common.middleware.versioned_writes.object_versioning + :members: + :show-inheritance: + +Proxy Logging +============= + +.. automodule:: swift.common.middleware.proxy_logging + :members: + :show-inheritance: + +Ratelimit +========= + +.. automodule:: swift.common.middleware.ratelimit + :members: + :show-inheritance: + +.. _read_only: + +Read Only +========= + +.. automodule:: swift.common.middleware.read_only + :members: + :show-inheritance: + +.. _recon: + +Recon +===== + +.. automodule:: swift.common.middleware.recon + :members: + :show-inheritance: + +.. _copy: + +Server Side Copy +================ + +.. automodule:: swift.common.middleware.copy + :members: + :show-inheritance: + +Static Large Objects +==================== + +Please see +the SLO docs for :ref:`slo-doc` further details. + + +.. _staticweb: + +StaticWeb +========= + +.. automodule:: swift.common.middleware.staticweb + :members: + :show-inheritance: + +.. _symlink: + +Symlink +======= + +.. automodule:: swift.common.middleware.symlink + :members: + :show-inheritance: + +.. _common_tempauth: + +TempAuth +======== + +.. automodule:: swift.common.middleware.tempauth + :members: + :show-inheritance: + +.. _tempurl: + +TempURL +======= + +.. automodule:: swift.common.middleware.tempurl + :members: + :show-inheritance: + +.. _versioned_writes: + +Versioned Writes +================= + +.. automodule:: swift.common.middleware.versioned_writes.legacy + :members: + :show-inheritance: + +XProfile +============== + +.. automodule:: swift.common.middleware.xprofile + :members: + :show-inheritance: diff --git a/doc/source/misc.rst b/doc/source/misc.rst index 0c0d607267..1012b13bec 100644 --- a/doc/source/misc.rst +++ b/doc/source/misc.rst @@ -4,60 +4,60 @@ Misc **** -.. _exceptions: +.. _acls: -Exceptions -========== +ACLs +==== -.. automodule:: swift.common.exceptions +.. automodule:: swift.common.middleware.acl :members: - :undoc-members: :show-inheritance: -.. _constraints: +.. _buffered_http: -Constraints -=========== +Buffered HTTP +============= -.. 
automodule:: swift.common.constraints +.. automodule:: swift.common.bufferedhttp :members: - :undoc-members: :show-inheritance: -.. _utils: -Utils -===== +.. _config: -.. automodule:: swift.common.utils +Config +====== + +.. automodule:: swift.common.utils.config :members: :show-inheritance: -.. _common_tempauth: +.. _constraints: -TempAuth -======== +Constraints +=========== -.. automodule:: swift.common.middleware.tempauth +.. automodule:: swift.common.constraints :members: + :undoc-members: :show-inheritance: -.. _acls: - -ACLs -==== +Container Sync Realms +===================== -.. automodule:: swift.common.middleware.acl +.. automodule:: swift.common.container_sync_realms :members: :show-inheritance: -.. _wsgi: -WSGI -==== +.. _digest: -.. automodule:: swift.common.wsgi +Digest +====== + +.. automodule:: swift.common.digest :members: + :undoc-members: :show-inheritance: .. _direct_client: @@ -70,6 +70,16 @@ Direct Client :undoc-members: :show-inheritance: +.. _exceptions: + +Exceptions +========== + +.. automodule:: swift.common.exceptions + :members: + :undoc-members: + :show-inheritance: + .. _internal_client: Internal Client @@ -80,34 +90,39 @@ Internal Client :undoc-members: :show-inheritance: -.. _buffered_http: +.. _ipaddrs: -Buffered HTTP -============= +IPAddrs +======= -.. automodule:: swift.common.bufferedhttp +.. automodule:: swift.common.utils.ipaddrs :members: :show-inheritance: -.. _healthcheck: +.. _libc: -Healthcheck -=========== +Libc +==== -.. automodule:: swift.common.middleware.healthcheck +.. automodule:: swift.common.utils.libc :members: :show-inheritance: -.. _recon: +.. _logs: -Recon -=========== +Logs +==== -.. automodule:: swift.common.middleware.recon +.. automodule:: swift.common.utils.logs :members: :show-inheritance: -.. _memecached: +Manager +========= + +.. automodule:: swift.common.manager + :members: + :show-inheritance: MemCacheD ========= @@ -116,90 +131,86 @@ MemCacheD :members: :show-inheritance: -Manager -========= +.. _registry: -.. automodule:: swift.common.manager +Middleware Registry +=================== + +.. automodule:: swift.common.registry :members: + :undoc-members: :show-inheritance: -Ratelimit -========= +.. _request_helpers: + +Request Helpers +=============== -.. automodule:: swift.common.middleware.ratelimit +.. automodule:: swift.common.request_helpers :members: + :undoc-members: :show-inheritance: -StaticWeb -========= +.. _statsd_client: + +StatsdClient +============ -.. automodule:: swift.common.middleware.staticweb +.. automodule:: swift.common.statsd_client :members: :show-inheritance: -TempURL -======= +.. _storage_policy: -.. automodule:: swift.common.middleware.tempurl +Storage Policy +============== + +.. automodule:: swift.common.storage_policy :members: :show-inheritance: -FormPost -======== +.. _swob: -.. automodule:: swift.common.middleware.formpost +Swob +==== + +.. automodule:: swift.common.swob :members: :show-inheritance: + :special-members: __call__ -Domain Remap -============ +.. _timestamp: -.. automodule:: swift.common.middleware.domain_remap +Timestamp +========= + +.. automodule:: swift.common.utils.timestamp :members: :show-inheritance: -CNAME Lookup -============ +.. _base_utils: + +Utils Base +========== -.. automodule:: swift.common.middleware.cname_lookup +.. automodule:: swift.common.utils.base :members: :show-inheritance: -Proxy Logging -============= +.. _utils: -.. automodule:: swift.common.middleware.proxy_logging +Utils +===== + +.. 
automodule:: swift.common.utils :members: :show-inheritance: -CORS Headers -============ +.. _wsgi: -Cross Origin RequestS or CORS allows the browser to make requests against -Swift from another origin via the browser. This enables the use of HTML5 -forms and javascript uploads to swift. The owner of a container can set -three headers: - -+---------------------------------------------+-------------------------------+ -|Metadata | Use | -+=============================================+===============================+ -|X-Container-Meta-Access-Control-Allow-Origin | Origins to be allowed to | -| | make Cross Origin Requests, | -| | space separated | -+---------------------------------------------+-------------------------------+ -|X-Container-Meta-Access-Control-Max-Age | Max age for the Origin to | -| | hold the preflight results. | -+---------------------------------------------+-------------------------------+ -|X-Container-Meta-Access-Control-Allow-Headers| Headers to be allowed in | -| | actual request by browser. | -+---------------------------------------------+-------------------------------+ - -When the browser does a request it can issue a preflight request. The -preflight request is the OPTIONS call that verifies the Origin is allowed -to make the request. - -* Browser makes OPTIONS request to Swift -* Swift returns 200/401 to browser based on allowed origins -* If 200, browser makes PUT, POST, DELETE, HEAD, GET request to Swift - -CORS should be used in conjunction with TempURL and FormPost. +WSGI +==== + +.. automodule:: swift.common.wsgi + :members: + :show-inheritance: diff --git a/doc/source/object.rst b/doc/source/object.rst index 9a2643d4a7..73e9ee0eae 100644 --- a/doc/source/object.rst +++ b/doc/source/object.rst @@ -4,12 +4,22 @@ Object ****** -.. _object-server: +.. _object-auditor: -Object Server -============= +Object Auditor +============== -.. automodule:: swift.obj.server +.. automodule:: swift.obj.auditor + :members: + :undoc-members: + :show-inheritance: + +.. _object-diskfile: + +Object Backend +============== + +.. automodule:: swift.obj.diskfile :members: :undoc-members: :show-inheritance: @@ -24,23 +34,42 @@ Object Replicator :undoc-members: :show-inheritance: -.. _object-updater: +.. automodule:: swift.obj.ssync_sender + :members: + :undoc-members: + :show-inheritance: -Object Updater -============== +.. automodule:: swift.obj.ssync_receiver + :members: + :undoc-members: + :show-inheritance: -.. automodule:: swift.obj.updater +.. _object-reconstructor: + +Object Reconstructor +==================== + +.. automodule:: swift.obj.reconstructor :members: :undoc-members: :show-inheritance: -.. _object-auditor: +.. _object-server: -Object Auditor -============== +Object Server +============= -.. automodule:: swift.obj.auditor +.. automodule:: swift.obj.server :members: :undoc-members: :show-inheritance: +.. _object-updater: + +Object Updater +============== + +.. automodule:: swift.obj.updater + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/ops_runbook/diagnose.rst b/doc/source/ops_runbook/diagnose.rst new file mode 100644 index 0000000000..976cdb70de --- /dev/null +++ b/doc/source/ops_runbook/diagnose.rst @@ -0,0 +1,1204 @@ +================================== +Identifying issues and resolutions +================================== + +Is the system up? +----------------- + +If you have a report that Swift is down, perform the following basic checks: + +#. Run swift functional tests. + +#. 
From a server in your data center, use ``curl`` to check ``/healthcheck``
+   (see below).
+
+#. If you have a monitoring system, check your monitoring system.
+
+#. Check your hardware load balancer infrastructure.
+
+#. Run swift-recon on a proxy node.
+
+Functional tests usage
+-----------------------
+
+We would recommend that you set up the functional tests to run against your
+production system. Run regularly, this can be a useful tool to validate
+that the system is configured correctly. In addition, it can provide
+early warning about failures in your system (if the functional tests stop
+working, user applications will also probably stop working).
+
+A script for running the functional tests is located in ``swift/.functests``.
+
+
+External monitoring
+-------------------
+
+We use pingdom.com to monitor the external Swift API. We suggest the
+following:
+
+- Do a GET on ``/healthcheck``
+
+- Create a container, make it public (``x-container-read:
+  .r*,.rlistings``), create a small file in the container; do a GET
+  on the object
+
+Diagnose: General approach
+--------------------------
+
+- Look at service status in your monitoring system.
+
+- In addition to system monitoring tools and issue logging by users,
+  swift errors will often result in log entries (see :ref:`swift_logs`).
+
+- Look at any logs your deployment tool produces.
+
+- Log files should be reviewed for error signatures (see below) that
+  may point to a known issue, or root cause issues reported by the
+  diagnostics tools, prior to escalation.
+
+Dependencies
+^^^^^^^^^^^^
+
+The Swift software is dependent on overall system health. Operating
+system level issues with network connectivity, domain name resolution,
+user management, hardware and system configuration and capacity in terms
+of memory and free disk space, may result in secondary Swift issues.
+System level issues should be resolved prior to diagnosis of swift
+issues.
+
+
+Diagnose: Swift-dispersion-report
+---------------------------------
+
+The swift-dispersion-report is a useful tool to gauge the general
+health of the system. Configure the ``swift-dispersion`` report to cover at
+a minimum every disk drive in your system (usually 1% coverage).
+See :ref:`dispersion_report` for details of how to configure and
+use the dispersion reporting tool.
+
+The ``swift-dispersion-report`` tool can take a long time to run, especially
+if any servers are down. We suggest you run it regularly
+(e.g., in a cron job) and save the results. This makes it easy to refer
+to the last report without having to wait for a long-running command
+to complete.
+
+Diagnose: Is system responding to ``/healthcheck``?
+---------------------------------------------------
+
+When you want to establish if a swift endpoint is running, run ``curl -k``
+against ``https://$ENDPOINT/healthcheck``.
+
+.. _swift_logs:
+
+Diagnose: Interpreting messages in ``/var/log/swift/`` files
+------------------------------------------------------------
+
+.. note::
+
+   In the Hewlett Packard Enterprise Helion Public Cloud we send logs to
+   ``proxy.log`` (proxy-server logs), ``server.log`` (object-server,
+   account-server, container-server logs), ``background.log`` (all
+   other servers [object-replicator, etc]).
+
+The following table lists known issues:
+
+.. list-table::
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - **Logfile**
+     - **Signature**
+     - **Issue**
+     - **Steps to take**
+   * - /var/log/syslog
+     - kernel: [] sd .... [csbu:sd...]
Sense Key: Medium Error + - Suggests disk surface issues + - Run ``swift-drive-audit`` on the target node to check for disk errors, + repair disk errors + * - /var/log/syslog + - kernel: [] sd .... [csbu:sd...] Sense Key: Hardware Error + - Suggests storage hardware issues + - Run diagnostics on the target node to check for disk failures, + replace failed disks + * - /var/log/syslog + - kernel: [] .... I/O error, dev sd.... ,sector .... + - + - Run diagnostics on the target node to check for disk errors + * - /var/log/syslog + - pound: NULL get_thr_arg + - Multiple threads woke up + - Noise, safe to ignore + * - /var/log/swift/proxy.log + - .... ERROR .... ConnectionTimeout .... + - A storage node is not responding in a timely fashion + - Check if node is down, not running Swift, + unconfigured, storage off-line or for network issues between the + proxy and non responding node + * - /var/log/swift/proxy.log + - proxy-server .... HTTP/1.0 500 .... + - A proxy server has reported an internal server error + - Examine the logs for any errors at the time the error was reported to + attempt to understand the cause of the error. + * - /var/log/swift/server.log + - .... ERROR .... ConnectionTimeout .... + - A storage server is not responding in a timely fashion + - Check if node is down, not running Swift, + unconfigured, storage off-line or for network issues between the + server and non responding node + * - /var/log/swift/server.log + - .... ERROR .... Remote I/O error: '/srv/node/disk.... + - A storage device is not responding as expected + - Run ``swift-drive-audit`` and check the filesystem named in the error + for corruption (unmount & xfs_repair). Check if the filesystem + is mounted and working. + * - /var/log/swift/background.log + - object-server ERROR container update failed .... Connection refused + - A container server node could not be contacted + - Check if node is down, not running Swift, + unconfigured, storage off-line or for network issues between the + server and non responding node + * - /var/log/swift/background.log + - object-updater ERROR with remote .... ConnectionTimeout + - The remote container server is busy + - If the container is very large, some errors updating it can be + expected. However, this error can also occur if there is a networking + issue. + * - /var/log/swift/background.log + - account-reaper STDOUT: .... error: ECONNREFUSED + - Network connectivity issue or the target server is down. + - Resolve network issue or reboot the target server + * - /var/log/swift/background.log + - .... ERROR .... ConnectionTimeout + - A storage server is not responding in a timely fashion + - The target server may be busy. However, this error can also occur if + there is a networking issue. + * - /var/log/swift/background.log + - .... ERROR syncing .... Timeout + - A timeout occurred syncing data to another node. + - The target server may be busy. However, this error can also occur if + there is a networking issue. + * - /var/log/swift/background.log + - .... ERROR Remote drive not mounted .... + - A storage server disk is unavailable + - Repair and remount the file system (on the remote node) + * - /var/log/swift/background.log + - object-replicator .... responded as unmounted + - A storage server disk is unavailable + - Repair and remount the file system (on the remote node) + * - /var/log/swift/\*.log + - STDOUT: EXCEPTION IN + - A unexpected error occurred + - Read the Traceback details, if it matches known issues + (e.g. 
active network/disk issues), check for re-ocurrences + after the primary issues have been resolved + * - /var/log/rsyncd.log + - rsync: mkdir "/disk....failed: No such file or directory.... + - A local storage server disk is unavailable + - Run diagnostics on the node to check for a failed or + unmounted disk + * - /var/log/swift* + - Exception: Could not bind to 0.0.0.0:6xxx + - Possible Swift process restart issue. This indicates an old swift + process is still running. + - Restart Swift services. If some swift services are reported down, + check if they left residual process behind. + +Diagnose: Parted reports the backup GPT table is corrupt +-------------------------------------------------------- + +- If a GPT table is broken, a message like the following should be + observed when the following command is run: + + .. code:: console + + $ sudo parted -l + + .. code:: console + + Error: The backup GPT table is corrupt, but the primary appears OK, + so that will be used. + + OK/Cancel? + +To fix, go to :ref:`fix_broken_gpt_table` + + +Diagnose: Drives diagnostic reports a FS label is not acceptable +---------------------------------------------------------------- + +If diagnostics reports something like "FS label: obj001dsk011 is not +acceptable", it indicates that a partition has a valid disk label, but an +invalid filesystem label. In such cases proceed as follows: + +#. Verify that the disk labels are correct: + + .. code:: console + + $ FS=/dev/sd#1 + + $ sudo parted -l | grep object + +#. If partition labels are inconsistent then, resolve the disk label issues + before proceeding: + + .. code:: console + + $ sudo parted -s ${FS} name ${PART_NO} ${PART_NAME} #Partition Label + $ # PART_NO is 1 for object disks and 3 for OS disks + $ # PART_NAME follows the convention seen in "sudo parted -l | grep object" + +#. If the Filesystem label is missing then create it with care: + + .. code:: console + + $ sudo xfs_admin -l ${FS} #Filesystem label (12 Char limit) + + $ # Check for the existence of a FS label + + $ OBJNO=<3 Length Object No.> + + $ # I.E OBJNO for sw-stbaz3-object0007 would be 007 + + $ DISKNO=<3 Length Disk No.> + + $ # I.E DISKNO for /dev/sdb would be 001, /dev/sdc would be 002 etc. + + $ sudo xfs_admin -L "obj${OBJNO}dsk${DISKNO}" ${FS} + + $ # Create a FS Label + +Diagnose: Failed LUNs +--------------------- + +.. note:: + + The HPE Helion Public Cloud uses direct attach SmartArray + controllers/drives. The information here is specific to that + environment. The hpacucli utility mentioned here may be called + hpssacli in your environment. + +The ``swift_diagnostics`` mount checks may return a warning that a LUN has +failed, typically accompanied by DriveAudit check failures and device +errors. + +Such cases are typically caused by a drive failure, and if drive check +also reports a failed status for the underlying drive, then follow +the procedure to replace the disk. + +Otherwise the lun can be re-enabled as follows: + +#. Generate a hpssacli diagnostic report. This report allows the DC + team to troubleshoot potential cabling or hardware issues so it is + imperative that you run it immediately when troubleshooting a failed + LUN. You will come back later and grep this file for more details, but + just generate it for now. + + .. code:: console + + $ sudo hpssacli controller all diag file=/tmp/hpacu.diag ris=on xml=off zip=off + +Export the following variables using the below instructions before +proceeding further. + +#. 
Print a list of logical drives and their numbers and take note of the
+   failed drive's number and array value (example output: "array A
+   logicaldrive 1..." would be exported as LDRIVE=1):
+
+   .. code:: console
+
+      $ sudo hpssacli controller slot=1 ld all show
+
+#. Export the number of the logical drive that was retrieved from the
+   previous command into the LDRIVE variable:
+
+   .. code:: console
+
+      $ export LDRIVE=
+
+#. Print the array value and Port:Box:Bay for all drives and take note of
+   the Port:Box:Bay for the failed drive (example output: " array A
+   physicaldrive 2C:1:1..." would be exported as PBOX=2C:1:1). Match the
+   array value of this output with the array value obtained from the
+   previous command to be sure you are working on the same drive. Also,
+   the array value usually matches the device name (for example, /dev/sdc
+   in the case of "array c"), but we will run a different command to be sure
+   we are operating on the correct device.
+
+   .. code:: console
+
+      $ sudo hpssacli controller slot=1 pd all show
+
+.. note::
+
+   Sometimes a LUN may appear to have failed because it is not, and cannot
+   be, mounted, but the hpssacli/parted commands may show no problems with
+   the LUNs/drives. In this case, the filesystem may be corrupt and it may
+   be necessary to run ``sudo xfs_check /dev/sd[a-l][1-2]`` to see if there
+   is an xfs issue. The results of running this command may require that
+   ``xfs_repair`` is run.
+
+#. Export the Port:Box:Bay for the failed drive into the PBOX variable:
+
+   .. code:: console
+
+      $ export PBOX=
+
+#. Print the physical device information and take note of the Disk Name
+   (example output: "Disk Name: /dev/sdk" would be exported as
+   DEV=/dev/sdk):
+
+   .. code:: console
+
+      $ sudo hpssacli controller slot=1 ld ${LDRIVE} show detail | grep -i "Disk Name"
+
+#. Export the device name variable from the preceding command (example:
+   /dev/sdk):
+
+   .. code:: console
+
+      $ export DEV=
+
+#. Export the filesystem variable. Disks that are split between the
+   operating system and data storage, typically sda and sdb, should only
+   have repairs done on their data filesystem, usually /dev/sda2 and
+   /dev/sdb2. Other data only disks have just one partition on the device,
+   so the filesystem will be 1. In any case you should verify the data
+   filesystem by running ``df -h | grep /srv/node`` and using the listed
+   data filesystem for the device in question as the export. For example:
+   /dev/sdk1.
+
+   .. code:: console
+
+      $ export FS=
+
+#. Verify the LUN is failed, and the device is not:
+
+   .. code:: console
+
+      $ sudo hpssacli controller slot=1 ld all show
+      $ sudo hpssacli controller slot=1 pd all show
+      $ sudo hpssacli controller slot=1 ld ${LDRIVE} show detail
+      $ sudo hpssacli controller slot=1 pd ${PBOX} show detail
+
+#. Stop the swift and rsync services:
+
+   .. code:: console
+
+      $ sudo service rsync stop
+      $ sudo swift-init shutdown all
+
+#. Unmount the problem drive, fix the LUN and the filesystem:
+
+   .. code:: console
+
+      $ sudo umount ${FS}
+
+#. If umount fails, you should run lsof to search for the mountpoint and
+   kill any lingering processes before repeating the unmount:
+
+   .. code:: console
+
+      $ sudo hpacucli controller slot=1 ld ${LDRIVE} modify reenable
+      $ sudo xfs_repair ${FS}
+
+#. If the ``xfs_repair`` complains about possible journal data, use the
+   ``xfs_repair -L`` option to zeroise the journal log.
+
+#. Once complete, test-mount the filesystem and tidy up its lost and
+   found area.
+
+   .. 
code:: console + + $ sudo mount ${FS} /mnt + $ sudo rm -rf /mnt/lost+found/ + $ sudo umount /mnt + +#. Mount the filesystem and restart swift and rsync. + +#. Run the following to determine if a DC ticket is needed to check the + cables on the node: + + .. code:: console + + $ grep -y media.exchanged /tmp/hpacu.diag + $ grep -y hot.plug.count /tmp/hpacu.diag + +#. If the output reports any non 0x00 values, it suggests that the cables + should be checked. For example, log a DC ticket to check the sas cables + between the drive and the expander. + +.. _diagnose_slow_disk_drives: + +Diagnose: Slow disk devices +--------------------------- + +.. note:: + + collectl is an open-source performance gathering/analysis tool. + +If the diagnostics report a message such as ``sda: drive is slow``, you +should log onto the node and run the following command (remove ``-c 1`` option to continuously monitor +the data): + +.. code:: console + + $ /usr/bin/collectl -s D -c 1 + waiting for 1 second sample... + # DISK STATISTICS (/sec) + # <---------reads---------><---------writes---------><--------averages--------> Pct + #Name KBytes Merged IOs Size KBytes Merged IOs Size RWSize QLen Wait SvcTim Util + sdb 204 0 33 6 43 0 4 11 6 1 7 6 23 + sda 84 0 13 6 108 21 6 18 10 1 7 7 13 + sdc 100 0 16 6 0 0 0 0 6 1 7 6 9 + sdd 140 0 22 6 22 0 2 11 6 1 9 9 22 + sde 76 0 12 6 255 0 52 5 5 1 2 1 10 + sdf 276 0 44 6 0 0 0 0 6 1 11 8 38 + sdg 112 0 17 7 18 0 2 9 6 1 7 7 13 + sdh 3552 0 73 49 0 0 0 0 48 1 9 8 62 + sdi 72 0 12 6 0 0 0 0 6 1 8 8 10 + sdj 112 0 17 7 22 0 2 11 7 1 10 9 18 + sdk 120 0 19 6 21 0 2 11 6 1 8 8 16 + sdl 144 0 22 7 18 0 2 9 6 1 9 7 18 + dm-0 0 0 0 0 0 0 0 0 0 0 0 0 0 + dm-1 0 0 0 0 60 0 15 4 4 0 0 0 0 + dm-2 0 0 0 0 48 0 12 4 4 0 0 0 0 + dm-3 0 0 0 0 0 0 0 0 0 0 0 0 0 + dm-4 0 0 0 0 0 0 0 0 0 0 0 0 0 + dm-5 0 0 0 0 0 0 0 0 0 0 0 0 0 + + +Look at the ``Wait`` and ``SvcTime`` values. It is not normal for +these values to exceed 50msec. This is known to impact customer +performance (upload/download). For a controller problem, many/all drives +will show long wait and service times. A reboot may correct the problem; +otherwise hardware replacement is needed. + +Another way to look at the data is as follows: + +.. 
code:: console + + $ /opt/hp/syseng/disk-anal.pl -d + Disk: sda Wait: 54580 371 65 25 12 6 6 0 1 2 0 46 + Disk: sdb Wait: 54532 374 96 36 16 7 4 1 0 2 0 46 + Disk: sdc Wait: 54345 554 105 29 15 4 7 1 4 4 0 46 + Disk: sdd Wait: 54175 553 254 31 20 11 6 6 2 2 1 53 + Disk: sde Wait: 54923 66 56 15 8 7 7 0 1 0 2 29 + Disk: sdf Wait: 50952 941 565 403 426 366 442 447 338 99 38 97 + Disk: sdg Wait: 50711 689 808 562 642 675 696 185 43 14 7 82 + Disk: sdh Wait: 51018 668 688 483 575 542 692 275 55 22 9 87 + Disk: sdi Wait: 51012 1011 849 672 568 240 344 280 38 13 6 81 + Disk: sdj Wait: 50724 743 770 586 662 509 684 283 46 17 11 79 + Disk: sdk Wait: 50886 700 585 517 633 511 729 352 89 23 8 81 + Disk: sdl Wait: 50106 617 794 553 604 504 532 501 288 234 165 216 + Disk: sda Time: 55040 22 16 6 1 1 13 0 0 0 3 12 + + Disk: sdb Time: 55014 41 19 8 3 1 8 0 0 0 3 17 + Disk: sdc Time: 55032 23 14 8 9 2 6 1 0 0 0 19 + Disk: sdd Time: 55022 29 17 12 6 2 11 0 0 0 1 14 + Disk: sde Time: 55018 34 15 11 12 1 9 0 0 0 2 12 + Disk: sdf Time: 54809 250 45 7 1 0 0 0 0 0 1 1 + Disk: sdg Time: 55070 36 6 2 0 0 0 0 0 0 0 0 + Disk: sdh Time: 55079 33 2 0 0 0 0 0 0 0 0 0 + Disk: sdi Time: 55074 28 7 2 0 0 2 0 0 0 0 1 + Disk: sdj Time: 55067 35 10 0 1 0 0 0 0 0 0 1 + Disk: sdk Time: 55068 31 10 3 0 0 1 0 0 0 0 1 + Disk: sdl Time: 54905 130 61 7 3 4 1 0 0 0 0 3 + +This shows the historical distribution of the wait and service times +over a day. This is how you read it: + +- sda did 54580 operations with a short wait time, 371 operations with + a longer wait time and 65 with an even longer wait time. + +- sdl did 50106 operations with a short wait time, but as you can see + many took longer. + +There is a clear pattern that sdf to sdl have a problem. Actually, sda +to sde would more normally have lots of zeros in their data. But maybe +this is a busy system. In this example it is worth changing the +controller as the individual drives may be ok. + +After the controller is changed, use collectl -s D as described above to +see if the problem has cleared. disk-anal.pl will continue to show +historical data. You can look at recent data as follows. It only looks +at data from 13:15 to 14:15. As you can see, this is a relatively clean +system (few if any long wait or service times): + +.. code:: console + + $ /opt/hp/syseng/disk-anal.pl -d -t 13:15-14:15 + Disk: sda Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdb Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdc Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdd Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sde Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdf Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdg Wait: 3594 6 0 0 0 0 0 0 0 0 0 0 + Disk: sdh Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdi Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdj Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdk Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdl Wait: 3599 1 0 0 0 0 0 0 0 0 0 0 + Disk: sda Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdb Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdc Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdd Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sde Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdf Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdg Time: 3594 6 0 0 0 0 0 0 0 0 0 0 + Disk: sdh Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdi Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdj Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdk Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdl Time: 3599 1 0 0 0 0 0 0 0 0 0 0 + +For long wait times, where the service time appears normal is to check +the logical drive cache status. 
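+As a quick check, the SmartArray tooling referenced earlier in this
+runbook (``hpssacli``/``hpacucli``) can report the controller and
+per-logical-drive cache state. This is only a sketch; the slot number
+is an assumption carried over from the earlier examples:
+
+.. code:: console
+
+   $ # Controller-level cache and battery/capacitor status
+   $ sudo hpssacli controller slot=1 show detail | grep -i cache
+
+   $ # Per-logical-drive cache (array accelerator) status
+   $ sudo hpssacli controller slot=1 ld all show detail | grep -i "array accelerator"
+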
While the cache may be enabled, it can +be disabled on a per-drive basis. + +Diagnose: Slow network link - Measuring network performance +----------------------------------------------------------- + +Network faults can cause performance between Swift nodes to degrade. Testing +with ``netperf`` is recommended. Other methods (such as copying large +files) may also work, but can produce inconclusive results. + +Install ``netperf`` on all systems if not +already installed. Check that the UFW rules for its control port are in place. +However, there are no pre-opened ports for netperf's data connection. Pick a +port number. In this example, 12866 is used because it is one higher +than netperf's default control port number, 12865. If you get very +strange results including zero values, you may not have gotten the data +port opened in UFW at the target or may have gotten the netperf +command-line wrong. + +Pick a ``source`` and ``target`` node. The source is often a proxy node +and the target is often an object node. Using the same source proxy you +can test communication to different object nodes in different AZs to +identity possible bottlenecks. + +Running tests +^^^^^^^^^^^^^ + +#. Prepare the ``target`` node as follows: + + .. code:: console + + $ sudo iptables -I INPUT -p tcp -j ACCEPT + + Or, do: + + .. code:: console + + $ sudo ufw allow 12866/tcp + +#. On the ``source`` node, run the following command to check + throughput. Note the double-dash before the -P option. + The command takes 10 seconds to complete. The ``target`` node is 192.168.245.5. + + .. code:: console + + $ netperf -H 192.168.245.5 -- -P 12866 + MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 12866 AF_INET to + .72.4 (.72.4) port 12866 AF_INET : demo + Recv Send Send + Socket Socket Message Elapsed + Size Size Size Time Throughput + bytes bytes bytes secs. 10^6bits/sec + 87380 16384 16384 10.02 923.69 + +#. On the ``source`` node, run the following command to check latency: + + .. code:: console + + $ netperf -H 192.168.245.5 -t TCP_RR -- -P 12866 + MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 12866 + AF_INET to .72.4 (.72.4) port 12866 AF_INET : demo + : first burst 0 + Local Remote Socket Size Request Resp. Elapsed Trans. + Send Recv Size Size Time Rate + bytes Bytes bytes bytes secs. per sec + 16384 87380 1 1 10.00 11753.37 + 16384 87380 + +Expected results +^^^^^^^^^^^^^^^^ + +Faults will show up as differences between different pairs of nodes. +However, for reference, here are some expected numbers: + +- For throughput, proxy to proxy, expect ~9300 Mbit/sec (proxies have + a 10Ge link). + +- For throughout, proxy to object, expect ~920 Mbit/sec (at time of + writing this, object nodes have a 1Ge link). + +- For throughput, object to object, expect ~920 Mbit/sec. + +- For latency (all types), expect ~11000 transactions/sec. + +Diagnose: Remapping sectors experiencing UREs +--------------------------------------------- + +#. Find the bad sector, device, and filesystem in ``kern.log``. + +#. Set the environment variables SEC, DEV & FS, for example: + + .. code:: console + + $ SEC=2930954256 + $ DEV=/dev/sdi + $ FS=/dev/sdi1 + +#. Verify that the sector is bad: + + .. code:: console + + $ sudo dd if=${DEV} of=/dev/null bs=512 count=1 skip=${SEC} + +#. If the sector is bad this command will output an input/output error: + + .. code:: console + + dd: reading `/dev/sdi`: Input/output error + 0+0 records in + 0+0 records out + +#. 
Prevent chef from attempting to re-mount the filesystem while the + repair is in progress: + + .. code:: console + + $ sudo mv /etc/chef/client.pem /etc/chef/xx-client.xx-pem + +#. Stop the swift and rsync service: + + .. code:: console + + $ sudo service rsync stop + $ sudo swift-init shutdown all + +#. Unmount the problem drive: + + .. code:: console + + $ sudo umount ${FS} + +#. Overwrite/remap the bad sector: + + .. code:: console + + $ sudo dd_rescue -d -A -m8b -s ${SEC}b ${DEV} ${DEV} + +#. This command should report an input/output error the first time + it is run. Run the command a second time, if it successfully remapped + the bad sector it should not report an input/output error. + +#. Verify the sector is now readable: + + .. code:: console + + $ sudo dd if=${DEV} of=/dev/null bs=512 count=1 skip=${SEC} + +#. If the sector is now readable this command should not report an + input/output error. + +#. If more than one problem sector is listed, set the SEC environment + variable to the next sector in the list: + + .. code:: console + + $ SEC=123456789 + +#. Repeat from step 8. + +#. Repair the filesystem: + + .. code:: console + + $ sudo xfs_repair ${FS} + +#. If ``xfs_repair`` reports that the filesystem has valuable filesystem + changes: + + .. code:: console + + $ sudo xfs_repair ${FS} + Phase 1 - find and verify superblock... + Phase 2 - using internal log + - zero log... + ERROR: The filesystem has valuable metadata changes in a log which + needs to be replayed. + Mount the filesystem to replay the log, and unmount it before + re-running xfs_repair. + If you are unable to mount the filesystem, then use the -L option to + destroy the log and attempt a repair. Note that destroying the log may + cause corruption -- please attempt a mount of the filesystem before + doing this. + +#. You should attempt to mount the filesystem, and clear the lost+found + area: + + .. code:: console + + $ sudo mount $FS /mnt + $ sudo rm -rf /mnt/lost+found/* + $ sudo umount /mnt + +#. If the filesystem fails to mount then you will need to use the + ``xfs_repair -L`` option to force log zeroing. + Repeat step 11. + +#. If ``xfs_repair`` reports that an additional input/output error has been + encountered, get the sector details as follows: + + .. code:: console + + $ sudo grep "I/O error" /var/log/kern.log | grep sector | tail -1 + +#. If new input/output error is reported then set the SEC environment + variable to the problem sector number: + + .. code:: console + + $ SEC=234567890 + +#. Repeat from step 8 + + +#. Remount the filesystem and restart swift and rsync. + + - If all UREs in the kern.log have been fixed and you are still unable + to have xfs_repair disk, it is possible that the URE's have + corrupted the filesystem or possibly destroyed the drive altogether. + In this case, the first step is to re-format the filesystem and if + this fails, get the disk replaced. + + +Diagnose: High system latency +----------------------------- + +.. note:: + + The latency measurements described here are specific to the HPE + Helion Public Cloud. + +- A bad NIC on a proxy server. However, as explained above, this + usually causes the peak to rise, but average should remain near + normal parameters. A quick fix is to shutdown the proxy. + +- A stuck memcache server. Accepts connections, but then will not respond. + Expect to see timeout messages in ``/var/log/proxy.log`` (port 11211). + Swift Diags will also report this as a failed node/port. A quick fix + is to shutdown the proxy server. 
+
+- A bad/broken object server can also cause problems if the accounts
+  used by the monitor program happen to live on the bad object server.
+
+- A general network problem within the data center. Compare the results
+  with the Pingdom monitors to see if they also have a problem.
+
+Diagnose: Interface reports errors
+----------------------------------
+
+Should a network interface on a Swift node begin reporting network
+errors, it may well indicate a cable, switch, or network issue.
+
+Get an overview of the interface with:
+
+.. code:: console
+
+   $ sudo ifconfig eth{n}
+   $ sudo ethtool eth{n}
+
+The ``Link Detected:`` indicator will read ``yes`` if the NIC is
+cabled.
+
+Establish the adapter type with:
+
+.. code:: console
+
+   $ sudo ethtool -i eth{n}
+
+Gather the interface statistics with:
+
+.. code:: console
+
+   $ sudo ethtool -S eth{n}
+
+If the NIC supports self test, this can be performed with:
+
+.. code:: console
+
+   $ sudo ethtool -t eth{n}
+
+Self tests should read ``PASS`` if the NIC is operating correctly.
+
+NIC module drivers can be re-initialised by carefully removing and
+re-installing the modules (this avoids rebooting the server).
+For example, Mellanox drivers use a two-part driver, mlx4_en and
+mlx4_core. To reload these you must carefully remove the mlx4_en
+(ethernet) module and then the mlx4_core module, and reinstall them
+in the reverse order.
+
+As the interface will be disabled while the modules are unloaded, you
+must be very careful not to lock yourself out, so it may be better
+to script this.
+
+Diagnose: Hung swift object replicator
+--------------------------------------
+
+If a replicator reports in its log that the remaining time exceeds
+100 hours, the swift ``object-replicator`` may be stuck and not
+making progress. Another useful way to check this is with the
+``swift-recon -r`` command on a swift proxy server:
+
+.. code:: console
+
+   $ sudo swift-recon -r
+   ===============================================================================
+
+   --> Starting reconnaissance on 384 hosts
+   ===============================================================================
+   [2013-07-17 12:56:19] Checking on replication
+   [replication_time] low: 2, high: 80, avg: 28.8, total: 11037, Failed: 0.0%, no_result: 0, reported: 383
+   Oldest completion was 2013-06-12 22:46:50 (12 days ago) by 192.168.245.3:6200.
+   Most recent completion was 2013-07-17 12:56:19 (5 seconds ago) by 192.168.245.5:6200.
+   ===============================================================================
+
+The ``Oldest completion`` line in this example indicates that the
+object-replicator on swift object server 192.168.245.3 has not completed
+the replication cycle in 12 days. This replicator is stuck. The object
+replicator cycle is generally less than 1 hour, though a replicator
+cycle of 15-20 hours can occur if nodes are added to the system and a
+new ring has been deployed.
+
+You can further check if the object replicator is stuck by logging on
+to the object server and checking the object replicator progress with
+the following command:
+
+.. 
code:: console + + $ sudo grep object-rep /var/log/swift/background.log | grep -e "Starting object replication" -e "Object replication complete" -e "partitions rep" + Jul 16 06:25:46 192.168.245.4 object-replicator 15344/16450 (93.28%) partitions replicated in 69018.48s (0.22/sec, 22h remaining) + Jul 16 06:30:46 192.168.245.4object-replicator 15344/16450 (93.28%) partitions replicated in 69318.58s (0.22/sec, 22h remaining) + Jul 16 06:35:46 192.168.245.4 object-replicator 15344/16450 (93.28%) partitions replicated in 69618.63s (0.22/sec, 23h remaining) + Jul 16 06:40:46 192.168.245.4 object-replicator 15344/16450 (93.28%) partitions replicated in 69918.73s (0.22/sec, 23h remaining) + Jul 16 06:45:46 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 70218.75s (0.22/sec, 24h remaining) + Jul 16 06:50:47 192.168.245.4object-replicator 15348/16450 (93.30%) partitions replicated in 70518.85s (0.22/sec, 24h remaining) + Jul 16 06:55:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 70818.95s (0.22/sec, 25h remaining) + Jul 16 07:00:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 71119.05s (0.22/sec, 25h remaining) + Jul 16 07:05:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 71419.15s (0.21/sec, 26h remaining) + Jul 16 07:10:47 192.168.245.4object-replicator 15348/16450 (93.30%) partitions replicated in 71719.25s (0.21/sec, 26h remaining) + Jul 16 07:15:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 72019.27s (0.21/sec, 27h remaining) + Jul 16 07:20:47 192.168.245.4object-replicator 15348/16450 (93.30%) partitions replicated in 72319.37s (0.21/sec, 27h remaining) + Jul 16 07:25:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 72619.47s (0.21/sec, 28h remaining) + Jul 16 07:30:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 72919.56s (0.21/sec, 28h remaining) + Jul 16 07:35:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 73219.67s (0.21/sec, 29h remaining) + Jul 16 07:40:47 192.168.245.4 object-replicator 15348/16450 (93.30%) partitions replicated in 73519.76s (0.21/sec, 29h remaining) + +The above status is output every 5 minutes to ``/var/log/swift/background.log``. + +.. note:: + + The 'remaining' time is increasing as time goes on, normally the + time remaining should be decreasing. Also note the partition number. For example, + 15344 remains the same for several status lines. Eventually the object + replicator detects the hang and attempts to make progress by killing the + problem thread. The replicator then progresses to the next partition but + quite often it again gets stuck on the same partition. + +One of the reasons for the object replicator hanging like this is +filesystem corruption on the drive. The following is a typical log entry +of a corrupted filesystem detected by the object replicator: + +.. 
code:: console
+
+   $ sudo bzgrep "Remote I/O error" /var/log/swift/background.log* | grep srv | tail -1
+   Jul 12 03:33:30 192.168.245.4 object-replicator STDOUT: ERROR:root:Error hashing suffix#012Traceback (most recent call last):#012 File
+   "/usr/lib/python2.7/dist-packages/swift/obj/replicator.py", line 199, in get_hashes#012 hashes[suffix] = hash_suffix(suffix_dir,
+   reclaim_age)#012 File "/usr/lib/python2.7/dist-packages/swift/obj/replicator.py", line 84, in hash_suffix#012 path_contents =
+   sorted(os.listdir(path))#012OSError: [Errno 121] Remote I/O error: '/srv/node/disk4/objects/1643763/b51'
+
+An ``ls`` of the problem file or directory usually shows something like the following:
+
+.. code:: console
+
+   $ ls -l /srv/node/disk4/objects/1643763/b51
+   ls: cannot access /srv/node/disk4/objects/1643763/b51: Remote I/O error
+
+If no entry with ``Remote I/O error`` occurs in the ``background.log`` it is
+not possible to determine why the object-replicator is hung. It may be
+that the ``Remote I/O error`` entry is older than 7 days and so has been
+rotated out of the logs. In this scenario it may be best to simply
+restart the object-replicator.
+
+#. Stop the object-replicator:
+
+   .. code:: console
+
+      # sudo swift-init object-replicator stop
+
+#. Make sure the object replicator has stopped; if it has hung, the stop
+   command will not stop the hung process:
+
+   .. code:: console
+
+      # ps auxww | grep swift-object-replicator
+
+#. If the previous ps shows the object-replicator is still running, kill
+   the process:
+
+   .. code:: console
+
+      # kill -9 
+
+#. Start the object-replicator:
+
+   .. code:: console
+
+      # sudo swift-init object-replicator start
+
+If the above grep did find a ``Remote I/O error`` then it may be possible
+to repair the problem filesystem.
+
+#. Stop swift and rsync:
+
+   .. code:: console
+
+      # sudo swift-init all shutdown
+      # sudo service rsync stop
+
+#. Make sure all swift processes have stopped:
+
+   .. code:: console
+
+      # ps auxww | grep swift | grep python
+
+#. Kill any swift processes still running.
+
+#. Unmount the problem filesystem:
+
+   .. code:: console
+
+      # sudo umount /srv/node/disk4
+
+#. Repair the filesystem:
+
+   .. code:: console
+
+      # sudo xfs_repair -P /dev/sde1
+
+#. If the ``xfs_repair`` fails then it may be necessary to re-format the
+   filesystem. See :ref:`fix_broken_xfs_filesystem`. If the
+   ``xfs_repair`` is successful, re-enable chef and replication should
+   commence again.
+
+
+Diagnose: High CPU load
+-----------------------
+
+The CPU load average on an object server, as shown with the
+``uptime`` command, is typically under 10 when the server is
+lightly to moderately loaded:
+
+.. code:: console
+
+   $ uptime
+   07:59:26 up 99 days, 5:57, 1 user, load average: 8.59, 8.39, 8.32
+
+During times of increased activity, due to user transactions or object
+replication, the CPU load average can increase to around 30.
+
+However, sometimes the CPU load average can increase significantly. The
+following is an example of an object server that has extremely high CPU
+load:
+
+.. code:: console
+
+   $ uptime
+   07:44:02 up 18:22, 1 user, load average: 407.12, 406.36, 404.59
+
+Further issues and resolutions
+------------------------------
+
+.. note::
+
+   The urgency levels in each **Action** column indicate whether or
+   not it is required to take immediate action, or if the problem can be worked
+   on during business hours.
+
+.. 
list-table:: + :widths: 33 33 33 + :header-rows: 1 + + * - **Scenario** + - **Description** + - **Action** + * - ``/healthcheck`` latency is high. + - The ``/healthcheck`` test does not tax the proxy very much so any drop in value is probably related to + network issues, rather than the proxies being very busy. A very slow proxy might impact the average + number, but it would need to be very slow to shift the number that much. + - Check networks. Do a ``curl https://:/healthcheck`` where + ``ip-address`` is individual proxy IP address. + Repeat this for every proxy server to see if you can pin point the problem. + + Urgency: If there are other indications that your system is slow, you should treat + this as an urgent problem. + * - Swift process is not running. + - You can use ``swift-init`` status to check if swift processes are running on any + given server. + - Run this command: + + .. code:: console + + $ sudo swift-init all start + + Examine messages in the swift log files to see if there are any + error messages related to any of the swift processes since the time you + ran the ``swift-init`` command. + + Take any corrective actions that seem necessary. + + Urgency: If this only affects one server, and you have more than one, + identifying and fixing the problem can wait until business hours. + If this same problem affects many servers, then you need to take corrective + action immediately. + * - ntpd is not running. + - NTP is not running. + - Configure and start NTP. + + Urgency: For proxy servers, this is vital. + + * - Host clock is not syncd to an NTP server. + - Node time settings does not match NTP server time. + This may take some time to sync after a reboot. + - Assuming NTP is configured and running, you have to wait until the times sync. + * - A swift process has hundreds, to thousands of open file descriptors. + - May happen to any of the swift processes. + Known to have happened with a ``rsyslod`` restart and where ``/tmp`` was hanging. + + - Restart the swift processes on the affected node: + + .. code:: console + + $ sudo swift-init all reload + + Urgency: + If known performance problem: Immediate + + If system seems fine: Medium + * - A swift process is not owned by the swift user. + - If the UID of the swift user has changed, then the processes might not be + owned by that UID. + - Urgency: If this only affects one server, and you have more than one, + identifying and fixing the problem can wait until business hours. + If this same problem affects many servers, then you need to take corrective + action immediately. + * - Object account or container files not owned by swift. + - This typically happens if during a reinstall or a re-image of a server that the UID + of the swift user was changed. The data files in the object account and container + directories are owned by the original swift UID. As a result, the current swift + user does not own these files. + - Correct the UID of the swift user to reflect that of the original UID. An alternate + action is to change the ownership of every file on all file systems. This alternate + action is often impractical and will take considerable time. + + Urgency: If this only affects one server, and you have more than one, + identifying and fixing the problem can wait until business hours. + If this same problem affects many servers, then you need to take corrective + action immediately. + * - A disk drive has a high IO wait or service time. + - If high wait IO times are seen for a single disk, then the disk drive is the problem. 
+ If most/all devices are slow, the controller is probably the source of the problem. + The controller cache may also be miss configured – which will cause similar long + wait or service times. + - As a first step, if your controllers have a cache, check that it is enabled and their battery/capacitor + is working. + + Second, reboot the server. + If problem persists, file a DC ticket to have the drive or controller replaced. + See :ref:`diagnose_slow_disk_drives` on how to check the drive wait or service times. + + Urgency: Medium + * - The network interface is not up. + - Use the ``ifconfig`` and ``ethtool`` commands to determine the network state. + - You can try restarting the interface. However, generally the interface + (or cable) is probably broken, especially if the interface is flapping. + + Urgency: If this only affects one server, and you have more than one, + identifying and fixing the problem can wait until business hours. + If this same problem affects many servers, then you need to take corrective + action immediately. + * - Network interface card (NIC) is not operating at the expected speed. + - The NIC is running at a slower speed than its nominal rated speed. + For example, it is running at 100 Mb/s and the NIC is a 1Ge NIC. + - 1. Try resetting the interface with: + + .. code:: console + + $ sudo ethtool -s eth0 speed 1000 + + ... and then run: + + .. code:: console + + $ sudo lshw -class + + See if size goes to the expected speed. Failing + that, check hardware (NIC cable/switch port). + + 2. If persistent, consider shutting down the server (especially if a proxy) + until the problem is identified and resolved. If you leave this server + running it can have a large impact on overall performance. + + Urgency: High + * - The interface RX/TX error count is non-zero. + - A value of 0 is typical, but counts of 1 or 2 do not indicate a problem. + - 1. For low numbers (For example, 1 or 2), you can simply ignore. Numbers in the range + 3-30 probably indicate that the error count has crept up slowly over a long time. + Consider rebooting the server to remove the report from the noise. + + Typically, when a cable or interface is bad, the error count goes to 400+. For example, + it stands out. There may be other symptoms such as the interface going up and down or + not running at correct speed. A server with a high error count should be watched. + + 2. If the error count continues to climb, consider taking the server down until + it can be properly investigated. In any case, a reboot should be done to clear + the error count. + + Urgency: High, if the error count increasing. + + * - In a swift log you see a message that a process has not replicated in over 24 hours. + - The replicator has not successfully completed a run in the last 24 hours. + This indicates that the replicator has probably hung. + - Use ``swift-init`` to stop and then restart the replicator process. + + Urgency: Low. However if you + recently added or replaced disk drives then you should treat this urgently. + * - Container Updater has not run in 4 hour(s). + - The service may appear to be running however, it may be hung. Examine their swift + logs to see if there are any error messages relating to the container updater. This + may potentially explain why the container is not running. + - Urgency: Medium + This may have been triggered by a recent restart of the rsyslog daemon. + Restart the service with: + + .. 
code:: console + + $ sudo swift-init reload + + * - Object replicator: Reports the remaining time and that time is more than 100 hours. + - Each replication cycle the object replicator writes a log message to its log + reporting statistics about the current cycle. This includes an estimate for the + remaining time needed to replicate all objects. If this time is longer than + 100 hours, there is a problem with the replication process. + - Urgency: Medium + Restart the service with: + + .. code:: console + + $ sudo swift-init object-replicator reload + + Check that the remaining replication time is going down. + diff --git a/doc/source/ops_runbook/index.rst b/doc/source/ops_runbook/index.rst new file mode 100644 index 0000000000..8bf3f425ac --- /dev/null +++ b/doc/source/ops_runbook/index.rst @@ -0,0 +1,27 @@ +================= +Swift Ops Runbook +================= + +This document contains operational procedures that Hewlett Packard Enterprise (HPE) uses to operate +and monitor the Swift system within the HPE Helion Public Cloud. This +document is an excerpt of a larger product-specific handbook. As such, +the material may appear incomplete. The suggestions and recommendations +made in this document are for our particular environment, and may not be +suitable for your environment or situation. We make no representations +concerning the accuracy, adequacy, completeness or suitability of the +information, suggestions or recommendations. This document are provided +for reference only. We are not responsible for your use of any +information, suggestions or recommendations contained herein. + + +.. toctree:: + :maxdepth: 2 + + diagnose.rst + procedures.rst + maintenance.rst + troubleshooting.rst + + + + diff --git a/doc/source/ops_runbook/maintenance.rst b/doc/source/ops_runbook/maintenance.rst new file mode 100644 index 0000000000..c63feb7bd5 --- /dev/null +++ b/doc/source/ops_runbook/maintenance.rst @@ -0,0 +1,330 @@ +================== +Server maintenance +================== + +General assumptions +~~~~~~~~~~~~~~~~~~~ + +- It is assumed that anyone attempting to replace hardware components + will have already read and understood the appropriate maintenance and + service guides. + +- It is assumed that where servers need to be taken off-line for + hardware replacement, that this will be done in series, bringing the + server back on-line before taking the next off-line. + +- It is assumed that the operations directed procedure will be used for + identifying hardware for replacement. + +Assessing the health of swift +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can run the swift-recon tool on a Swift proxy node to get a quick +check of how Swift is doing. Please note that the numbers below are +necessarily somewhat subjective. Sometimes parameters for which we +say 'low values are good' will have pretty high values for a time. Often +if you wait a while things get better. + +For example: + +.. code:: console + + $ sudo swift-recon -rla + =============================================================================== + [2012-03-10 12:57:21] Checking async pendings on 384 hosts... + Async stats: low: 0, high: 1, avg: 0, total: 1 + =============================================================================== + + [2012-03-10 12:57:22] Checking replication times on 384 hosts... 
+ [Replication Times] shortest: 1.4113877813, longest: 36.8293570836, avg: 4.86278064749 + =============================================================================== + + [2012-03-10 12:57:22] Checking load avg's on 384 hosts... + [5m load average] lowest: 2.22, highest: 9.5, avg: 4.59578125 + [15m load average] lowest: 2.36, highest: 9.45, avg: 4.62622395833 + [1m load average] lowest: 1.84, highest: 9.57, avg: 4.5696875 + =============================================================================== + +In the example above we ask for information on replication times (-r), +load averages (-l) and async pendings (-a). This is a healthy Swift +system. Rules-of-thumb for 'good' recon output are: + +- Nodes that respond are up and running Swift. If all nodes respond, + that is a good sign. But some nodes may time out. For example: + + .. code:: console + + -> [http://.29:6200/recon/load:] + -> [http://.31:6200/recon/load:] + +- That could be okay or could require investigation. + +- Low values (say < 10 for high and average) for async pendings are + good. Higher values occur when disks are down and/or when the system + is heavily loaded. Many simultaneous PUTs to the same container can + drive async pendings up. This may be normal, and may resolve itself + after a while. If it persists, one way to track down the problem is + to find a node with high async pendings (with ``swift-recon -av | sort + -n -k4``), then check its Swift logs, Often async pendings are high + because a node cannot write to a container on another node. Often + this is because the node or disk is offline or bad. This may be okay + if we know about it. + +- Low values for replication times are good. These values rise when new + rings are pushed, and when nodes and devices are brought back on + line. + +- Our 'high' load average values are typically in the 9-15 range. If + they are a lot bigger it is worth having a look at the systems + pushing the average up. Run ``swift-recon -av`` to get the individual + averages. To sort the entries with the highest at the end, + run ``swift-recon -av | sort -n -k4``. + +For comparison here is the recon output for the same system above when +two entire racks of Swift are down: + +.. code:: console + + [2012-03-10 16:56:33] Checking async pendings on 384 hosts... + -> http://.22:6200/recon/async: + -> http://.18:6200/recon/async: + -> http://.16:6200/recon/async: + -> http://.13:6200/recon/async: + -> http://.30:6200/recon/async: + -> http://.6:6200/recon/async: + ......... + -> http://.5:6200/recon/async: + -> http://.15:6200/recon/async: + -> http://.9:6200/recon/async: + -> http://.27:6200/recon/async: + -> http://.4:6200/recon/async: + -> http://.8:6200/recon/async: + Async stats: low: 243, high: 659, avg: 413, total: 132275 + =============================================================================== + [2012-03-10 16:57:48] Checking replication times on 384 hosts... + -> http://.22:6200/recon/replication: + -> http://.18:6200/recon/replication: + -> http://.16:6200/recon/replication: + -> http://.13:6200/recon/replication: + -> http://.30:6200/recon/replication: + -> http://.6:6200/recon/replication: + ............ 
+ -> http://.5:6200/recon/replication: + -> http://.15:6200/recon/replication: + -> http://.9:6200/recon/replication: + -> http://.27:6200/recon/replication: + -> http://.4:6200/recon/replication: + -> http://.8:6200/recon/replication: + [Replication Times] shortest: 1.38144306739, longest: 112.620954418, avg: 10.285 + 9475361 + =============================================================================== + [2012-03-10 16:59:03] Checking load avg's on 384 hosts... + -> http://.22:6200/recon/load: + -> http://.18:6200/recon/load: + -> http://.16:6200/recon/load: + -> http://.13:6200/recon/load: + -> http://.30:6200/recon/load: + -> http://.6:6200/recon/load: + ............ + -> http://.15:6200/recon/load: + -> http://.9:6200/recon/load: + -> http://.27:6200/recon/load: + -> http://.4:6200/recon/load: + -> http://.8:6200/recon/load: + [5m load average] lowest: 1.71, highest: 4.91, avg: 2.486375 + [15m load average] lowest: 1.79, highest: 5.04, avg: 2.506125 + [1m load average] lowest: 1.46, highest: 4.55, avg: 2.4929375 + =============================================================================== + +.. note:: + + The replication times and load averages are within reasonable + parameters, even with 80 object stores down. Async pendings, however is + quite high. This is due to the fact that the containers on the servers + which are down cannot be updated. When those servers come back up, async + pendings should drop. If async pendings were at this level without an + explanation, we have a problem. + +Recon examples +~~~~~~~~~~~~~~ + +Here is an example of noting and tracking down a problem with recon. + +Running reccon shows some async pendings: + +.. code:: console + + $ ssh -q .132.7 sudo swift-recon -alr + =============================================================================== + [2012-03-14 17:25:55] Checking async pendings on 384 hosts... + Async stats: low: 0, high: 23, avg: 8, total: 3356 + =============================================================================== + [2012-03-14 17:25:55] Checking replication times on 384 hosts... + [Replication Times] shortest: 1.49303831657, longest: 39.6982825994, avg: 4.2418222066 + =============================================================================== + [2012-03-14 17:25:56] Checking load avg's on 384 hosts... + [5m load average] lowest: 2.35, highest: 8.88, avg: 4.45911458333 + [15m load average] lowest: 2.41, highest: 9.11, avg: 4.504765625 + [1m load average] lowest: 1.95, highest: 8.56, avg: 4.40588541667 + =============================================================================== + +Why? Running recon again with -av swift (not shown here) tells us that +the node with the highest (23) is .72.61. Looking at the log +files on .72.61 we see: + +.. 
code:: console + + $ sudo tail -f /var/log/swift/background.log | - grep -i ERROR + Mar 14 17:28:06 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6201} + Mar 14 17:28:06 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6201} + Mar 14 17:28:09 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:11 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:13 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6201} + Mar 14 17:28:13 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6201} + Mar 14 17:28:15 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:15 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:19 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:19 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:20 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6201} + Mar 14 17:28:21 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:21 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + Mar 14 17:28:22 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6201} + +That is why this node has a lot of async pendings: a bunch of disks that +are not mounted on and . There may be other issues, +but clearing this up will likely drop the async pendings a fair bit, as +other nodes will be having the same problem. + +Assessing the availability risk when multiple storage servers are down +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + This procedure will tell you if you have a problem, however, in practice + you will find that you will not use this procedure frequently. + +If three storage nodes (or, more precisely, three disks on three +different storage nodes) are down, there is a small but nonzero +probability that user objects, containers, or accounts will not be +available. + +Procedure +--------- + +.. note:: + + swift has three rings: one each for objects, containers and accounts. + This procedure should be run three times, each time specifying the + appropriate ``*.builder`` file. + +#. 
Determine whether all three nodes are in different Swift zones by + running the ring builder on a proxy node to determine which zones + the storage nodes are in. For example: + + .. code:: console + + % sudo swift-ring-builder /etc/swift/object.builder + /etc/swift/object.builder, build version 1467 + 2097152 partitions, 3 replicas, 5 zones, 1320 devices, 0.02 balance + The minimum number of hours before a partition can be reassigned is 24 + Devices: id zone ip address port name weight partitions balance meta + 0 1 .4 6200 disk0 1708.00 4259 -0.00 + 1 1 .4 6200 disk1 1708.00 4260 0.02 + 2 1 .4 6200 disk2 1952.00 4868 0.01 + 3 1 .4 6200 disk3 1952.00 4868 0.01 + 4 1 .4 6200 disk4 1952.00 4867 -0.01 + +#. Here, node .4 is in zone 1. If two or more of the three + nodes under consideration are in the same Swift zone, they do not + have any ring partitions in common; there is little/no data + availability risk if all three nodes are down. + +#. If the nodes are in three distinct Swift zones it is necessary to + whether the nodes have ring partitions in common. Run ``swift-ring`` + builder again, this time with the ``list_parts`` option and specify + the nodes under consideration. For example: + + .. code:: console + + % sudo swift-ring-builder /etc/swift/object.builder list_parts .8 .15 .72.2 + Partition Matches + 91 2 + 729 2 + 3754 2 + 3769 2 + 3947 2 + 5818 2 + 7918 2 + 8733 2 + 9509 2 + 10233 2 + +#. The ``list_parts`` option to the ring builder indicates how many ring + partitions the nodes have in common. If, as in this case, the + first entry in the list has a 'Matches' column of 2 or less, there + is no data availability risk if all three nodes are down. + +#. If the 'Matches' column has entries equal to 3, there is some data + availability risk if all three nodes are down. The risk is generally + small, and is proportional to the number of entries that have a 3 in + the Matches column. For example: + + .. code:: console + + Partition Matches + 26865 3 + 362367 3 + 745940 3 + 778715 3 + 797559 3 + 820295 3 + 822118 3 + 839603 3 + 852332 3 + 855965 3 + 858016 3 + +#. A quick way to count the number of rows with 3 matches is: + + .. code:: console + + % sudo swift-ring-builder /etc/swift/object.builder list_parts .8 .15 .72.2 | grep "3$" | wc -l + + 30 + +#. In this case the nodes have 30 out of a total of 2097152 partitions + in common; about 0.001%. In this case the risk is small/nonzero. + Recall that a partition is simply a portion of the ring mapping + space, not actual data. So having partitions in common is a necessary + but not sufficient condition for data unavailability. + + .. note:: + + We should not bring down a node for repair if it shows + Matches entries of 3 with other nodes that are also down. + + If three nodes that have 3 partitions in common are all down, there is + a nonzero probability that data are unavailable and we should work to + bring some or all of the nodes up ASAP. + +Swift startup/shutdown +~~~~~~~~~~~~~~~~~~~~~~ + +- Use reload - not stop/start/restart. + +- Try to roll sets of servers (especially proxy) in groups of less + than 20% of your servers. diff --git a/doc/source/ops_runbook/procedures.rst b/doc/source/ops_runbook/procedures.rst new file mode 100644 index 0000000000..1d84d59698 --- /dev/null +++ b/doc/source/ops_runbook/procedures.rst @@ -0,0 +1,412 @@ +================================= +Software configuration procedures +================================= + +.. 
_fix_broken_gpt_table:
+
+Fix broken GPT table (broken disk partition)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- If a GPT table is broken, a message like the following should be
+  observed when the command...
+
+  .. code:: console
+
+     $ sudo parted -l
+
+- ... is run.
+
+  .. code:: console
+
+     ...
+     Error: The backup GPT table is corrupt, but the primary appears OK, so that will
+     be used.
+     OK/Cancel?
+
+#. To fix this, first install the ``gdisk`` program:
+
+   .. code:: console
+
+      $ sudo aptitude install gdisk
+
+#. Run ``gdisk`` for the particular drive with the damaged partition:
+
+   .. code:: console
+
+      $ sudo gdisk /dev/sd*a-l*
+      GPT fdisk (gdisk) version 0.6.14
+
+      Caution: invalid backup GPT header, but valid main header; regenerating
+      backup header from main header.
+
+      Warning! One or more CRCs don't match. You should repair the disk!
+
+      Partition table scan:
+        MBR: protective
+        BSD: not present
+        APM: not present
+        GPT: damaged
+      /dev/sd
+      *****************************************************************************
+      Caution: Found protective or hybrid MBR and corrupt GPT. Using GPT, but disk
+      verification and recovery are STRONGLY recommended.
+      *****************************************************************************
+
+#. On the command prompt, type ``r`` (recovery and transformation
+   options), followed by ``d`` (use main GPT header), ``v`` (verify disk)
+   and finally ``w`` (write table to disk and exit). You will also need to
+   enter ``Y`` when prompted in order to confirm actions.
+
+   .. code:: console
+
+      Command (? for help): r
+
+      Recovery/transformation command (? for help): d
+
+      Recovery/transformation command (? for help): v
+
+      Caution: The CRC for the backup partition table is invalid. This table may
+      be corrupt. This program will automatically create a new backup partition
+      table when you save your partitions.
+
+      Caution: Partition 1 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Caution: Partition 2 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Caution: Partition 3 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Identified 1 problems!
+
+      Recovery/transformation command (? for help): w
+
+      Final checks complete. About to write GPT data. THIS WILL OVERWRITE EXISTING
+      PARTITIONS!!
+
+      Do you want to proceed, possibly destroying your data? (Y/N): Y
+
+      OK; writing new GUID partition table (GPT).
+      The operation has completed successfully.
+
+#. Run the following command; it should now show that the partition is
+   recovered and healthy again:
+
+   .. code:: console
+
+      $ sudo parted /dev/sd#
+
+#. Finally, uninstall ``gdisk`` from the node:
+
+   .. code:: console
+
+      $ sudo aptitude remove gdisk
+
+.. _fix_broken_xfs_filesystem:
+
+Procedure: Fix broken XFS filesystem
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. A filesystem may be corrupt or broken if the following output is
+   observed when checking its label:
+
+   .. code:: console
+
+      $ sudo xfs_admin -l /dev/sd#
+        cache_node_purge: refcount was 1, not zero (node=0x25d5ee0)
+        xfs_admin: cannot read root inode (117)
+        cache_node_purge: refcount was 1, not zero (node=0x25d92b0)
+        xfs_admin: cannot read realtime bitmap inode (117)
+        bad sb magic # 0 in AG 1
+        failed to read label in AG 1
+
+#. 
Run the following commands to remove the broken/corrupt filesystem and replace. + (This example uses the filesystem ``/dev/sdb2``) Firstly need to replace the partition: + + .. code:: console + + $ sudo parted + GNU Parted 2.3 + Using /dev/sda + Welcome to GNU Parted! Type 'help' to view a list of commands. + (parted) select /dev/sdb + Using /dev/sdb + (parted) p + Model: HP LOGICAL VOLUME (scsi) + Disk /dev/sdb: 2000GB + Sector size (logical/physical): 512B/512B + Partition Table: gpt + + Number Start End Size File system Name Flags + 1 17.4kB 1024MB 1024MB ext3 boot + 2 1024MB 1751GB 1750GB xfs sw-aw2az1-object045-disk1 + 3 1751GB 2000GB 249GB lvm + + (parted) rm 2 + (parted) mkpart primary 2 -1 + Warning: You requested a partition from 2000kB to 2000GB. + The closest location we can manage is 1024MB to 1751GB. + Is this still acceptable to you? + Yes/No? Yes + Warning: The resulting partition is not properly aligned for best performance. + Ignore/Cancel? Ignore + (parted) p + Model: HP LOGICAL VOLUME (scsi) + Disk /dev/sdb: 2000GB + Sector size (logical/physical): 512B/512B + Partition Table: gpt + + Number Start End Size File system Name Flags + 1 17.4kB 1024MB 1024MB ext3 boot + 2 1024MB 1751GB 1750GB xfs primary + 3 1751GB 2000GB 249GB lvm + + (parted) quit + +#. Next step is to scrub the filesystem and format: + + .. code:: console + + $ sudo dd if=/dev/zero of=/dev/sdb2 bs=$((1024*1024)) count=1 + 1+0 records in + 1+0 records out + 1048576 bytes (1.0 MB) copied, 0.00480617 s, 218 MB/s + $ sudo /sbin/mkfs.xfs -f -i size=1024 /dev/sdb2 + meta-data=/dev/sdb2 isize=1024 agcount=4, agsize=106811524 blks + = sectsz=512 attr=2, projid32bit=0 + data = bsize=4096 blocks=427246093, imaxpct=5 + = sunit=0 swidth=0 blks + naming =version 2 bsize=4096 ascii-ci=0 + log =internal log bsize=4096 blocks=208616, version=2 + = sectsz=512 sunit=0 blks, lazy-count=1 + realtime =none extsz=4096 blocks=0, rtextents=0 + +#. You should now label and mount your filesystem. + +#. Can now check to see if the filesystem is mounted using the command: + + .. code:: console + + $ mount + +.. _checking_if_account_ok: + +Procedure: Checking if an account is okay +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + ``swift-direct`` is only available in the HPE Helion Public Cloud. + Use ``swiftly`` as an alternate (or use ``swift-get-nodes`` as explained + here). + +You must know the tenant/project ID. You can check if the account is okay as follows from a proxy. + +.. code:: console + + $ sudo -u swift /opt/hp/swift/bin/swift-direct show AUTH_ + +The response will either be similar to a swift list of the account +containers, or an error indicating that the resource could not be found. + +Alternatively, you can use ``swift-get-nodes`` to find the account database +files. Run the following on a proxy: + +.. code:: console + + $ sudo swift-get-nodes /etc/swift/account.ring.gz AUTH_ + +The response will print curl/ssh commands that will list the replicated +account databases. Use the indicated ``curl`` or ``ssh`` commands to check +the status and existence of the account. + +Procedure: Getting swift account stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + ``swift-direct`` is specific to the HPE Helion Public Cloud. Go look at + ``swifty`` for an alternate or use ``swift-get-nodes`` as explained + in :ref:`checking_if_account_ok`. + +This procedure describes how you determine the swift usage for a given +swift account, that is the number of containers, number of objects and +total bytes used. 
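+If ``swift-direct`` is not available in your environment, roughly the
+same totals can be read with the standard ``swift`` client from
+python-swiftclient. This is only a sketch and assumes the account's
+credentials are already exported in the environment; the ``Containers``,
+``Objects`` and ``Bytes`` fields of the output correspond to the values
+discussed below:
+
+.. code:: console
+
+   $ swift stat
+
+The rest of this procedure uses ``swift-direct`` from a proxy node.
+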
To do this you will need the project ID. + +Log onto one of the swift proxy servers. + +Use swift-direct to show this accounts usage: + +.. code:: console + + $ sudo -u swift /opt/hp/swift/bin/swift-direct show AUTH_ + Status: 200 + Content-Length: 0 + Accept-Ranges: bytes + X-Timestamp: 1379698586.88364 + X-Account-Bytes-Used: 67440225625994 + X-Account-Container-Count: 1 + Content-Type: text/plain; charset=utf-8 + X-Account-Object-Count: 8436776 + Status: 200 + name: my_container count: 8436776 bytes: 67440225625994 + +This account has 1 container. That container has 8436776 objects. The +total bytes used is 67440225625994. + +Procedure: Revive a deleted account +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Swift accounts are normally not recreated. If a tenant/project is deleted, +the account can then be deleted. If the user wishes to use Swift again, +the normal process is to create a new tenant/project -- and hence a +new Swift account. + +However, if the Swift account is deleted, but the tenant/project is not +deleted from Keystone, the user can no longer access the account. This +is because the account is marked deleted in Swift. You can revive +the account as described in this process. + +.. note:: + + The containers and objects in the "old" account cannot be listed + anymore. In addition, if the Account Reaper process has not + finished reaping the containers and objects in the "old" account, these + are effectively orphaned and it is virtually impossible to find and delete + them to free up disk space. + +The solution is to delete the account database files and +re-create the account as follows: + +#. You must know the tenant/project ID. The account name is AUTH_. + In this example, the tenant/project is ``4ebe3039674d4864a11fe0864ae4d905`` + so the Swift account name is ``AUTH_4ebe3039674d4864a11fe0864ae4d905``. + +#. Use ``swift-get-nodes`` to locate the account's database files (on three + servers). The output has been truncated so we can focus on the import pieces + of data: + + .. code:: console + + $ sudo swift-get-nodes /etc/swift/account.ring.gz AUTH_4ebe3039674d4864a11fe0864ae4d905 + ... + curl -I -XHEAD "http://192.168.245.5:6202/disk1/3934/AUTH_4ebe3039674d4864a11fe0864ae4d905" + curl -I -XHEAD "http://192.168.245.3:6202/disk0/3934/AUTH_4ebe3039674d4864a11fe0864ae4d905" + curl -I -XHEAD "http://192.168.245.4:6202/disk1/3934/AUTH_4ebe3039674d4864a11fe0864ae4d905" + ... + Use your own device location of servers: + such as "export DEVICE=/srv/node" + ssh 192.168.245.5 "ls -lah ${DEVICE:-/srv/node*}/disk1/accounts/3934/052/f5ecf8b40de3e1b0adb0dbe576874052" + ssh 192.168.245.3 "ls -lah ${DEVICE:-/srv/node*}/disk0/accounts/3934/052/f5ecf8b40de3e1b0adb0dbe576874052" + ssh 192.168.245.4 "ls -lah ${DEVICE:-/srv/node*}/disk1/accounts/3934/052/f5ecf8b40de3e1b0adb0dbe576874052" + ... + note: `/srv/node*` is used as default value of `devices`, the real value is set in the config file on each storage node. + + +#. Before proceeding check that the account is really deleted by using curl. Execute the + commands printed by ``swift-get-nodes``. For example: + + .. code:: console + + $ curl -I -XHEAD "http://192.168.245.5:6202/disk1/3934/AUTH_4ebe3039674d4864a11fe0864ae4d905" + HTTP/1.1 404 Not Found + Content-Length: 0 + Content-Type: text/html; charset=utf-8 + + Repeat for the other two servers (192.168.245.3 and 192.168.245.4). + A ``404 Not Found`` indicates that the account is deleted (or never existed). + + If you get a ``204 No Content`` response, do **not** proceed. + +#. 
Use the ssh commands printed by ``swift-get-nodes`` to check if database + files exist. For example: + + .. code:: console + + $ ssh 192.168.245.5 "ls -lah ${DEVICE:-/srv/node*}/disk1/accounts/3934/052/f5ecf8b40de3e1b0adb0dbe576874052" + total 20K + drwxr-xr-x 2 swift swift 110 Mar 9 10:22 . + drwxr-xr-x 3 swift swift 45 Mar 9 10:18 .. + -rw------- 1 swift swift 17K Mar 9 10:22 f5ecf8b40de3e1b0adb0dbe576874052.db + -rw-r--r-- 1 swift swift 0 Mar 9 10:22 f5ecf8b40de3e1b0adb0dbe576874052.db.pending + -rwxr-xr-x 1 swift swift 0 Mar 9 10:18 .lock + + Repeat for the other two servers (192.168.245.3 and 192.168.245.4). + + If no files exist, no further action is needed. + +#. Stop Swift processes on all nodes listed by ``swift-get-nodes`` + (In this example, that is 192.168.245.3, 192.168.245.4 and 192.168.245.5). + +#. We recommend you make backup copies of the database files. + +#. Delete the database files. For example: + + .. code:: console + + $ ssh 192.168.245.5 + $ cd /srv/node/disk1/accounts/3934/052/f5ecf8b40de3e1b0adb0dbe576874052 + $ sudo rm * + + Repeat for the other two servers (192.168.245.3 and 192.168.245.4). + +#. Restart Swift on all three servers + +At this stage, the account is fully deleted. If you enable the auto-create option, the +next time the user attempts to access the account, the account will be created. +You may also use swiftly to recreate the account. + + +Procedure: Temporarily stop load balancers from directing traffic to a proxy server +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can stop the load balancers sending requests to a proxy server as +follows. This can be useful when a proxy is misbehaving but you need +Swift running to help diagnose the problem. By removing from the load +balancers, customer's are not impacted by the misbehaving proxy. + +#. Ensure that in /etc/swift/proxy-server.conf the ``disable_path`` variable is set to + ``/etc/swift/disabled-by-file``. + +#. Log onto the proxy node. + +#. Shut down Swift as follows: + + .. code:: console + + $ sudo swift-init proxy shutdown + + .. note:: + + Shutdown, not stop. + +#. Create the ``/etc/swift/disabled-by-file`` file. For example: + + .. code:: console + + $ sudo touch /etc/swift/disabled-by-file + +#. Optional, restart Swift: + + .. code:: console + + $ sudo swift-init proxy start + +It works because the healthcheck middleware looks for /etc/swift/disabled-by-file. +If it exists, the middleware will return 503/error instead of 200/OK. This means the load balancer +should stop sending traffic to the proxy. + +Procedure: Ad-Hoc disk performance test +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can get an idea whether a disk drive is performing as follows: + +.. code:: console + + $ sudo dd bs=1M count=256 if=/dev/zero conv=fdatasync of=/srv/node/disk11/remember-to-delete-this-later + +You can expect ~600MB/sec. If you get a low number, repeat many times as +Swift itself may also read or write to the disk, hence giving a lower +number. 
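+To spot-check every data disk on a node in one pass, a small loop over
+the ``/srv/node`` mount points can be used. This is only a sketch; the
+``/srv/node/disk*`` layout and the 256MB test size are assumptions
+carried over from the example above, and the test file is removed after
+each run:
+
+.. code:: console
+
+   $ for d in /srv/node/disk*; do
+   >     echo "== $d =="
+   >     sudo dd bs=1M count=256 if=/dev/zero conv=fdatasync of=$d/remember-to-delete-this-later
+   >     sudo rm -f $d/remember-to-delete-this-later
+   > done
+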
diff --git a/doc/source/ops_runbook/troubleshooting.rst b/doc/source/ops_runbook/troubleshooting.rst new file mode 100644 index 0000000000..75511010cd --- /dev/null +++ b/doc/source/ops_runbook/troubleshooting.rst @@ -0,0 +1,259 @@ +==================== +Troubleshooting tips +==================== + +Diagnose: Customer complains they receive a HTTP status 500 when trying to browse containers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This entry is prompted by a real customer issue and exclusively focused on how +that problem was identified. +There are many reasons why a http status of 500 could be returned. If +there are no obvious problems with the swift object store, then it may +be necessary to take a closer look at the users transactions. +After finding the users swift account, you can +search the swift proxy logs on each swift proxy server for +transactions from this user. The linux ``bzgrep`` command can be used to +search all the proxy log files on a node including the ``.bz2`` compressed +files. For example: + +.. code:: console + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l -R ssh \ + -w .68.[4-11,132-139 4-11,132-139],.132.[4-11,132-139] \ + 'sudo bzgrep -w AUTH_redacted-4962-4692-98fb-52ddda82a5af /var/log/swift/proxy.log*' | dshbak -c + . + . + ---------------- + .132.6 + ---------------- + Feb 29 08:51:57 sw-aw2az2-proxy011 proxy-server .16.132 + .66.8 29/Feb/2012/08/51/57 GET /v1.0/AUTH_redacted-4962-4692-98fb-52ddda82a5af + /%3Fformat%3Djson HTTP/1.0 404 - - _4f4d50c5e4b064d88bd7ab82 - - - + tx429fc3be354f434ab7f9c6c4206c1dc3 - 0.0130 + +This shows a ``GET`` operation on the users account. + +.. note:: + + The HTTP status returned is 404, Not found, rather than 500 as reported by the user. + +Using the transaction ID, ``tx429fc3be354f434ab7f9c6c4206c1dc3`` you can +search the swift object servers log files for this transaction ID: + +.. code:: console + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l -R ssh \ + -w .72.[4-67|4-67],.[4-67|4-67],.[4-67|4-67],.204.[4-131] \ + 'sudo bzgrep tx429fc3be354f434ab7f9c6c4206c1dc3 /var/log/swift/server.log*' | dshbak -c + . + . + ---------------- + .72.16 + ---------------- + Feb 29 08:51:57 sw-aw2az1-object013 account-server .132.6 - - + + [29/Feb/2012:08:51:57 +0000|] "GET /disk9/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" + + 0.0016 "" + ---------------- + .31 + ---------------- + Feb 29 08:51:57 node-az2-object060 account-server .132.6 - - + [29/Feb/2012:08:51:57 +0000|] "GET /disk6/198875/AUTH_redacted-4962- + 4692-98fb-52ddda82a5af" 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" 0.0011 "" + ---------------- + .204.70 + ---------------- + + Feb 29 08:51:57 sw-aw2az3-object0067 account-server .132.6 - - + [29/Feb/2012:08:51:57 +0000|] "GET /disk6/198875/AUTH_redacted-4962- + 4692-98fb-52ddda82a5af" 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" 0.0014 "" + +.. note:: + + The 3 GET operations to 3 different object servers that hold the 3 + replicas of this users account. Each ``GET`` returns a HTTP status of 404, + Not found. + +Next, use the ``swift-get-nodes`` command to determine exactly where the +user's account data is stored: + +.. 
code:: console + + $ sudo swift-get-nodes /etc/swift/account.ring.gz AUTH_redacted-4962-4692-98fb-52ddda82a5af + Account AUTH_redacted-4962-4692-98fb-52ddda82a5af + Container None + Object None + + Partition 198875 + Hash 1846d99185f8a0edaf65cfbf37439696 + + Server:Port Device .31:6202 disk6 + Server:Port Device .204.70:6202 disk6 + Server:Port Device .72.16:6202 disk9 + Server:Port Device .204.64:6202 disk11 [Handoff] + Server:Port Device .26:6202 disk11 [Handoff] + Server:Port Device .72.27:6202 disk11 [Handoff] + + curl -I -XHEAD "`http://.31:6202/disk6/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.204.70:6202/disk6/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.72.16:6202/disk9/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.204.64:6202/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + curl -I -XHEAD "`http://.26:6202/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + curl -I -XHEAD "`http://.72.27:6202/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + + ssh .31 "ls -lah /srv/node/disk6/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .204.70 "ls -lah /srv/node/disk6/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .72.16 "ls -lah /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .204.64 "ls -lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + ssh .26 "ls -lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + ssh .72.27 "ls -lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + +Check each of the primary servers, .31, .204.70 and .72.16, for +this users account. For example on .72.16: + +.. code:: console + + $ ls -lah /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/ + total 1.0M + drwxrwxrwx 2 swift swift 98 2012-02-23 14:49 . + drwxrwxrwx 3 swift swift 45 2012-02-03 23:28 .. + -rw------- 1 swift swift 15K 2012-02-23 14:49 1846d99185f8a0edaf65cfbf37439696.db + -rw-rw-rw- 1 swift swift 0 2012-02-23 14:49 1846d99185f8a0edaf65cfbf37439696.db.pending + +So this users account db, an sqlite db is present. Use sqlite to +checkout the account: + +.. code:: console + + $ sudo cp /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/1846d99185f8a0edaf65cfbf37439696.db /tmp + $ sudo sqlite3 /tmp/1846d99185f8a0edaf65cfbf37439696.db + sqlite> .mode line + sqlite> select * from account_stat; + account = AUTH_redacted-4962-4692-98fb-52ddda82a5af + created_at = 1328311738.42190 + put_timestamp = 1330000873.61411 + delete_timestamp = 1330001026.00514 + container_count = 0 + object_count = 0 + bytes_used = 0 + hash = eb7e5d0ea3544d9def940b19114e8b43 + id = 2de8c8a8-cef9-4a94-a421-2f845802fe90 + status = DELETED + status_changed_at = 1330001026.00514 + metadata = + +.. note: + + The status is ``DELETED``. So this account was deleted. This explains + why the GET operations are returning 404, not found. Check the account + delete date/time: + + .. code:: console + + $ python + + >>> import time + >>> time.ctime(1330001026.00514) + 'Thu Feb 23 12:43:46 2012' + +Next try and find the ``DELETE`` operation for this account in the proxy +server logs: + +.. 
code:: console + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l -R ssh \ + -w .68.[4-11,132-139 4-11,132-139],.132.[4-11,132-139|4-11,132-139] \ + 'sudo bzgrep AUTH_redacted-4962-4692-98fb-52ddda82a5af /var/log/swift/proxy.log* \ + | grep -w DELETE | awk "{print $3,$10,$12}"' |- dshbak -c + . + . + Feb 23 12:43:46 sw-aw2az2-proxy001 proxy-server .66.7 23/Feb/2012/12/43/46 DELETE /v1.0/AUTH_redacted-4962-4692-98fb- + 52ddda82a5af/ HTTP/1.0 204 - Apache-HttpClient/4.1.2%20%28java%201.5%29 _4f458ee4e4b02a869c3aad02 - - - + tx4471188b0b87406899973d297c55ab53 - 0.0086 + +From this you can see the operation that resulted in the account being deleted. + +Procedure: Deleting objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simple case - deleting small number of objects and containers +------------------------------------------------------------- + +.. note:: + + ``swift-direct`` is specific to the Hewlett Packard Enterprise Helion Public Cloud. + Use ``swiftly`` as an alternative. + +.. note:: + + Object and container names are in UTF8. Swift direct accepts UTF8 + directly, not URL-encoded UTF8 (the REST API expects UTF8 and then + URL-encoded). In practice cut and paste of foreign language strings to + a terminal window will produce the right result. + + Hint: Use the ``head`` command before any destructive commands. + +To delete a small number of objects, log into any proxy node and proceed +as follows: + +Examine the object in question: + +.. code:: console + + $ sudo -u swift /opt/hp/swift/bin/swift-direct head 132345678912345 container_name obj_name + +See if ``X-Object-Manifest`` or ``X-Static-Large-Object`` is set, +then this is the manifest object and segment objects may be in another +container. + +If the ``X-Object-Manifest`` attribute is set, you need to find the +name of the objects this means it is a DLO. For example, +if ``X-Object-Manifest`` is ``container2/seg-blah``, list the contents +of the container container2 as follows: + +.. code:: console + + $ sudo -u swift /opt/hp/swift/bin/swift-direct show 132345678912345 container2 + +Pick out the objects whose names start with ``seg-blah``. +Delete the segment objects as follows: + +.. code:: console + + $ sudo -u swift /opt/hp/swift/bin/swift-direct delete 132345678912345 container2 seg-blah01 + $ sudo -u swift /opt/hp/swift/bin/swift-direct delete 132345678912345 container2 seg-blah02 + etc + +If ``X-Static-Large-Object`` is set, you need to read the contents. Do this by: + +- Using swift-get-nodes to get the details of the object's location. +- Change the ``-X HEAD`` to ``-X GET`` and run ``curl`` against one copy. +- This lists a JSON body listing containers and object names +- Delete the objects as described above for DLO segments + +Once the segments are deleted, you can delete the object using +``swift-direct`` as described above. + +Finally, use ``swift-direct`` to delete the container. + +Procedure: Decommissioning swift nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Should Swift nodes need to be decommissioned (e.g.,, where they are being +re-purposed), it is very important to follow the following steps. + +#. In the case of object servers, follow the procedure for removing + the node from the rings. +#. In the case of swift proxy servers, have the network team remove + the node from the load balancers. +#. Open a network ticket to have the node removed from network + firewalls. +#. Make sure that you remove the ``/etc/swift`` directory and everything in it. 
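+
+Example: Listing the segments of a static large object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This supplements the static large object steps in
+``Procedure: Deleting objects`` above. The manifest retrieved with the
+``-X GET`` form of the ``curl`` command is a JSON list of segment entries.
+The sketch below assumes that response body has been saved to a file named
+``manifest.json`` (the filename is arbitrary); it prints the container and
+object name of each segment so they can then be deleted with
+``swift-direct`` as described earlier:
+
+.. code:: python
+
+   import json
+
+   # Saved body of the "curl ... -X GET" request against one replica of the
+   # SLO manifest object.
+   with open('manifest.json') as f:
+       segments = json.load(f)
+
+   # Each entry's "name" field holds "/<container>/<object>".
+   for seg in segments:
+       container, obj = seg['name'].lstrip('/').split('/', 1)
+       print(container, obj)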
diff --git a/doc/source/overview_acl.rst b/doc/source/overview_acl.rst new file mode 100644 index 0000000000..d4c1ca09e2 --- /dev/null +++ b/doc/source/overview_acl.rst @@ -0,0 +1,410 @@ + +=========================== +Access Control Lists (ACLs) +=========================== + +Normally to create, read and modify containers and objects, you must have the +appropriate roles on the project associated with the account, i.e., you +must be the owner of the account. However, an owner can grant access to +other users by using an Access Control List (ACL). + +There are two types of ACLs: + +- :ref:`container_acls`. These are specified on a container and + apply to that container only and the objects in the container. +- :ref:`account_acls`. These are specified at the account level and + apply to all containers and objects in the account. + +.. _container_acls: + +-------------- +Container ACLs +-------------- + +Container ACLs are stored in the ``X-Container-Write`` and ``X-Container-Read`` +metadata. The scope of the ACL is limited to the container where the +metadata is set and the objects in the container. In addition: + +- ``X-Container-Write`` grants the ability to perform PUT, POST and DELETE + operations on objects within a container. It does not grant the ability + to perform POST or DELETE operations on the container itself. Some ACL + elements also grant the ability to perform HEAD or GET operations on the + container. + +- ``X-Container-Read`` grants the ability to perform GET and HEAD + operations on objects within a container. Some of the ACL elements also grant + the ability to perform HEAD or GET operations on the container itself. + However, a container ACL does not allow access to privileged metadata (such + as ``X-Container-Sync-Key``). + +Container ACLs use the "V1" ACL syntax which is a comma separated string +of elements as shown in the following example:: + + .r:*,.rlistings,7ec59e87c6584c348b563254aae4c221:* + +Spaces may occur between elements as shown in the following example:: + + + .r : *, .rlistings, 7ec59e87c6584c348b563254aae4c221:* + +However, these spaces are removed from the value stored in the +``X-Container-Write`` and ``X-Container-Read`` metadata. In addition, +the ``.r:`` string can be written as ``.referrer:``, but is stored as ``.r:``. + +While all auth systems use +the same syntax, the meaning of some elements +is different because of the different concepts used by different +auth systems as explained in the following sections: + +- :ref:`acl_common_elements` +- :ref:`acl_keystone_elements` +- :ref:`acl_tempauth_elements` + + +.. _acl_common_elements: + +Common ACL Elements +------------------- + +The following table describes elements of an ACL that are +supported by both Keystone auth and TempAuth. These elements +should only be used with ``X-Container-Read`` (with the exception +of ``.rlistings``, an error will occur if used with +``X-Container-Write``): + +============================== ================================================ +Element Description +============================== ================================================ +.r:* Any user has access to objects. No token is + required in the request. +.r: The referrer is granted access to objects. The + referrer is identified by the ``Referer`` + request header in the request. No token is + required. +.r:- This syntax (with "-" prepended to the + referrer) is supported. However, it does not + deny access if another element (e.g., ``.r:*``) + grants access. 
+.rlistings Any user can perform a HEAD or GET operation + on the container provided the user also has + read access on objects (e.g., also has ``.r:*`` + or ``.r:``. No token is required. +============================== ================================================ + +.. _acl_keystone_elements: + +Keystone Auth ACL Elements +-------------------------- + +The following table describes elements of an ACL that are +supported only by Keystone auth. Keystone auth also supports +the elements described in :ref:`acl_common_elements`. + +A token must be included in the request for any of these ACL elements +to take effect. + +============================== ================================================ +Element Description +============================== ================================================ +: The specified user, provided a token + scoped to the project is included + in the request, is granted access. + Access to the container is also granted + when used in ``X-Container-Read``. +:\* Any user with a role in the specified Keystone + project has access. A token scoped to the + project must be included in the request. + Access to the container is also granted + when used in ``X-Container-Read``. +\*: The specified user has access. A token + for the user (scoped to any + project) must be included in the request. + Access to the container is also granted + when used in ``X-Container-Read``. +\*:\* Any user has access. + Access to the container is also granted + when used in ``X-Container-Read``. + The ``*:*`` element differs from the ``.r:*`` + element because + ``*:*`` requires that a valid token is + included in the request whereas ``.r:*`` + does not require a token. In addition, + ``.r:*`` does not grant access to the + container listing. + A user with the specified role *name* on the + project within which the container is stored is + granted access. A user token scoped to the + project must be included in the request. Access + to the container is also granted when used in + ``X-Container-Read``. +============================== ================================================ + +.. note:: + + Keystone project (tenant) or user *names* (i.e., + ``:``) must no longer be + used because with the introduction + of domains in Keystone, names are not globally unique. You should + use user and project *ids* instead. + + For backwards compatibility, ACLs using names will be granted by + keystoneauth when it can be established that + the grantee project, the grantee user and the project being + accessed are either not yet in a domain (e.g. the ``X-Auth-Token`` has + been obtained via the Keystone V2 API) or are all in the default domain + to which legacy accounts would have been migrated. + + +.. _acl_tempauth_elements: + +TempAuth ACL Elements +--------------------- + +The following table describes elements of an ACL that are +supported only by TempAuth. TempAuth auth also supports +the elements described in :ref:`acl_common_elements`. + +============================== ================================================ +Element Description +============================== ================================================ + The named user is granted access. The + wildcard ("*") character is not supported. + A token from the user must be included in the + request. 
+============================== ================================================ + +---------------------- +Container ACL Examples +---------------------- + +Container ACLs may be set by including ``X-Container-Write`` and/or +``X-Container-Read`` headers with a PUT or a POST request to the container URL. +The following examples use the ``swift`` command line client which support +these headers being set via its ``--write-acl`` and ``--read-acl`` options. + +Example: Public Container +------------------------- + +The following allows anybody to list objects in the ``www`` container and +download objects. The users do not need to include a token in +their request. This ACL is commonly referred to as making the +container "public". It is useful when used with :ref:`staticweb`:: + + swift post www --read-acl ".r:*,.rlistings" + + +Example: Shared Writable Container +---------------------------------- + +The following allows anybody to upload or download objects. However, to +download an object, the exact name of the object must be known since +users cannot list the objects in the container. +The users must include a Keystone token in the upload request. However, it does not +need to be scoped to the project associated with the container:: + + swift post www --read-acl ".r:*" --write-acl "*:*" + + +Example: Sharing a Container with Project Members +------------------------------------------------- + +The following allows any member of the ``77b8f82565f14814bece56e50c4c240f`` +project to upload and download objects or to list the contents +of the ``www`` container. A token scoped to the ``77b8f82565f14814bece56e50c4c240f`` +project must be included in the request:: + + swift post www --read-acl "77b8f82565f14814bece56e50c4c240f:*" \ + --write-acl "77b8f82565f14814bece56e50c4c240f:*" + + +Example: Sharing a Container with Users having a specified Role +--------------------------------------------------------------- + +The following allows any user that has been assigned the +``my_read_access_role`` on the project within which the ``www`` container is +stored to download objects or to list the contents of the ``www`` container. A +user token scoped to the project must be included in the download or list +request:: + + swift post www --read-acl "my_read_access_role" + + +Example: Allowing a Referrer Domain to Download Objects +------------------------------------------------------- + +The following allows any request from +the ``example.com`` domain to access an object in the container:: + + swift post www --read-acl ".r:.example.com" + +However, the request from the user **must** contain the appropriate +`Referer` header as shown in this example request:: + + curl -i $publicURL/www/document --head -H "Referer: http://www.example.com/index.html" + +.. note:: + + The `Referer` header is included in requests by many browsers. However, + since it is easy to create a request with any desired value in the + `Referer` header, the referrer ACL has very weak security. + + +Example: Sharing a Container with Another User +---------------------------------------------- + +Sharing a Container with another user requires the knowledge of few +parameters regarding the users. 
+ +The sharing user must know: + +- the ``OpenStack user id`` of the other user + +The sharing user must communicate to the other user: + +- the name of the shared container +- the ``OS_STORAGE_URL`` + +Usually the ``OS_STORAGE_URL`` is not exposed directly to the user +because the ``swift client`` by default automatically construct the +``OS_STORAGE_URL`` based on the User credential. + +We assume that in the current directory there are the two client +environment script for the two users ``sharing.openrc`` and +``other.openrc``. + +The ``sharing.openrc`` should be similar to the following: + +.. code-block:: bash + + export OS_USERNAME=sharing + # WARNING: Save the password in clear text only for testing purposes + export OS_PASSWORD=password + export OS_TENANT_NAME=projectName + export OS_AUTH_URL=https://identityHost:portNumber/v2.0 + # The following lines can be omitted + export OS_TENANT_ID=tenantIDString + export OS_REGION_NAME=regionName + export OS_CACERT=/path/to/cacertFile + +The ``other.openrc`` should be similar to the following: + +.. code-block:: bash + + export OS_USERNAME=other + # WARNING: Save the password in clear text only for testing purposes + export OS_PASSWORD=otherPassword + export OS_TENANT_NAME=otherProjectName + export OS_AUTH_URL=https://identityHost:portNumber/v2.0 + # The following lines can be omitted + export OS_TENANT_ID=tenantIDString + export OS_REGION_NAME=regionName + export OS_CACERT=/path/to/cacertFile + +For more information see `using the OpenStack RC file +`_ + +First we figure out the other user id:: + + . other.openrc + OUID="$(openstack user show --format json "${OS_USERNAME}" | jq -r .id)" + +or alternatively:: + + . other.openrc + OUID="$(openstack token issue -f json | jq -r .user_id)" + +Then we figure out the storage url of the sharing user:: + + sharing.openrc + SURL="$(swift auth | awk -F = '/OS_STORAGE_URL/ {print $2}')" + +Running as the sharing user create a shared container named ``shared`` +in read-only mode with the other user using the proper acl:: + + sharing.openrc + swift post --read-acl "*:${OUID}" shared + +Running as the sharing user create and upload a test file:: + + touch void + swift upload shared void + +Running as the other user list the files in the ``shared`` container:: + + other.openrc + swift --os-storage-url="${SURL}" list shared + +Running as the other user download the ``shared`` container in the +``/tmp`` directory:: + + cd /tmp + swift --os-storage-url="${SURL}" download shared + + +.. _account_acls: + +------------ +Account ACLs +------------ + +.. note:: + + Account ACLs are not currently supported by Keystone auth + +The ``X-Account-Access-Control`` header is used to specify +account-level ACLs in a format specific to the auth system. +These headers are visible and settable only by account owners (those for whom +``swift_owner`` is true). +Behavior of account ACLs is auth-system-dependent. In the case of TempAuth, +if an authenticated user has membership in a group which is listed in the +ACL, then the user is allowed the access level of that ACL. + +Account ACLs use the "V2" ACL syntax, which is a JSON dictionary with keys +named "admin", "read-write", and "read-only". (Note the case sensitivity.) +An example value for the ``X-Account-Access-Control`` header looks like this, +where ``a``, ``b`` and ``c`` are user names:: + + {"admin":["a","b"],"read-only":["c"]} + +Keys may be absent (as shown in above example). 
+ +The recommended way to generate ACL strings is as follows:: + + from swift.common.middleware.acl import format_acl + acl_data = { 'admin': ['alice'], 'read-write': ['bob', 'carol'] } + acl_string = format_acl(version=2, acl_dict=acl_data) + +Using the :func:`format_acl` method will ensure +that JSON is encoded as ASCII (using e.g. '\u1234' for Unicode). While +it's permissible to manually send ``curl`` commands containing +``X-Account-Access-Control`` headers, you should exercise caution when +doing so, due to the potential for human error. + +Within the JSON dictionary stored in ``X-Account-Access-Control``, the keys +have the following meanings: + +============ ============================================================== +Access Level Description +============ ============================================================== +read-only These identities can read *everything* (except privileged + headers) in the account. Specifically, a user with read-only + account access can get a list of containers in the account, + list the contents of any container, retrieve any object, and + see the (non-privileged) headers of the account, any + container, or any object. +read-write These identities can read or write (or create) any container. + A user with read-write account access can create new + containers, set any unprivileged container headers, overwrite + objects, delete containers, etc. A read-write user can NOT + set account headers (or perform any PUT/POST/DELETE requests + on the account). +admin These identities have "swift_owner" privileges. A user with + admin account access can do anything the account owner can, + including setting account headers and any privileged headers + -- and thus granting read-only, read-write, or admin access + to other users. +============ ============================================================== + + +For more details, see :mod:`swift.common.middleware.tempauth`. For details +on the ACL format, see :mod:`swift.common.middleware.acl`. diff --git a/doc/source/overview_architecture.rst b/doc/source/overview_architecture.rst index 7cd56a8c7c..b0ae293d9a 100644 --- a/doc/source/overview_architecture.rst +++ b/doc/source/overview_architecture.rst @@ -2,8 +2,6 @@ Swift Architectural Overview ============================ -.. TODO - add links to more detailed overview in each section below. - ------------ Proxy Server ------------ @@ -11,7 +9,10 @@ Proxy Server The Proxy Server is responsible for tying together the rest of the Swift architecture. For each request, it will look up the location of the account, container, or object in the ring (see below) and route the request accordingly. -The public API is also exposed through the Proxy Server. +For Erasure Code type policies, the Proxy Server is also responsible for +encoding and decoding object data. See :doc:`overview_erasure_code` for +complete information on Erasure Code support. The public API is also exposed +through the Proxy Server. A large number of failures are also handled in the Proxy Server. For example, if a server is unavailable for an object PUT, it will ask the @@ -27,9 +28,9 @@ The Ring A ring represents a mapping between the names of entities stored on disk and their physical location. There are separate rings for accounts, containers, and -objects. When other components need to perform any operation on an object, -container, or account, they need to interact with the appropriate ring to -determine its location in the cluster. +one object ring per storage policy. 
When other components need to perform any +operation on an object, container, or account, they need to interact with the +appropriate ring to determine its location in the cluster. The Ring maintains this mapping using zones, devices, partitions, and replicas. Each partition in the ring is replicated, by default, 3 times across the @@ -37,22 +38,71 @@ cluster, and the locations for a partition are stored in the mapping maintained by the ring. The ring is also responsible for determining which devices are used for handoff in failure scenarios. -Data can be isolated with the concept of zones in the ring. Each replica -of a partition is guaranteed to reside in a different zone. A zone could -represent a drive, a server, a cabinet, a switch, or even a datacenter. +The replicas of each partition will be isolated onto as many distinct regions, +zones, servers and devices as the capacity of these failure domains allow. If +there are less failure domains at a given tier than replicas of the partition +assigned within a tier (e.g. a 3 replica cluster with 2 servers), or the +available capacity across the failure domains within a tier are not well +balanced it will not be possible to achieve both even capacity distribution +(`balance`) as well as complete isolation of replicas across failure domains +(`dispersion`). When this occurs the ring management tools will display a +warning so that the operator can evaluate the cluster topology. + +Data is evenly distributed across the capacity available in the cluster as +described by the devices weight. Weights can be used to balance the +distribution of partitions on drives across the cluster. This can be useful, +for example, when different sized drives are used in a cluster. Device +weights can also be used when adding or removing capacity or failure domains +to control how many partitions are reassigned during a rebalance to be moved +as soon as replication bandwidth allows. + +.. note:: + Prior to Swift 2.1.0 it was not possible to restrict partition movement by + device weight when adding new failure domains, and would allow extremely + unbalanced rings. The greedy dispersion algorithm is now subject to the + constraints of the physical capacity in the system, but can be adjusted + with-in reason via the overload option. Artificially unbalancing the + partition assignment without respect to capacity can introduce unexpected + full devices when a given failure domain does not physically support its + share of the used capacity in the tier. + +When partitions need to be moved around (for example if a device is added to +the cluster), the ring ensures that a minimum number of partitions are moved +at a time, and only one replica of a partition is moved at a time. + +The ring is used by the Proxy server and several background processes +(like replication). See :doc:`overview_ring` for complete information on the +ring. + +---------------- +Storage Policies +---------------- -The partitions of the ring are equally divided among all the devices in the -Swift installation. When partitions need to be moved around (for example if a -device is added to the cluster), the ring ensures that a minimum number of -partitions are moved at a time, and only one replica of a partition is moved at -a time. +Storage Policies provide a way for object storage providers to differentiate +service levels, features and behaviors of a Swift deployment. Each Storage +Policy configured in Swift is exposed to the client via an abstract name. 
+Each device in the system is assigned to one or more Storage Policies. This +is accomplished through the use of multiple object rings, where each Storage +Policy has an independent object ring, which may include a subset of hardware +implementing a particular differentiation. -Weights can be used to balance the distribution of partitions on drives -across the cluster. This can be useful, for example, when different sized -drives are used in a cluster. +For example, one might have the default policy with 3x replication, and create +a second policy which, when applied to new containers only uses 2x replication. +Another might add SSDs to a set of storage nodes and create a performance tier +storage policy for certain containers to have their objects stored there. Yet +another might be the use of Erasure Coding to define a cold-storage tier. -The ring is used by the Proxy server and several background processes -(like replication). +This mapping is then exposed on a per-container basis, where each container +can be assigned a specific storage policy when it is created, which remains in +effect for the lifetime of the container. Applications require minimal +awareness of storage policies to use them; once a container has been created +with a specific policy, all objects stored in it will be done so in accordance +with that policy. + +The Storage Policies feature is implemented throughout the entire code base so +it is an important concept in understanding Swift architecture. + +See :doc:`overview_policies` for complete information on storage policies. ------------- Object Server @@ -111,6 +161,19 @@ item (object, container, or account) is deleted, a tombstone is set as the latest version of the item. The replicator will see the tombstone and ensure that the item is removed from the entire system. +See :doc:`overview_replication` for complete information on replication. + +-------------- +Reconstruction +-------------- + +The reconstructor is used by Erasure Code policies and is analogous to the +replicator for Replication type policies. See :doc:`overview_erasure_code` +for complete information on both Erasure Code support as well as the +reconstructor. + +.. _architecture_updaters: + -------- Updaters -------- @@ -141,5 +204,4 @@ containers, and accounts. If corruption is found (in the case of bit rot, for example), the file is quarantined, and replication will replace the bad file from another replica. If other errors are found they are logged (for example, an object's listing can't be found on any container server it -should be). - +should be). \ No newline at end of file diff --git a/doc/source/overview_auth.rst b/doc/source/overview_auth.rst index ee0f3fb85d..d5b1be6324 100644 --- a/doc/source/overview_auth.rst +++ b/doc/source/overview_auth.rst @@ -3,12 +3,11 @@ The Auth System =============== -------- -TempAuth +Overview -------- -The auth system for Swift is loosely based on the auth system from the existing -Rackspace architecture -- actually from a few existing auth systems -- and is -therefore a bit disjointed. The distilled points about it are: +Swift supports a number of auth systems that share the following common +characteristics: * The authentication/authorization part can be an external system or a subsystem run within Swift as WSGI middleware @@ -26,51 +25,101 @@ validation. Swift will make calls to the auth system, giving the auth token to be validated. For a valid token, the auth system responds with an overall -expiration in seconds from now. 
Swift will cache the token up to the expiration +expiration time in seconds from now. To avoid the overhead in validating the same +token over and over again, Swift will cache the +token for a configurable time, but no longer than the expiration time. -The included TempAuth also has the concept of admin and non-admin users within -an account. Admin users can do anything within the account. Non-admin users can -only perform operations per container based on the container's X-Container-Read -and X-Container-Write ACLs. For more information on ACLs, see -:mod:`swift.common.middleware.acl`. +The Swift project includes two auth systems: + +- :ref:`temp_auth` +- :ref:`keystone_auth` + +It is also possible to write your own auth system as described in +:ref:`extending_auth`. + +.. _temp_auth: + +-------- +TempAuth +-------- -Additionally, if the auth system sets the request environ's swift_owner key to -True, the proxy will return additional header information in some requests, -such as the X-Container-Sync-Key for a container GET or HEAD. +TempAuth is used primarily in Swift's functional test environment and can be +used in other test environments (such as :doc:`development_saio`). It is not +recommended to use TempAuth in a production system. However, TempAuth is fully +functional and can be used as a model to develop your own auth system. + +TempAuth has the concept of admin and non-admin users +within an account. Admin users can do anything within the account. +Non-admin users can only perform read operations. However, some +privileged metadata such as X-Container-Sync-Key is not accessible to +non-admin users. + +Users with the special group ``.reseller_admin`` can operate on any account. +For an example usage please see :mod:`swift.common.middleware.tempauth`. +If a request is coming from a reseller the auth system sets the request environ +reseller_request to True. This can be used by other middlewares. + +Other users may be granted the ability to perform operations on +an account or container via ACLs. TempAuth supports two types of ACL: + +- Per container ACLs based on the + container's ``X-Container-Read`` and ``X-Container-Write`` metadata. See + :ref:`container_acls` for more information. + +- Per account ACLs based on the account's ``X-Account-Access-Control`` + metadata. For more information see :ref:`account_acls`. TempAuth will now allow OPTIONS requests to go through without a token. -The user starts a session by sending a ReST request to the auth system to -receive the auth token and a URL to the Swift system. +The TempAuth middleware is responsible for creating its own tokens. A user +makes a request containing their username and password and TempAuth +responds with a token. This token is then used to perform subsequent +requests on the user's account, containers and objects. + +.. _keystone_auth: ------------- Keystone Auth ------------- -Swift is able to authenticate against OpenStack keystone via the -:mod:`swift.common.middleware.keystoneauth` middleware. +Swift is able to authenticate against OpenStack Keystone_. In this +environment, Keystone is responsible for creating and validating +tokens. The :ref:`keystoneauth` middleware is responsible for +implementing the auth system within Swift as described here. + +The :ref:`keystoneauth` middleware supports per container based ACLs on the +container's ``X-Container-Read`` and ``X-Container-Write`` metadata. +For more information see :ref:`container_acls`. 
-In order to use the ``keystoneauth`` middleware the ``authtoken`` -middleware from python-keystoneclient will need to be configured. +The account-level ACL is not supported by Keystone auth. + +In order to use the ``keystoneauth`` middleware the ``auth_token`` +middleware from KeystoneMiddleware_ will need to be configured. The ``authtoken`` middleware performs the authentication token validation and retrieves actual user authentication information. It -can be found in the python-keystoneclient distribution. +can be found in the KeystoneMiddleware_ distribution. + +The :ref:`keystoneauth` middleware performs authorization and mapping the +Keystone roles to Swift's ACLs. -The ``keystoneauth`` middleware performs authorization and mapping the -``keystone`` roles to Swift's ACLs. +.. _KeystoneMiddleware: https://docs.openstack.org/keystonemiddleware/latest/ +.. _Keystone: https://docs.openstack.org/keystone/latest/ + +.. _configuring_keystone_auth: Configuring Swift to use Keystone ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Configuring Swift to use Keystone is relatively straight -forward. The first step is to ensure that you have the auth_token -middleware installed, distributed with keystone it can either be -dropped in your python path or installed via the keystone package. +Configuring Swift to use Keystone_ +is relatively straightforward. The first +step is to ensure that you have the ``auth_token`` middleware installed. It can +either be dropped in your python path or installed via the KeystoneMiddleware_ +package. You need at first make sure you have a service endpoint of type -``object-store`` in keystone pointing to your Swift proxy. For example +``object-store`` in Keystone pointing to your Swift proxy. For example having this in your ``/etc/keystone/default_catalog.templates`` :: catalog.RegionOne.object_store.name = Swift Service @@ -78,7 +127,7 @@ having this in your ``/etc/keystone/default_catalog.templates`` :: catalog.RegionOne.object_store.adminURL = http://swiftproxy:8080/ catalog.RegionOne.object_store.internalURL = http://swiftproxy:8080/v1/AUTH_$(tenant_id)s -On your Swift Proxy server you will want to adjust your main pipeline +On your Swift proxy server you will want to adjust your main pipeline and add auth_token and keystoneauth in your ``/etc/swift/proxy-server.conf`` like this :: @@ -88,46 +137,234 @@ and add auth_token and keystoneauth in your add the configuration for the authtoken middleware:: [filter:authtoken] - paste.filter_factory = keystoneclient.middleware.auth_token:filter_factory - auth_host = keystonehost - auth_port = 35357 - auth_protocol = http - auth_uri = http://keystonehost:5000/ - admin_tenant_name = service - admin_user = swift - admin_password = password + paste.filter_factory = keystonemiddleware.auth_token:filter_factory + www_authenticate_uri = http://keystonehost:5000/ + auth_url = http://keystonehost:5000/ + auth_plugin = password + project_domain_id = default + user_domain_id = default + project_name = service + username = swift + password = password + cache = swift.cache + include_service_catalog = False + delay_auth_decision = True The actual values for these variables will need to be set depending on -your situation. For more information, please refer to the Keystone -documentation on the ``auth_token`` middleware, but in short: +your situation, but in short: + +* ``www_authenticate_uri`` should point to a Keystone service from which users may + retrieve tokens. 
This value is used in the `WWW-Authenticate` header that + auth_token sends with any denial response. +* ``auth_url`` points to the Keystone Admin service. This information is + used by the middleware to actually query Keystone about the validity of the + authentication tokens. It is not necessary to append any Keystone API version + number to this URI. +* The auth credentials (``project_domain_id``, ``user_domain_id``, + ``username``, ``project_name``, ``password``) will be used to retrieve an + admin token. That token will be used to authorize user tokens behind the + scenes. These credentials must match the Keystone credentials for the Swift + service. The example values shown here assume a user named 'swift' with admin + role on a project named 'service', both being in the Keystone domain with id + 'default'. Refer to the `KeystoneMiddleware documentation + `_ + for other examples. + +* ``cache`` is set to ``swift.cache``. This means that the middleware + will get the Swift memcache from the request environment. +* ``include_service_catalog`` defaults to ``True`` if not set. This means + that when validating a token, the service catalog is retrieved + and stored in the ``X-Service-Catalog`` header. This is required if you use + access-rules in Application Credentials. You may also need to increase + `max_header_size`. -* Those variables beginning with ``auth_`` point to the Keystone - Admin service. This information is used by the middleware to actually - query Keystone about the validity of the - authentication tokens. -* The admin auth credentials (``admin_user``, ``admin_tenant_name``, - ``admin_password``) will be used to retrieve an admin token. That - token will be used to authorize user tokens behind the scenes. .. note:: - If support is required for unvalidated users (as with anonymous - access) or for tempurl/formpost middleware, authtoken will need - to be configured with delay_auth_decision set to 1. + The authtoken config variable ``delay_auth_decision`` must be set to + ``True``. The default is ``False``, but that breaks public access, + :ref:`staticweb`, :ref:`formpost`, :ref:`tempurl`, and authenticated + capabilities requests (using :ref:`discoverability`). + +and you can finally add the keystoneauth configuration. Here is a simple +configuration:: + + [filter:keystoneauth] + use = egg:swift#keystoneauth + operator_roles = admin, swiftoperator -and you can finally add the keystoneauth configuration:: +Use an appropriate list of roles in operator_roles. For example, in +some systems, the role ``_member_`` or ``Member`` is used to indicate +that the user is allowed to operate on project resources. + +OpenStack Service Using Composite Tokens +---------------------------------------- + +Some OpenStack services such as Cinder and Glance may use +a "service account". In this mode, you configure a separate account where +the service stores project data that it manages. This account is not used +directly by the end-user. Instead, all access is done through the service. + +To access the "service" account, the service must present two tokens: one from +the end-user and another from its own service user. Only when both tokens are +present can the account be accessed. This section describes how to set the +configuration options to correctly control access to both the "normal" and +"service" accounts. 
+ +In this example, end users use the ``AUTH_`` prefix in account names, +whereas services use the ``SERVICE_`` prefix:: + + [filter:keystoneauth] + use = egg:swift#keystoneauth + reseller_prefix = AUTH, SERVICE + operator_roles = admin, swiftoperator + SERVICE_service_roles = service + +The actual values for these variable will need to be set depending on your +situation as follows: + +* The first item in the reseller_prefix list must match Keystone's endpoint + (see ``/etc/keystone/default_catalog.templates`` above). Normally + this is ``AUTH``. +* The second item in the reseller_prefix list is the prefix used by the + OpenStack services(s). You must configure this value (``SERVICE`` in the + example) with whatever the other OpenStack service(s) use. +* Set the operator_roles option to contain a role or roles that end-user's + have on project's they use. +* Set the SERVICE_service_roles value to a role or roles that only the + OpenStack service user has. Do not use a role that is assigned to + "normal" end users. In this example, the role ``service`` is used. + The service user is granted this role to a *single* project only. You do + not need to make the service user a member of every project. + +This configuration works as follows: + +* The end-user presents a user token to an OpenStack service. The service + then makes a Swift request to the account with the ``SERVICE`` prefix. +* The service forwards the original user token with the request. It also + adds it's own service token. +* Swift validates both tokens. When validated, the user token gives the + ``admin`` or ``swiftoperator`` role(s). When validated, the service token + gives the ``service`` role. +* Swift interprets the above configuration as follows: + + * Did the user token provide one of the roles listed in operator_roles? + * Did the service token have the ``service`` role as described by the + ``SERVICE_service_roles`` options. + +* If both conditions are met, the request is granted. Otherwise, Swift + rejects the request. + +In the above example, all services share the same account. You can separate +each service into its own account. For example, the following provides a +dedicated account for each of the Glance and Cinder services. In addition, +you must assign the ``glance_service`` and ``cinder_service`` to the +appropriate service users:: [filter:keystoneauth] use = egg:swift#keystoneauth + reseller_prefix = AUTH, IMAGE, VOLUME operator_roles = admin, swiftoperator + IMAGE_service_roles = glance_service + VOLUME_service_roles = cinder_service + + +Access control using keystoneauth +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default the only users able to perform operations (e.g. create a container) +on an account are those having a Keystone role for the corresponding Keystone +project that matches one of the roles specified in the ``operator_roles`` +option. + +Users who have one of the ``operator_roles`` will be able to set container ACLs +to grant other users permission to read and/or write objects in specific +containers, using ``X-Container-Read`` and ``X-Container-Write`` headers +respectively. In addition to the ACL formats described +:mod:`here `, keystoneauth supports ACLs using the +format:: + + other_project_id:other_user_id. + +where ``other_project_id`` is the UUID of a Keystone project and +``other_user_id`` is the UUID of a Keystone user. This will allow the other +user to access a container provided their token is scoped on the other +project. 
Both ``other_project_id`` and ``other_user_id`` may be replaced with +the wildcard character ``*`` which will match any project or user respectively. + +Be sure to use Keystone UUIDs rather than names in container ACLs. + +.. note:: + + For backwards compatibility, keystoneauth will by default grant container + ACLs expressed as ``other_project_name:other_user_name`` (i.e. using + Keystone names rather than UUIDs) in the special case when both the other + project and the other user are in Keystone's default domain and the project + being accessed is also in the default domain. + + For further information see :ref:`keystoneauth` + +Users with the Keystone role defined in ``reseller_admin_role`` +(``ResellerAdmin`` by default) can operate on any account. The auth system +sets the request environ reseller_request to True if a request is coming +from a user with this role. This can be used by other middlewares. + +Troubleshooting tips for keystoneauth deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -By default the only users able to give ACL or to Create other -containers are the ones who has the Keystone role specified in the -``operator_roles`` setting. +Some common mistakes can result in API requests failing when first deploying +keystone with Swift: -This user who have one of those role will be able to give ACLs to -other users on containers, see the documentation on ACL here -:mod:`swift.common.middleware.acl`. +* Incorrect configuration of the Swift endpoint in the Keystone service. + + By default, keystoneauth expects the account part of a URL to have the form + ``AUTH_``. Sometimes the ``AUTH_`` prefix is missed when + configuring Swift endpoints in Keystone, as described in the `Install Guide + `_. This is easily diagnosed by inspecting the + proxy-server log file for a failed request URL and checking that the URL + includes the ``AUTH_`` prefix (or whatever reseller prefix may have been + configured for keystoneauth):: + + GOOD: + proxy-server: 127.0.0.1 127.0.0.1 07/Sep/2016/16/06/58 HEAD /v1/AUTH_cfb8d9d45212408b90bc0776117aec9e HTTP/1.0 204 ... + + BAD: + proxy-server: 127.0.0.1 127.0.0.1 07/Sep/2016/16/07/35 HEAD /v1/cfb8d9d45212408b90bc0776117aec9e HTTP/1.0 403 ... + + +* Incorrect configuration of the ``authtoken`` middleware options in the Swift + proxy server. + + The ``authtoken`` middleware communicates with the Keystone service to + validate tokens that are presented with client requests. To do this + ``authtoken`` must authenticate itself with Keystone using the credentials + configured in the ``[filter:authtoken]`` section of + ``/etc/swift/proxy-server.conf``. Errors in these credentials can result in + ``authtoken`` failing to validate tokens and may be revealed in the proxy + server logs by a message such as:: + + proxy-server: Identity server rejected authorization + + .. note:: + + More detailed log messaging may be seen by setting the ``authtoken`` + option ``log_level = debug``. + + The ``authtoken`` configuration options may be checked by attempting to use + them to communicate directly with Keystone using an ``openstack`` command + line. 
For example, given the ``authtoken`` configuration sample shown in + :ref:`configuring_keystone_auth`, the following command should return a + service catalog:: + + openstack --os-identity-api-version=3 --os-auth-url=http://keystonehost:5000/ \ + --os-username=swift --os-user-domain-id=default \ + --os-project-name=service --os-project-domain-id=default \ + --os-password=password catalog show object-store + + If this ``openstack`` command fails then it is likely that there is a problem + with the ``authtoken`` configuration. + +.. _extending_auth: -------------- Extending Auth @@ -135,7 +372,8 @@ Extending Auth TempAuth is written as wsgi middleware, so implementing your own auth is as easy as writing new wsgi middleware, and plugging it in to the proxy server. -The KeyStone project and the Swauth project are examples of additional auth -services. -Also, see :doc:`development_auth`. +See :doc:`development_auth` for detailed information on extending the +auth system. + + diff --git a/doc/source/overview_backing_store.rst b/doc/source/overview_backing_store.rst new file mode 100644 index 0000000000..0bb1251a4c --- /dev/null +++ b/doc/source/overview_backing_store.rst @@ -0,0 +1,273 @@ + +============================================= +Using Swift as Backing Store for Service Data +============================================= + +---------- +Background +---------- + +This section provides guidance to OpenStack Service developers for how to +store your users' data in Swift. An example of this is that a user requests +that Nova save a snapshot of a VM. Nova passes the request to Glance, +Glance writes the image to a Swift container as a set of objects. + +Throughout this section, the following terminology and concepts are used: + +* User or end-user. This is a person making a request that will result in + an OpenStack Service making a request to Swift. + +* Project (also known as Tenant). This is the unit of resource ownership. + While data such as snapshot images or block volume backups may be + stored as a result of an end-user's request, the reality is that these + are project data. + +* Service. This is a program or system used by end-users. Specifically, it + is any program or system that is capable of receiving end-user's tokens and + validating the token with the Keystone Service and has a need to store + data in Swift. Glance and Cinder are examples of such Services. + +* Service User. This is a Keystone user that has been assigned to a Service. + This allows the Service to generate and use its own tokens so that it + can interact with other Services as itself. + +* Service Project. This is a project (tenant) that is associated with a + Service. There may be a single project shared by many Services or there + may be a project dedicated to each Service. In this document, the + main purpose of the Service Project is to allow the system operator + to configure specific roles for each Service User. + +------------------------------- +Alternate Backing Store Schemes +------------------------------- + +There are three schemes described here: + +* Dedicated Service Account (Single Tenant) + + Your Service has a dedicated Service Project (hence a single dedicated + Swift account). Data for all users and projects are stored in this + account. Your Service must have a user assigned to it (the Service User). 
+ When you have data to store on behalf of one of your users, you use the + Service User credentials to get a token for the Service Project and + request Swift to store the data in the Service Project. + + With this scheme, data for all users is stored in a single account. This + is transparent to your users and since the credentials for the Service User + are typically not shared with anyone, your users' cannot access their + data by making a request directly to Swift. However, since data belonging + to all users is stored in one account, it presents a single point of + vulnerably to accidental deletion or a leak of the service-user + credentials. + +* Multi Project (Multi Tenant) + + Data belonging to a project is stored in the Swift account + associated with the project. Users make requests to your Service using + a token scoped to a project in the normal way. You can then use this + same token to store the user data in the project's Swift account. + + The effect is that data is stored in multiple projects (aka tenants). + Hence this scheme has been known as the "multi tenant" scheme. + + With this scheme, access is controlled by Keystone. The users must + have a role that allows them to perform the request to your Service. In + addition, they must have a role that also allows them to store data in + the Swift account. By default, the admin or swiftoperator roles are + used for this purpose (specific systems may use other role names). If the + user does not have the appropriate roles, when your Service attempts + to access Swift, the operation will fail. + + Since you are using the user's token to access the data, it follows that + the user can use the same token to access Swift directly -- bypassing your + Service. When end-users are browsing containers, they will also see + your Service's containers and objects -- and may potentially delete + the data. Conversely, there is no single account where all data so leakage + of credentials will only affect a single project/tenant. + +* Service Prefix Account + + Data belonging to a project is stored in a Swift account associated + with the project. This is similar to the Multi Project scheme described + above. However, the Swift account is different than the account that + users access. Specifically, it has a different account prefix. For example, + for the project 1234, the user account is named AUTH_1234. Your Service uses + a different account, for example, SERVICE_1234. + + To access the SERVICE_1234 account, you must present two tokens: the user's + token is put in the X-Auth-Token header. You present your Service's token + in the X-Service-Token header. Swift is configured such that only when both + tokens are presented will it allow access. Specifically, the user cannot + bypass your Service because they only have their own token. Conversely, your + Service can only access the data while it has a copy of the user's token -- + the Service's token by itself will not grant access. + + The data stored in the Service Prefix Account cannot be seen by end-users. + So they cannot delete this data -- they can only access the data if they + make a request through your Service. The data is also more secure. To make + an unauthorized access, someone would need to compromise both an end-user's + and your Service User credentials. Even then, this would only expose one + project -- not other projects. + +The Service Prefix Account scheme combines features of the Dedicated Service +Account and Multi Project schemes. 
+It has the private, dedicated,
+characteristics of the Dedicated Service Account scheme but does not present
+a single point of attack. Using the Service Prefix Account scheme is a little
+more involved than the other schemes, so the rest of this document describes
+it in more detail.
+
+-------------------------------
+Service Prefix Account Overview
+-------------------------------
+
+The following diagram shows the flow through the system from the end-user,
+to your Service and then onto Swift::
+
+      client
+         \
+          \   <request>: <request path>
+           \  x-auth-token: <user token>
+            \
+          SERVICE
+             \
+              \   PUT: /v1/SERVICE_1234/<container>/<object>
+               \  x-auth-token: <user token>
+                \ x-service-token: <service token>
+                 \
+                Swift
+
+The sequence of events and actions is as follows:
+
+* Request arrives at your Service
+
+* The <user token> is validated by the keystonemiddleware.auth_token
+  middleware. The user's role(s) are used to determine if the user
+  can perform the request. See :doc:`overview_auth` for technical
+  information on the authentication system.
+
+* As part of this request, your Service needs to access Swift (either to
+  write or read a container or object). In this example, you want to perform
+  a PUT on <container>/<object>.
+
+* In the wsgi environment, the auth_token module will have populated the
+  HTTP_X_SERVICE_CATALOG item. This lists the Swift endpoint and account.
+  This is something such as https://<netloc>/v1/AUTH_1234 where ``AUTH_``
+  is a prefix and ``1234`` is the project id.
+
+* The ``AUTH_`` prefix is the default value. However, your system may use a
+  different prefix. To determine the actual prefix, search for the first
+  underscore ('_') character in the account name. If there is no underscore
+  character in the account name, this means there is no prefix.
+
+* Your Service should have a configuration parameter that provides the
+  appropriate prefix to use for storing data in Swift. There is more
+  discussion of this below, but for now assume the prefix is ``SERVICE_``.
+
+* Replace the prefix (``AUTH_`` in above examples) in the path with
+  ``SERVICE_``, so the full URL to access the object becomes
+  https://<netloc>/v1/SERVICE_1234/<container>/<object>.
+
+* Make the request to Swift, using this URL. In the X-Auth-Token header place
+  a copy of the <user token>. In the X-Service-Token header, place your
+  Service's token. If you use python-swiftclient you can achieve this
+  by:
+
+  * Putting the URL in the ``preauthurl`` parameter
+  * Putting the <user token> in the ``preauthtoken`` parameter
+  * Adding the X-Service-Token to the ``headers`` parameter
+
+
+Using the HTTP_X_SERVICE_CATALOG to get Swift Account Name
+----------------------------------------------------------
+
+The auth_token middleware populates the wsgi environment with information when
+it validates the user's token. The HTTP_X_SERVICE_CATALOG item is a JSON
+string containing details of the OpenStack endpoints. For Swift, this also
+contains the project's Swift account name. Here is an example of a catalog
+entry for Swift::
+
+    "serviceCatalog": [
+        ...
+        {
+            ....
+            "type": "object-store",
+            "endpoints": [
+                ...
+                {
+                    ...
+                    "publicURL": "https://<netloc>/v1/AUTH_1234",
+                    "region": "<region name>"
+                    ...
+                }
+                ...
+            ...
+        }
+    }
+
+To get the End-user's account:
+
+* Look for an entry with ``type`` of ``object-store``
+
+* If there are several regions, there will be several endpoints. Use the
+  appropriate region name and select the ``publicURL`` item.
+
+* The Swift account name is the final item in the path ("AUTH_1234" in this
+  example).
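
A rough sketch of these steps using `python-swiftclient` is shown below. It is
illustrative only: the helper names are invented for this example, the
``SERVICE_`` prefix is assumed to match the ``reseller_prefix`` configured in
your cluster, and error handling is omitted::

    import json

    from swiftclient.client import Connection


    def service_account_url(environ, service_prefix='SERVICE_'):
        # HTTP_X_SERVICE_CATALOG is the JSON string placed in the WSGI
        # environment by the auth_token middleware.
        catalog = json.loads(environ['HTTP_X_SERVICE_CATALOG'])
        swift = next(e for e in catalog if e.get('type') == 'object-store')
        # Pick the endpoint for your region; this sketch takes the first one.
        public_url = swift['endpoints'][0]['publicURL']
        # e.g. https://<netloc>/v1/AUTH_1234 -> https://<netloc>/v1/SERVICE_1234
        base, account = public_url.rsplit('/', 1)
        project_id = account.split('_', 1)[1] if '_' in account else account
        return '%s/%s%s' % (base, service_prefix, project_id)


    def store_for_user(environ, user_token, service_token, container, obj, data):
        # preauthurl/preauthtoken make swiftclient use the URL and the user's
        # token as given; the Service token rides along in X-Service-Token.
        conn = Connection(preauthurl=service_account_url(environ),
                          preauthtoken=user_token, retries=3)
        service_headers = {'X-Service-Token': service_token}
        conn.put_container(container, headers=service_headers)
        conn.put_object(container, obj, contents=data, headers=service_headers)

Reads work the same way: present the same two tokens against the
``SERVICE_``-prefixed URL.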
+ +Getting a Service Token +----------------------- + +A Service Token is no different than any other token and is requested +from Keystone using user credentials and project in the usual way. The core +requirement is that your Service User has the appropriate role. In practice: + +* Your Service must have a user assigned to it (the Service User). + +* Your Service has a project assigned to it (the Service Project). + +* The Service User must have a role on the Service Project. This role is + distinct from any of the normal end-user roles. + +* The role used must the role configured in the /etc/swift/proxy-server.conf. + This is the ``_service_roles`` option. In this example, the role + is the ``service`` role:: + + [keystoneauth] + reseller_prefix = AUTH_, SERVICE_ + SERVICE_service_role = service + +The ``service`` role should only be granted to OpenStack Services. It should +not be granted to users. + +Single or multiple Service Prefixes? +------------------------------------ + +Most of the examples used in this document used a single prefix. The +prefix, ``SERVICE`` was used. By using a single prefix, an operator is +allowing all OpenStack Services to share the same account for data +associated with a given project. For test systems or deployments well protected +on private firewalled networks, this is appropriate. + +However, if one Service is compromised, that Service can access +data created by another Service. To prevent this, multiple Service Prefixes may +be used. This also requires that the operator configure multiple service +roles. For example, in a system that has Glance and Cinder, the following +Swift configuration could be used:: + + [keystoneauth] + reseller_prefix = AUTH_, IMAGE_, BLOCK_ + IMAGE_service_roles = image_service + BLOCK_service_roles = block_service + +The Service User for Glance would be granted the ``image_service`` role on its +Service Project and the Cinder Service user is granted the ``block_service`` +role on its project. In this scheme, if the Cinder Service was compromised, +it would not be able to access any Glance data. + +Container Naming +---------------- + +Since a single Service Prefix is possible, container names should be prefixed +with a unique string to prevent name clashes. We suggest you use the service +type field (as used in the service catalog). For example, The Glance Service +would use "image" as a prefix. diff --git a/doc/source/overview_container_sharding.rst b/doc/source/overview_container_sharding.rst new file mode 100644 index 0000000000..f834b5946e --- /dev/null +++ b/doc/source/overview_container_sharding.rst @@ -0,0 +1,694 @@ +.. _sharding_doc: + +================== +Container Sharding +================== + +Container sharding is an operator controlled feature that may be used to shard +very large container databases into a number of smaller shard containers + +.. note:: + + It is strongly recommended that operators gain experience of sharding + containers in a non-production cluster before using in production. + + The sharding process involves moving all sharding container database + records via the container replication engine; the time taken to complete + sharding is dependent upon the existing cluster load and the performance of + the container database being sharded. + + There is currently no documented process for reversing the sharding + process once sharding has been enabled. + + +---------- +Background +---------- +The metadata for each container in Swift is stored in an SQLite database. 
This +metadata includes: information about the container such as its name, +modification time and current object count; user metadata that may been written +to the container by clients; a record of every object in the container. The +container database object records are used to generate container listings in +response to container GET requests; each object record stores the object's +name, size, hash and content-type as well as associated timestamps. + +As the number of objects in a container increases then the number of object +records in the container database increases. Eventually the container database +performance starts to degrade and the time taken to update an object record +increases. This can result in object updates timing out, with a corresponding +increase in the backlog of pending :ref:`asynchronous updates +` on object servers. Container databases are typically +replicated on several nodes and any database performance degradation can also +result in longer :doc:`container replication ` times. + +The point at which container database performance starts to degrade depends +upon the choice of hardware in the container ring. Anecdotal evidence suggests +that containers with tens of millions of object records have noticeably +degraded performance. + +This performance degradation can be avoided by ensuring that clients use an +object naming scheme that disperses objects across a number of containers +thereby distributing load across a number of container databases. However, that +is not always desirable nor is it under the control of the cluster operator. + +Swift's container sharding feature provides the operator with a mechanism to +distribute the load on a single client-visible container across multiple, +hidden, shard containers, each of which stores a subset of the container's +object records. Clients are unaware of container sharding; clients continue to +use the same API to access a container that, if sharded, maps to a number of +shard containers within the Swift cluster. + +------------------------ +Deployment and operation +------------------------ + +Upgrade Considerations +---------------------- + +It is essential that all servers in a Swift cluster have been upgraded to +support the container sharding feature before attempting to shard a container. + +Identifying containers in need of sharding +------------------------------------------ + +Container sharding is currently initiated by the ``swift-manage-shard-ranges`` +CLI tool :ref:`described below `. Operators must +first identify containers that are candidates for sharding. To assist with +this, the :ref:`sharder_daemon` inspects the size of containers that it visits +and writes a list of sharding candidates to recon cache. For example:: + + "sharding_candidates": { + "found": 1, + "top": [ + { + "account": "AUTH_test", + "container": "c1", + "file_size": 497763328, + "meta_timestamp": "1525346445.31161", + "node_index": 2, + "object_count": 3349028, + "path": , + "root": "AUTH_test/c1" + } + ] + } + +A container is considered to be a sharding candidate if its object count is +greater than or equal to the ``shard_container_threshold`` option. +The number of candidates reported is limited to a number configured by the +``recon_candidates_limit`` option such that only the largest candidate +containers are included in the ``sharding_candidates`` data. + + +.. _swift-manage-shard-ranges: + +``swift-manage-shard-ranges`` CLI tool +-------------------------------------- + +.. 
automodule:: swift.cli.manage_shard_ranges + :members: + :show-inheritance: + + +.. _sharder_daemon: + +``container-sharder`` daemon +---------------------------- + +Once sharding has been enabled for a container, the act of sharding is +performed by the :ref:`container-sharder`. The :ref:`container-sharder` daemon +must be running on all container servers. The ``container-sharder`` daemon +periodically visits each container database to perform any container sharding +tasks that are required. + +The ``container-sharder`` daemon requires a ``[container-sharder]`` config +section to exist in the container server configuration file; a sample config +section is shown in the `container-server.conf-sample` file. + +.. note:: + + The ``auto_shard`` option is currently **NOT** recommended for production + systems and should be set to ``false`` (the default value). + + Several of the ``[container-sharder]`` config options are only significant + when the ``auto_shard`` option is enabled. This option enables the + ``container-sharder`` daemon to automatically identify containers that are + candidates for sharding and initiate the sharding process, instead of using + the ``swift-manage-shard-ranges`` tool. + +The container sharder uses an internal client and therefore requires an +internal client configuration file to exist. By default the internal-client +configuration file is expected to be found at +`/etc/swift/internal-client.conf`. An alternative location for the +configuration file may be specified using the ``internal_client_conf_path`` +option in the ``[container-sharder]`` config section. + +The content of the internal-client configuration file should be the same as the +`internal-client.conf-sample` file. In particular, the internal-client +configuration should have:: + + account_autocreate = True + +in the ``[proxy-server]`` section. + +A container database may require several visits by the ``container-sharder`` +daemon before it is fully sharded. On each visit the ``container-sharder`` +daemon will move a subset of object records to new shard containers by cleaving +new shard container databases from the original. By default, two shards are +processed per visit; this number may be configured by the ``cleave_batch_size`` +option. + +The ``container-sharder`` daemon periodically writes progress data for +containers that are being sharded to recon cache. For example:: + + "sharding_in_progress": { + "all": [ + { + "account": "AUTH_test", + "active": 0, + "cleaved": 2, + "container": "c1", + "created": 5, + "db_state": "sharding", + "error": null, + "file_size": 26624, + "found": 0, + "meta_timestamp": "1525349617.46235", + "node_index": 1, + "object_count": 3349030, + "path": , + "processing_time": 0.00381, + "root": "AUTH_test/c1", + "state": "sharding", + "tombstones": -1, + "total_replicate_time": 0.07549, + "total_sharding_time": 210.091, + } + ] + } + +This example indicates that from a total of 7 shard ranges, 2 have been cleaved +whereas 5 remain in created state waiting to be cleaved. + +Shard containers are created in an internal account and not visible to clients. +By default, shard containers for an account ``AUTH_test`` are created in the +internal account ``.shards_AUTH_test``. + +Once a container has started sharding, object updates to that container may be +redirected to the shard container. 
The ``container-sharder`` daemon is also +responsible for sending updates of a shard's object count and bytes_used to the +original container so that aggegrate object count and bytes used values can be +returned in responses to client requests. + +.. note:: + + The ``container-sharder`` daemon must continue to run on all container + servers in order for shards object stats updates to be generated. + + +-------------- +Under the hood +-------------- + +Terminology +----------- + +================== ==================================================== +Name Description +================== ==================================================== +Root container The original container that lives in the + user's account. It holds references to its + shard containers. +Retiring DB The original database file that is to be sharded. +Fresh DB A database file that will replace the retiring + database. +Epoch A timestamp at which the fresh DB is created; the + epoch value is embedded in the fresh DB filename. +Shard range A range of the object namespace defined by a lower + bound and upper bound. +Shard container A container that holds object records for a shard + range. Shard containers exist in a hidden account + mirroring the user's account. +Parent container The container from which a shard container has been + cleaved. When first sharding a root container each + shard's parent container will be the root container. + When sharding a shard container each shard's parent + container will be the sharding shard container. +Misplaced objects Items that don't belong in a container's shard + range. These will be moved to their correct + location by the container-sharder. +Cleaving The act of moving object records within a shard + range to a shard container database. +Shrinking The act of merging a small shard container into + another shard container in order to delete the + small shard container. +Donor The shard range that is shrinking away. +Acceptor The shard range into which a donor is merged. +================== ==================================================== + + +Finding shard ranges +-------------------- + +The end goal of sharding a container is to replace the original container +database which has grown very large with a number of shard container databases, +each of which is responsible for storing a range of the entire object +namespace. The first step towards achieving this is to identify an appropriate +set of contiguous object namespaces, known as shard ranges, each of which +contains a similar sized portion of the container's current object content. + +Shard ranges cannot simply be selected by sharding the namespace uniformly, +because object names are not guaranteed to be distributed uniformly. If the +container were naively sharded into two shard ranges, one containing all +object names up to `m` and the other containing all object names beyond `m`, +then if all object names actually start with `o` the outcome would be an +extremely unbalanced pair of shard containers. + +It is also too simplistic to assume that every container that requires sharding +can be sharded into two. This might be the goal in the ideal world, but in +practice there will be containers that have grown very large and should be +sharded into many shards. Furthermore, the time required to find the exact +mid-point of the existing object names in a large SQLite database would +increase with container size. 
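
Before describing the approach Swift takes, the following toy sketch (not the
sharder's actual code) makes the imbalance problem concrete and illustrates the
alternative described next: choosing bounds by object count rather than by
splitting the name space::

    # 1000 object names that all happen to start with 'o'.
    names = sorted('obj-%06d' % i for i in range(1000))

    # Naively splitting the namespace at 'm' puts everything in one shard.
    left = [n for n in names if n <= 'm']
    right = [n for n in names if n > 'm']
    print(len(left), len(right))    # 0 1000

    # Choosing bounds by row count gives balanced ranges whatever the names.
    N = 250
    bounds = [names[i - 1] for i in range(N, len(names), N)]
    print(bounds)                   # upper bounds of the first three ranges;
                                    # the final range is unbounded above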
+
+For these reasons, shard ranges of size `N` are found by searching for the
+`Nth` object in the database table, sorted by object name, and then searching
+for the `(2 * N)th` object, and so on until all objects have been searched. For
+a container that has exactly `2N` objects, the end result is the same as
+sharding the container at the midpoint of its object names. In practice
+sharding would typically be enabled for containers with greater than `2N`
+objects and more than two shard ranges will be found, the last one probably
+containing fewer than `N` objects. With containers having large multiples of
+`N` objects, shard ranges can be identified in batches, which enables a more
+scalable solution.
+
+To illustrate this process, consider a very large container in a user account
+``acct`` that is a candidate for sharding:
+
+.. image:: images/sharding_unsharded.svg
+
+The :ref:`swift-manage-shard-ranges` tool ``find`` sub-command searches the
+object table for the `Nth` object whose name will become the upper bound of the
+first shard range, and the lower bound of the second shard range. The lower
+bound of the first shard range is the empty string.
+
+For the purposes of this example the first upper bound is `cat`:
+
+.. image:: images/sharding_scan_basic.svg
+
+:ref:`swift-manage-shard-ranges` continues to search the container to find
+further shard ranges, with the final upper bound also being the empty string.
+
+Enabling sharding
+-----------------
+
+Once shard ranges have been found the :ref:`swift-manage-shard-ranges`
+``replace`` sub-command is used to insert them into the `shard_ranges` table
+of the container database. In addition to its lower and upper bounds, each
+shard range is given a unique name.
+
+The ``enable`` sub-command then creates some final state required to initiate
+sharding the container, including a special shard range record referred to as
+the container's `own_shard_range` whose name is equal to the container's path.
+This is used to keep a record of the object namespace that the container
+covers, which for user containers is always the entire namespace. Sharding of
+the container will only begin when its own shard range's state has been set to
+``SHARDING``.
+
+The :class:`~swift.common.utils.ShardRange` class
+--------------------------------------------------
+
+The :class:`~swift.common.utils.ShardRange` class provides methods for
+interacting with the attributes and state of a shard range. The class
+encapsulates the following properties:
+
+* The name of the shard range which is also the name of the shard container
+  used to hold object records in its namespace.
+* Lower and upper bounds which define the object namespace of the shard range.
+* A deleted flag.
+* A timestamp at which the bounds and deleted flag were last modified.
+* The object stats for the shard range i.e. object count and bytes used.
+* A timestamp at which the object stats were last modified.
+* The state of the shard range, and an epoch, which is the timestamp used in
+  the shard container's database file name.
+* A timestamp at which the state and epoch were last modified.
+
+A shard range progresses through the following states:
+
+* FOUND: the shard range has been identified in the container that is to be
+  sharded but no resources have been created for it.
+* CREATED: a shard container has been created to store the contents of the
+  shard range.
+* CLEAVED: the sharding container's contents for the shard range have been
+  copied to the shard container from *at least one replica* of the sharding
+  container.
+* ACTIVE: a sharding container's constituent shard ranges are moved to this
+  state when all shard ranges in the sharding container have been cleaved.
+* SHRINKING: the shard range has been enabled for shrinking; or
+* SHARDING: the shard range has been enabled for sharding into further
+  sub-shards.
+* SHARDED: the shard range has completed sharding or shrinking; the container
+  will typically now have a number of constituent ACTIVE shard ranges.
+
+.. note::
+
+   Shard range state represents the most advanced state of the shard range on
+   any replica of the container. For example, a shard range in CLEAVED state
+   may not have completed cleaving on all replicas but has cleaved on at least
+   one replica.
+
+Fresh and retiring database files
+---------------------------------
+
+As alluded to earlier, writing to a large container causes increased latency
+for the container servers. Once sharding has been initiated on a container it
+is desirable to stop writing to the large database; ultimately it will be
+unlinked. This is primarily achieved by redirecting object updates to new shard
+containers as they are created (see :ref:`redirecting_updates` below), but some
+object updates may still need to be accepted by the root container and other
+container metadata must still be modifiable.
+
+To render the large `retiring` database effectively read-only, when the
+:ref:`sharder_daemon` finds a container with a set of shard range records,
+including an `own_shard_range`, it first creates a fresh database file which
+will ultimately replace the existing `retiring` database. For a retiring DB
+whose filename is::
+
+    <hash>.db
+
+the fresh database file name is of the form::
+
+    <hash>_<epoch>.db
+
+where `epoch` is a timestamp stored in the container's `own_shard_range`.
+
+The fresh DB has a copy of the shard ranges table from the retiring DB and all
+other container metadata apart from the object records. Once a fresh DB file
+has been created it is used to store any new object updates and no more object
+records are written to the retiring DB file.
+
+Once the sharding process has completed, the retiring DB file will be unlinked
+leaving only the fresh DB file in the container's directory. There are
+therefore three states that the container DB directory may be in during the
+sharding process: UNSHARDED, SHARDING and SHARDED.
+
+.. image:: images/sharding_db_states.svg
+
+If the container ever shrinks to the point that it has no shards then the fresh
+DB starts to store object records, behaving the same as an unsharded container.
+This is known as the COLLAPSED state.
+
+In summary, the DB states that any container replica may be in are:
+
+- UNSHARDED - In this state there is just one standard container database. All
+  containers are originally in this state.
+- SHARDING - There are now two databases, the retiring database and a fresh
+  database. The fresh database stores any metadata, container level stats,
+  an object holding table, and a table that stores shard ranges.
+- SHARDED - There is only one database, the fresh database, which has one or
+  more shard ranges in addition to its own shard range. The retiring database
+  has been unlinked.
+- COLLAPSED - There is only one database, the fresh database, which has only
+  its own shard range and stores object records.
+
+..
note:: + + DB state is unique to each replica of a container and is not necessarily + synchronised with shard range state. + +Creating shard containers +------------------------- + +The :ref:`sharder_daemon` next creates a shard container for each shard range +using the shard range name as the name of the shard container: + +.. image:: /images/sharding_cleave_basic.svg + +Each shard container has an `own_shard_range` record which has the +lower and upper bounds of the object namespace for which it is responsible, and +a reference to the sharding user container, which is referred to as the +`root_container`. Unlike the `root_container`, the shard container's +`own_shard_range` does not cover the entire namepsace. + +A shard range name takes the form ``/`` where `` +is a hidden account and `` is a container name that is derived from +the root container. + +The account name `` used for shard containers is formed by prefixing +the user account with the string ``.shards_``. This avoids namespace collisions +and also keeps all the shard containers out of view from users of the account. + +The container name for each shard container has the form:: + + --- + +where `root container name` is the name of the user container to which the +contents of the shard container belong, `parent container` is the name of the +container from which the shard is being cleaved, `timestamp` is the time at +which the shard range was created and `shard index` is the position of the +shard range in the name-ordered list of shard ranges for the `parent +container`. + +When sharding a user container the parent container name will be the same as +the root container. However, if a *shard container* grows to a size that it +requires sharding, then the parent container name for its shards will be the +name of the sharding shard container. + +For example, consider a user container with path ``AUTH_user/c`` which is +sharded into two shard containers whose name will be:: + + .shards_AUTH_user/c--1234512345.12345-0 + .shards_AUTH_user/c--1234512345.12345-1 + +If the first shard container is subsequently sharded into a further two shard +containers then they will be named:: + + .shards_AUTH_user/c--1234567890.12345-0)>-1234567890.12345-0 + .shards_AUTH_user/c--1234567890.12345-0)>-1234567890.12345-1 + +This naming scheme guarantees that shards, and shards of shards, each have a +unique name of bounded length. + + +Cleaving shard containers +------------------------- + +Having created empty shard containers the sharder daemon will proceed to cleave +objects from the retiring database to each shard range. Cleaving occurs in +batches of two (by default) shard ranges, so if a container has more than two +shard ranges then the daemon must visit it multiple times to complete cleaving. + +To cleave a shard range the daemon creates a shard database for the shard +container on a local device. This device may be one of the shard container's +primary nodes but often it will not. Object records from the corresponding +shard range namespace are then copied from the retiring DB to this shard DB. + +Swift's container replication mechanism is then used to replicate the shard DB +to its primary nodes. Checks are made to ensure that the new shard container DB +has been replicated to a sufficient number of its primary nodes before it is +considered to have been successfully cleaved. 
By default the daemon requires +successful replication of a new shard broker to at least a quorum of the +container rings replica count, but this requirement can be tuned using the +``shard_replication_quorum`` option. + +Once a shard range has been successfully cleaved from a retiring database the +daemon transitions its state to ``CLEAVED``. It should be noted that this state +transition occurs as soon as any one of the retiring DB replicas has cleaved +the shard range, and therefore does not imply that all retiring DB replicas +have cleaved that range. The significance of the state transition is that the +shard container is now considered suitable for contributing to object listings, +since its contents are present on a quorum of its primary nodes and are the +same as at least one of the retiring DBs for that namespace. + +Once a shard range is in the ``CLEAVED`` state, the requirement for +'successful' cleaving of other instances of the retirng DB may optionally be +relaxed since it is not so imperative that their contents are replicated +*immediately* to their primary nodes. The ``existing_shard_replication_quorum`` +option can be used to reduce the quorum required for a cleaved shard range to +be considered successfully replicated by the sharder daemon. + +.. note:: + + Once cleaved, shard container DBs will continue to be replicated by the + normal `container-replicator` daemon so that they will eventually be fully + replicated to all primary nodes regardless of any replication quorum options + used by the sharder daemon. + +The cleaving progress of each replica of a retiring DB must be +tracked independently of the shard range state. This is done using a per-DB +CleavingContext object that maintains a cleaving cursor for the retiring DB +that it is associated with. The cleaving cursor is simply the upper bound of +the last shard range to have been cleaved *from that particular retiring DB*. + +Each CleavingContext is stored in the sharding container's sysmeta under a key +that is the ``id`` of the retiring DB. Since all container DB files have a +unique ``id``, this guarantees that each retiring DB will have a unique +CleavingContext. Furthermore, if the retiring DB file is changed, for example +by an rsync_then_merge replication operation which might change the contents of +the DB's object table, then it will get a new unique CleavingContext. + +A CleavingContext maintains other state that is used to ensure that a retiring +DB is only considered to be fully cleaved, and ready to be deleted, if *all* of +its object rows have been cleaved to a shard range. + +Once all shard ranges have been cleaved from the retiring DB it is deleted. The +container is now represented by the fresh DB which has a table of shard range +records that point to the shard containers that store the container's object +records. + +.. _redirecting_updates: + +Redirecting object updates +-------------------------- + +Once a shard container exists, object updates arising from new client requests +and async pending files are directed to the shard container instead of the root +container. This takes load off of the root container. + +For a sharded (or partially sharded) container, when the proxy receives a new +object request it issues a GET request to the container for data describing a +shard container to which the object update should be sent. The proxy then +annotates the object request with the shard container location so that the +object server will forward object updates to the shard container. 
If those +updates fail then the async pending file that is written on the object server +contains the shard container location. + +When the object updater processes async pending files for previously failed +object updates, it may not find a shard container location. In this case the +updater sends the update to the `root container`, which returns a redirection +response with the shard container location. + +.. note:: + + Object updates are directed to shard containers as soon as they exist, even + if the retiring DB object records have not yet been cleaved to the shard + container. This prevents further writes to the retiring DB and also avoids + the fresh DB being polluted by new object updates. The goal is to + ultimately have all object records in the shard containers and none in the + root container. + +Building container listings +--------------------------- + +Listing requests for a sharded container are handled by querying the shard +containers for components of the listing. The proxy forwards the client listing +request to the root container, as it would for an unsharded container, but the +container server responds with a list of shard ranges rather than objects. The +proxy then queries each shard container in namespace order for their listing, +until either the listing length limit is reached or all shard ranges have been +listed. + +While a container is still in the process of sharding, only *cleaved* shard +ranges are used when building a container listing. Shard ranges that have not +yet cleaved will not have any object records from the root container. The root +container continues to provide listings for the uncleaved part of its +namespace. + +.. note:: + + New object updates are redirected to shard containers that have not yet been + cleaved. These updates will not therefore be included in container listings + until their shard range has been cleaved. + +Example request redirection +--------------------------- + +As an example, consider a sharding container in which 3 shard ranges have been +found ending in cat, giraffe and igloo. Their respective shard containers have +been created so update requests for objects up to "igloo" are redirected to the +appropriate shard container. The root DB continues to handle listing requests +and update requests for any object name beyond "igloo". + +.. image:: images/sharding_scan_load.svg + +The sharder daemon cleaves objects from the retiring DB to the shard range DBs; +it also moves any misplaced objects from the root container's fresh DB to the +shard DB. Cleaving progress is represented by the blue line. Once the first +shard range has been cleaved listing requests for that namespace are directed +to the shard container. The root container still provides listings for the +remainder of the namespace. + +.. image:: images/sharding_cleave1_load.svg + +The process continues: the sharder cleaves the next range and a new range is +found with upper bound of "linux". Now the root container only needs to handle +listing requests up to "giraffe" and update requests for objects whose name is +greater than "linux". Load will continue to diminish on the root DB and be +dispersed across the shard DBs. + +.. image:: images/sharding_cleave2_load.svg + + +Container replication +--------------------- + +Shard range records are replicated between container DB replicas in much the +same way as object records are for unsharded containers. 
However, the usual +replication of object records between replicas of a container is halted as soon +as a container is capable of being sharded. Instead, object records are moved +to their new locations in shard containers. This avoids unnecessary replication +traffic between container replicas. + +To facilitate this, shard ranges are both 'pushed' and 'pulled' during +replication, prior to any attempt to replicate objects. This means that the +node initiating replication learns about shard ranges from the destination node +early during the replication process and is able to skip object replication if +it discovers that it has shard ranges and is able to shard. + +.. note:: + + When the destination DB for container replication is missing then the + 'complete_rsync' replication mechanism is still used and in this case only + both object records and shard range records are copied to the destination + node. + +Container deletion +------------------ + +Sharded containers may be deleted by a ``DELETE`` request just like an +unsharded container. A sharded container must be empty before it can be deleted +which implies that all of its shard containers must have reported that they are +empty. + +Shard containers are *not* immediately deleted when their root container is +deleted; the shard containers remain undeleted so that they are able to +continue to receive object updates that might arrive after the root container +has been deleted. Shard containers continue to update their deleted root +container with their object stats. If a shard container does receive object +updates that cause it to no longer be empty then the root container will no +longer be considered deleted once that shard container sends an object stats +update. + + +Sharding a shard container +-------------------------- + +A shard container may grow to a size that requires it to be sharded. +``swift-manage-shard-ranges`` may be used to identify shard ranges within a +shard container and enable sharding in the same way as for a root container. +When a shard is sharding it notifies the root container of its shard ranges so +that the root container can start to redirect object updates to the new +'sub-shards'. When the shard has completed sharding the root is aware of all +the new sub-shards and the sharding shard deletes its shard range record in the +root container shard ranges table. At this point the root container is aware of +all the new sub-shards which collectively cover the namespace of the +now-deleted shard. + +There is no hierarchy of shards beyond the root container and its immediate +shards. When a shard shards, its sub-shards are effectively re-parented with +the root container. + + +Shrinking a shard container +--------------------------- + +A shard container's contents may reduce to a point where the shard container is +no longer required. If this happens then the shard container may be shrunk into +another shard range. Shrinking is achieved in a similar way to sharding: an +'acceptor' shard range is written to the shrinking shard container's shard +ranges table; unlike sharding, where shard ranges each cover a subset of the +sharding container's namespace, the acceptor shard range is a superset of the +shrinking shard range. + +Once given an acceptor shard range the shrinking shard will cleave itself to +its acceptor, and then delete itself from the root container shard ranges +table. 
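
To close this overview, the toy sketch below shows how a name-ordered list of
shard ranges maps an object name to a shard container. This is an illustration
only; in a real cluster the logic lives in ``swift.common.utils.ShardRange``
and the proxy and container servers, and the bound semantics shown (exclusive
lower, inclusive upper) are an assumption of the sketch::

    from collections import namedtuple

    # '' as an upper bound means "to the end of the namespace".
    Range = namedtuple('Range', 'container lower upper')

    shard_ranges = [
        Range('.shards_AUTH_test/c-0', '', 'cat'),
        Range('.shards_AUTH_test/c-1', 'cat', 'giraffe'),
        Range('.shards_AUTH_test/c-2', 'giraffe', ''),
    ]

    def find_shard(object_name):
        for sr in shard_ranges:
            if object_name > sr.lower and (not sr.upper or object_name <= sr.upper):
                return sr.container

    print(find_shard('aardvark'))   # .shards_AUTH_test/c-0
    print(find_shard('zebra'))      # .shards_AUTH_test/c-2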
diff --git a/doc/source/overview_container_sync.rst b/doc/source/overview_container_sync.rst index b62136d258..7413911e87 100644 --- a/doc/source/overview_container_sync.rst +++ b/doc/source/overview_container_sync.rst @@ -14,25 +14,120 @@ synchronization key. .. note:: - Container sync will sync object POSTs only if the proxy server is set to - use "object_post_as_copy = true" which is the default. So-called fast - object posts, "object_post_as_copy = false" do not update the container - listings and therefore can't be detected for synchronization. + If you are using the :ref:`Large Objects ` feature and + syncing to another cluster then you will need to ensure that manifest files + and segment files are synced. If segment files are in a different container + than their manifest then both the manifest's container and the segments' + container must be synced. The target container for synced segment files + must always have the same name as their source container in order for them + to be resolved by synced manifests. + + Be aware that manifest files may be synced before segment files even if + they are in the same container and were created after the segment files. + + In the case of :ref:`Static Large Objects `, a GET + request for a manifest whose segments have yet to be completely synced will + fail with none or only part of the large object content being returned. + + In the case of :ref:`Dynamic Large Objects `, a GET + request for a manifest whose segments have yet to be completely synced will + either fail or return unexpected (and most likely incorrect) content. .. note:: - If you are using the large objects feature you will need to ensure both - your manifest file and your segment files are synced if they happen to be - in different containers. + If you are using encryption middleware in the cluster from which objects + are being synced, then you should follow the instructions for + :ref:`container_sync_client_config` to be compatible with encryption. --------------------------------------------- -Configuring a Cluster's Allowable Sync Hosts --------------------------------------------- +.. note:: -The Swift cluster operator must allow synchronization with a set of hosts -before the user can enable container synchronization. First, the backend -container server needs to be given this list of hosts in the -container-server.conf file:: + If you are using symlink middleware in the cluster from which objects + are being synced, then you should follow the instructions for + :ref:`symlink_container_sync_client_config` to be compatible with symlinks. + + Be aware that symlinks may be synced before their targets even if they are + in the same container and were created after the target objects. In such + cases, a GET for the symlink will fail with a ``404 Not Found`` error. If + the target has been overwritten, a GET may produce an older version (for + dynamic links) or a ``409 Conflict`` error (for static links). + +-------------------------- +Configuring Container Sync +-------------------------- + +Create a ``container-sync-realms.conf`` file specifying the allowable clusters +and their information:: + + [realm1] + key = realm1key + key2 = realm1key2 + cluster_clustername1 = https://host1/v1/ + cluster_clustername2 = https://host2/v1/ + + [realm2] + key = realm2key + key2 = realm2key2 + cluster_clustername3 = https://host3/v1/ + cluster_clustername4 = https://host4/v1/ + + +Each section name is the name of a sync realm. 
A sync realm is a set of +clusters that have agreed to allow container syncing with each other. Realm +names will be considered case insensitive. + +``key`` is the overall cluster-to-cluster key used in combination with the +external users' key that they set on their containers' +``X-Container-Sync-Key`` metadata header values. These keys will be used to +sign each request the container sync daemon makes and used to validate each +incoming container sync request. + +``key2`` is optional and is an additional key incoming requests will be checked +against. This is so you can rotate keys if you wish; you move the existing ``key`` +to ``key2`` and make a new ``key`` value. + +Any values in the realm section whose names begin with ``cluster_`` will +indicate the name and endpoint of a cluster and will be used by external users in +their containers' ``X-Container-Sync-To`` metadata header values with the format +``//realm_name/cluster_name/account_name/container_name``. Realm and cluster +names are considered case insensitive. + +The endpoint is what the container sync daemon will use when sending out +requests to that cluster. Keep in mind this endpoint must be reachable by all +container servers, since that is where the container sync daemon runs. Note +that the endpoint ends with ``/v1/`` and that the container sync daemon will then +add the ``account/container/obj`` name after that. + +Distribute this ``container-sync-realms.conf`` file to all your proxy servers +and container servers. + +You also need to add the container_sync middleware to your proxy pipeline. It +needs to be after any memcache middleware and before any auth middleware. The +``[filter:container_sync]`` section only needs the ``use`` item. For example:: + + [pipeline:main] + pipeline = healthcheck proxy-logging cache container_sync tempauth proxy-logging proxy-server + + [filter:container_sync] + use = egg:swift#container_sync + +The container sync daemon will use an internal client to sync objects. Even if +you don't configure the internal client, the container sync daemon will work +with default configuration. The default configuration is the same as +``internal-client.conf-sample``. If you want to configure the internal client, +please update ``internal_client_conf_path`` in ``container-server.conf``. The +configuration file at the path will be used for the internal client. + +------------------------------------------------------- +Old-Style: Configuring a Cluster's Allowable Sync Hosts +------------------------------------------------------- + +This section is for the old-style of using container sync. See the previous +section, Configuring Container Sync, for the new-style. + +With the old-style, the Swift cluster operator must allow synchronization with +a set of hosts before the user can enable container synchronization. First, the +backend container server needs to be given this list of hosts in the +``container-server.conf`` file:: [DEFAULT] # This is a comma separated list of hosts allowed in the @@ -52,13 +147,79 @@ container-server.conf file:: # Maximum amount of time to spend syncing each container # container_time = 60 -Tracking sync progress, problems, and just general activity can only be -achieved with log processing for this first release of container -synchronization. In that light, you may wish to set the above `log_` options to -direct the container-sync logs to a different file for easier monitoring. 
-Additionally, it should be noted there is no way for an end user to detect sync -progress or problems other than HEADing both containers and comparing the -overall information. + +---------------------- +Logging Container Sync +---------------------- + +Currently, log processing is the only way to track sync progress, problems, +and even just general activity for container synchronization. In that +light, you may wish to set the above ``log_`` options to direct the +container-sync logs to a different file for easier monitoring. Additionally, it +should be noted there is no way for an end user to monitor sync progress or +detect problems other than HEADing both containers and comparing the overall +information. + + + +----------------------------- +Container Sync Statistics +----------------------------- + +Container Sync INFO level logs contain activity metrics and accounting +information for insightful tracking. +Currently two different statistics are collected: + +About once an hour or so, accumulated statistics of all operations performed +by Container Sync are reported to the log file with the following format:: + + Since (time): (sync) synced [(delete) deletes, (put) puts], (skip) skipped, (fail) failed + +time + last report time +sync + number of containers with sync turned on that were successfully synced +delete + number of successful DELETE object requests to the target cluster +put + number of successful PUT object request to the target cluster +skip + number of containers whose sync has been turned off, but are not + yet cleared from the sync store +fail + number of containers with failure (due to exception, timeout or other + reason) + +For each container synced, per container statistics are reported with the +following format:: + + Container sync report: (container), time window start: (start), time window end: %(end), puts: (puts), posts: (posts), deletes: (deletes), bytes: (bytes), sync_point1: (point1), sync_point2: (point2), total_rows: (total) + +container + account/container statistics are for +start + report start time +end + report end time +puts + number of successful PUT object requests to the target container +posts + N/A (0) +deletes + number of successful DELETE object requests to the target container +bytes + number of bytes sent over the network to the target container +point1 + progress indication - the container's ``x_container_sync_point1`` +point2 + progress indication - the container's ``x_container_sync_point2`` +total + number of objects processed at the container + +It is possible that more than one server syncs a container, therefore log files +from all servers need to be evaluated + + ---------------------------------------------------------- Using the ``swift`` tool to set up synchronized containers @@ -73,6 +234,122 @@ Using the ``swift`` tool to set up synchronized containers You must be the account admin on the account to set synchronization targets and keys. +You simply tell each container where to sync to and give it a secret +synchronization key. 
First, let's get the account details for our two cluster +accounts:: + + $ swift -A http://cluster1/auth/v1.0 -U test:tester -K testing stat -v + StorageURL: http://cluster1/v1/AUTH_208d1854-e475-4500-b315-81de645d060e + Auth Token: AUTH_tkd5359e46ff9e419fa193dbd367f3cd19 + Account: AUTH_208d1854-e475-4500-b315-81de645d060e + Containers: 0 + Objects: 0 + Bytes: 0 + + $ swift -A http://cluster2/auth/v1.0 -U test2:tester2 -K testing2 stat -v + StorageURL: http://cluster2/v1/AUTH_33cdcad8-09fb-4940-90da-0f00cbf21c7c + Auth Token: AUTH_tk816a1aaf403c49adb92ecfca2f88e430 + Account: AUTH_33cdcad8-09fb-4940-90da-0f00cbf21c7c + Containers: 0 + Objects: 0 + Bytes: 0 + +Now, let's make our first container and tell it to synchronize to a second +we'll make next:: + + $ swift -A http://cluster1/auth/v1.0 -U test:tester -K testing post \ + -t '//realm_name/clustername2/AUTH_33cdcad8-09fb-4940-90da-0f00cbf21c7c/container2' \ + -k 'secret' container1 + +The ``-t`` indicates the cluster to sync to, which is the realm name of the +section from ``container-sync-realms.conf``, followed by the cluster name from +that section (without the ``cluster_`` prefix), followed by the account and container +names we want to sync to. The ``-k`` specifies the secret key the two containers will share for +synchronization; this is the user key, the cluster key in +``container-sync-realms.conf`` will also be used behind the scenes. + +Now, we'll do something similar for the second cluster's container:: + + $ swift -A http://cluster2/auth/v1.0 -U test2:tester2 -K testing2 post \ + -t '//realm_name/clustername1/AUTH_208d1854-e475-4500-b315-81de645d060e/container1' \ + -k 'secret' container2 + +That's it. Now we can upload a bunch of stuff to the first container and watch +as it gets synchronized over to the second:: + + $ swift -A http://cluster1/auth/v1.0 -U test:tester -K testing \ + upload container1 . + photo002.png + photo004.png + photo001.png + photo003.png + + $ swift -A http://cluster2/auth/v1.0 -U test2:tester2 -K testing2 \ + list container2 + + [Nothing there yet, so we wait a bit...] + +.. note:: + + If you're an operator running :ref:`saio` and just testing, each time you + configure a container for synchronization and place objects in the + source container you will need to ensure that container-sync runs + before attempting to retrieve objects from the target container. + That is, you need to run:: + + swift-init container-sync once + +Now expect to see objects copied from the first container to the second:: + + $ swift -A http://cluster2/auth/v1.0 -U test2:tester2 -K testing2 \ + list container2 + photo001.png + photo002.png + photo003.png + photo004.png + +You can also set up a chain of synced containers if you want more than two. +You'd point 1 -> 2, then 2 -> 3, and finally 3 -> 1 for three containers. +They'd all need to share the same secret synchronization key. + +.. _`python-swiftclient`: http://github.com/openstack/python-swiftclient + +----------------------------------- +Using curl (or other tools) instead +----------------------------------- + +So what's ``swift`` doing behind the scenes? Nothing overly complicated. It +translates the ``-t `` option into an ``X-Container-Sync-To: `` +header and the ``-k `` option into an ``X-Container-Sync-Key: `` +header. 
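
The same headers can also be set from Python with `python-swiftclient` rather
than the ``swift`` CLI; here is a sketch using the example accounts above (the
equivalent curl command follows)::

    from swiftclient.client import Connection

    conn = Connection(authurl='http://cluster1/auth/v1.0',
                      user='test:tester', key='testing')
    conn.post_container('container1', headers={
        'X-Container-Sync-To': '//realm_name/clustername2/'
                               'AUTH_33cdcad8-09fb-4940-90da-0f00cbf21c7c/container2',
        'X-Container-Sync-Key': 'secret',
    })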
+ +For instance, when we created the first container above and told it to +synchronize to the second, we could have used this curl command:: + + $ curl -i -X POST -H 'X-Auth-Token: AUTH_tkd5359e46ff9e419fa193dbd367f3cd19' \ + -H 'X-Container-Sync-To: //realm_name/clustername2/AUTH_33cdcad8-09fb-4940-90da-0f00cbf21c7c/container2' \ + -H 'X-Container-Sync-Key: secret' \ + 'http://cluster1/v1/AUTH_208d1854-e475-4500-b315-81de645d060e/container1' + HTTP/1.1 204 No Content + Content-Length: 0 + Content-Type: text/plain; charset=UTF-8 + Date: Thu, 24 Feb 2011 22:39:14 GMT + +--------------------------------------------------------------------- +Old-Style: Using the ``swift`` tool to set up synchronized containers +--------------------------------------------------------------------- + +.. note:: + + The ``swift`` tool is available from the `python-swiftclient`_ library. + +.. note:: + + You must be the account admin on the account to set synchronization targets + and keys. + +This is for the old-style of container syncing using ``allowed_sync_hosts``. + You simply tell each container where to sync to and give it a secret synchronization key. First, let's get the account details for our two cluster accounts:: @@ -139,9 +416,11 @@ They'd all need to share the same secret synchronization key. .. _`python-swiftclient`: http://github.com/openstack/python-swiftclient ------------------------------------ -Using curl (or other tools) instead ------------------------------------ +---------------------------------------------- +Old-Style: Using curl (or other tools) instead +---------------------------------------------- + +This is for the old-style of container syncing using ``allowed_sync_hosts``. So what's ``swift`` doing behind the scenes? Nothing overly complicated. It translates the ``-t `` option into an ``X-Container-Sync-To: `` @@ -164,27 +443,41 @@ synchronize to the second, we could have used this curl command:: What's going on behind the scenes, in the cluster? -------------------------------------------------- -The swift-container-sync does the job of sending updates to the remote -container. - -This is done by scanning the local devices for container databases and -checking for x-container-sync-to and x-container-sync-key metadata values. -If they exist, newer rows since the last sync will trigger PUTs or DELETEs -to the other container. - -.. note:: - - The swift-container-sync process runs on each container server in - the cluster and talks to the proxy servers in the remote cluster. - Therefore, the container servers must be permitted to initiate - outbound connections to the remote proxy servers. +Container ring devices have a directory called ``containers``, where container +databases reside. In addition to ``containers``, each container ring device +also has a directory called ``sync-containers``. ``sync-containers`` holds +symlinks to container databases that were configured for container sync using +``x-container-sync-to`` and ``x-container-sync-key`` metadata keys. + +The swift-container-sync process does the job of sending updates to the remote +container. This is done by scanning ``sync-containers`` for container +databases. For each container db found, newer rows since the last sync will +trigger PUTs or DELETEs to the other container. 
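
A highly simplified sketch of that per-database pass is shown below. It is
illustrative only; the real logic, including the sync-point bookkeeping
described further down, lives in the ``swift-container-sync`` daemon::

    # For each container DB found via sync-containers, walk the rows that are
    # newer than the last recorded sync point and mirror them to the remote
    # container; send_put/send_delete stand in for the actual HTTP requests.
    def sync_rows(rows, last_sync_point, send_put, send_delete):
        for row in rows:
            if row['ROWID'] <= last_sync_point:
                continue                      # already handled on a prior pass
            if row['deleted']:
                send_delete(row['name'])      # object removed locally
            else:
                send_put(row['name'])         # object created or updated
        return rows[-1]['ROWID'] if rows else last_sync_point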
+ +``sync-containers`` is maintained as follows: +Whenever the container-server processes a PUT or a POST request that carries +``x-container-sync-to`` and ``x-container-sync-key`` metadata keys the server +creates a symlink to the container database in ``sync-containers``. Whenever +the container server deletes a synced container, the appropriate symlink +is deleted from ``sync-containers``. + +In addition to the container-server, the container-replicator process does the +job of identifying containers that should be synchronized. This is done by +scanning the local devices for container databases and checking for +``x-container-sync-to`` and ``x-container-sync-key`` metadata values. If they exist +then a symlink to the container database is created in a ``sync-containers`` +sub-directory on the same device. + +Similarly, when the container sync metadata keys are deleted, the container +server and container-replicator would take care of deleting the symlinks +from ``sync-containers``. .. note:: - Container sync will sync object POSTs only if the proxy server is set to - use "object_post_as_copy = true" which is the default. So-called fast - object posts, "object_post_as_copy = false" do not update the container - listings and therefore can't be detected for synchronization. + The swift-container-sync process runs on each container server in the + cluster and talks to the proxy servers (or load balancers) in the remote + cluster. Therefore, the container servers must be permitted to initiate + outbound connections to the remote proxy servers (or load balancers). The actual syncing is slightly more complicated to make use of the three (or number-of-replicas) main nodes for a container without each trying to @@ -195,7 +488,7 @@ Two sync points are kept in each container database. When syncing a container, the container-sync process figures out which replica of the container it has. In a standard 3-replica scenario, the process will have either replica number 0, 1, or 2. This is used to figure out -which rows are belong to this sync process and which ones don't. +which rows belong to this sync process and which ones don't. An example may help. Assume a replica count of 3 and database row IDs are 1..6. Also, assume that container-sync is running on this @@ -223,7 +516,7 @@ hash of the object name, so it's not always guaranteed to be exactly one out of every three rows, but it usually gets close. For the sake of example, let's say that this process ends up owning rows 2 and 5. -Once it's finished syncing those rows, it updates SP1 to be the +Once it's finished trying to sync those rows, it updates SP1 to be the biggest row-id that it's seen, which is 6 in this example. :: SP2 SP1 @@ -241,19 +534,23 @@ container, creating new rows in the database. :: On the next run, the container-sync starts off looking at rows with ids between SP1 and SP2. This time, there are a bunch of them. The -sync process takes the ones it *does not* own and syncs them. Again, -this is based on the hashes, so this will be everything it didn't sync -before. In this example, that's rows 0, 1, 3, 4, and 6. - -Under normal circumstances, the container-sync processes for the other -replicas will have already taken care of synchronizing those rows, so -this is a set of quick checks. However, if one of those other sync +sync process try to sync all of them. If it succeeds, it will set +SP2 to equal SP1. 
If it fails, it will set SP2 to the failed object +and will continue to try all other objects till SP1, setting SP2 to +the first object that failed. + +Under normal circumstances, the container-sync processes +will have already taken care of synchronizing all rows, between SP1 +and SP2, resulting in a set of quick checks. +However, if one of the sync processes failed for some reason, then this is a vital fallback to make sure all the objects in the container get synchronized. Without this seemingly-redundant work, any container-sync failure results in -unsynchronized objects. +unsynchronized objects. Note that the container sync will persistently +retry to sync any faulty object until success, while logging each failure. -Once it's done with the fallback rows, SP2 is advanced to SP1. :: +Once it's done with the fallback rows, and assuming no faults occurred, +SP2 is advanced to SP1. :: SP2 SP1 diff --git a/doc/source/overview_encryption.rst b/doc/source/overview_encryption.rst new file mode 100644 index 0000000000..beab7ba11d --- /dev/null +++ b/doc/source/overview_encryption.rst @@ -0,0 +1,812 @@ +================= +Object Encryption +================= + +Swift supports the optional encryption of object data at rest on storage nodes. +The encryption of object data is intended to mitigate the risk of users' data +being read if an unauthorised party were to gain physical access to a disk. + +.. note:: + + Swift's data-at-rest encryption accepts plaintext object data from the + client, encrypts it in the cluster, and stores the encrypted data. This + protects object data from inadvertently being exposed if a data drive + leaves the Swift cluster. If a user wishes to ensure that the plaintext + data is always encrypted while in transit and in storage, it is strongly + recommended that the data be encrypted before sending it to the Swift + cluster. Encrypting on the client side is the only way to ensure that the + data is fully encrypted for its entire lifecycle. + +Encryption of data at rest is implemented by middleware that may be included in +the proxy server WSGI pipeline. The feature is internal to a Swift cluster and +not exposed through the API. Clients are unaware that data is encrypted by this +feature internally to the Swift service; internally encrypted data should never +be returned to clients via the Swift API. + +The following data are encrypted while at rest in Swift: + +* Object content i.e. the content of an object PUT request's body +* The entity tag (ETag) of objects that have non-zero content +* All custom user object metadata values i.e. metadata sent using + X-Object-Meta- prefixed headers with PUT or POST requests + +Any data or metadata not included in the list above are not encrypted, +including: + +* Account, container and object names +* Account and container custom user metadata values +* All custom user metadata names +* Object Content-Type values +* Object size +* System metadata + +.. note:: + + This feature is intended to provide `confidentiality` of data that is at + rest i.e. to protect user data from being read by an attacker that gains + access to disks on which object data is stored. + + This feature is not intended to prevent undetectable `modification` + of user data at rest. + + This feature is not intended to protect against an attacker that gains + access to Swift's internal network connections, or gains access to key + material or is able to modify the Swift code running on Swift nodes. + +.. 
_encryption_deployment: + +------------------------ +Deployment and operation +------------------------ + +Encryption is deployed by adding two middleware filters to the proxy +server WSGI pipeline and including their respective filter configuration +sections in the `proxy-server.conf` file. :ref:`Additional steps +` are required if the container sync feature is +being used. + +The `keymaster` and `encryption` middleware filters must be to the right of all +other middleware in the pipeline apart from the final proxy-logging middleware, +and in the order shown in this example:: + + keymaster encryption proxy-logging proxy-server + + [filter:keymaster] + use = egg:swift#keymaster + encryption_root_secret = your_secret + + [filter:encryption] + use = egg:swift#encryption + # disable_encryption = False + +See the `proxy-server.conf-sample` file for further details on the middleware +configuration options. + +Keymaster middleware +-------------------- + +The `keymaster` middleware must be configured with a root secret before it is +used. By default the `keymaster` middleware will use the root secret configured +using the ``encryption_root_secret`` option in the middleware filter section of +the `proxy-server.conf` file, for example:: + + [filter:keymaster] + use = egg:swift#keymaster + encryption_root_secret = your_secret + +Root secret values MUST be at least 44 valid base-64 characters and +should be consistent across all proxy servers. The minimum length of 44 has +been chosen because it is the length of a base-64 encoded 32 byte value. + +.. note:: + + The ``encryption_root_secret`` option holds the master secret key used for + encryption. The security of all encrypted data critically depends on this + key and it should therefore be set to a high-entropy value. For example, a + suitable ``encryption_root_secret`` may be obtained by base-64 encoding a + 32 byte (or longer) value generated by a cryptographically secure random + number generator. + + The ``encryption_root_secret`` value is necessary to recover any encrypted + data from the storage system, and therefore, it must be guarded against + accidental loss. Its value (and consequently, the proxy-server.conf file) + should not be stored on any disk that is in any account, container or + object ring. + + The ``encryption_root_secret`` value should not be changed once deployed. + Doing so would prevent Swift from properly decrypting data that was + encrypted using the former value, and would therefore result in the loss of + that data. + +One method for generating a suitable value for ``encryption_root_secret`` is to +use the ``openssl`` command line tool:: + + openssl rand -base64 32 + + +Separate keymaster configuration file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``encryption_root_secret`` option may alternatively be specified in a +separate config file at a path specified by the ``keymaster_config_path`` +option, for example:: + + [filter:keymaster] + use = egg:swift#keymaster + keymaster_config_path = /etc/swift/keymaster.conf + +This has the advantage of allowing multiple processes which need to be +encryption-aware (for example, proxy-server and container-sync) to share the +same config file, ensuring that consistent encryption keys are used by those +processes. It also allows the keymaster configuration file to have different +permissions than the `proxy-server.conf` file. 
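+
+Whichever file ends up holding it, the root secret itself must still meet the
+length requirement described above. As a rough Python equivalent of the
+``openssl`` command, a suitable value can be generated and sanity-checked like
+this::
+
+    import base64
+    import secrets
+
+    # 32 random bytes encode to 44 base-64 characters, the documented minimum
+    root_secret = base64.b64encode(secrets.token_bytes(32)).decode('ascii')
+    assert len(root_secret) >= 44
+    print('encryption_root_secret = %s' % root_secret)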
+ +A separate keymaster config file should have a ``[keymaster]`` section +containing the ``encryption_root_secret`` option:: + + [keymaster] + encryption_root_secret = your_secret + + +.. note:: + + Alternative keymaster middleware is available to retrieve encryption root + secrets from an :ref:`external key management system + ` such as `Barbican + `_ rather than storing root secrets in + configuration files. + +Once deployed, the encryption filter will by default encrypt object data and +metadata when handling PUT and POST requests and decrypt object data and +metadata when handling GET and HEAD requests. COPY requests are transformed +into GET and PUT requests by the :ref:`copy` middleware before reaching the +encryption middleware and as a result object data and metadata is decrypted and +re-encrypted when copied. + +.. _changing_the_root_secret: + +Changing the encryption root secret +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +From time to time it may be desirable to change the root secret that is used to +derive encryption keys for new data written to the cluster. The `keymaster` +middleware allows alternative root secrets to be specified in its configuration +using options of the form:: + + encryption_root_secret_ = + +where ``secret_id`` is a unique identifier for the root secret and ``secret +value`` is a value that meets the requirements for a root secret described +above. + +Only one root secret is used to encrypt new data at any moment in time. This +root secret is specified using the ``active_root_secret_id`` option. If +specified, the value of this option should be one of the configured root secret +``secret_id`` values; otherwise the value of ``encryption_root_secret`` will be +taken as the default active root secret. + +.. note:: + + The active root secret is only used to derive keys for new data written to + the cluster. Changing the active root secret does not cause any existing + data to be re-encrypted. + +Existing encrypted data will be decrypted using the root secret that was active +when that data was written. All previous active root secrets must therefore +remain in the middleware configuration in order for decryption of existing data +to succeed. Existing encrypted data will reference previous root secret by +the ``secret_id`` so it must be kept consistent in the configuration. + +.. note:: + + Do not remove or change any previously active ```` or ````. + +For example, the following keymaster configuration file specifies three root +secrets, with the value of ``encryption_root_secret_2`` being the current +active root secret:: + + [keymaster] + active_root_secret_id = 2 + encryption_root_secret = your_secret + encryption_root_secret_1 = your_secret_1 + encryption_root_secret_2 = your_secret_2 + +.. note:: + + To ensure there is no loss of data availability, deploying a new key to + your cluster requires a two-stage config change. First, add the new key + to the ``encryption_root_secret_`` option and restart the + proxy-server. Do this for all proxies. Next, set the + ``active_root_secret_id`` option to the new secret id and restart the + proxy. Again, do this for all proxies. This process ensures that all + proxies will have the new key available for *decryption* before any proxy + uses it for *encryption*. + +Encryption middleware +--------------------- + +Once deployed, the encryption filter will by default encrypt object data and +metadata when handling PUT and POST requests and decrypt object data and +metadata when handling GET and HEAD requests. 
COPY requests are transformed +into GET and PUT requests by the :ref:`copy` middleware before reaching the +encryption middleware and as a result object data and metadata is decrypted and +re-encrypted when copied. + + +.. _encryption_root_secret_in_external_kms: + +Encryption Root Secret in External Key Management System +-------------------------------------------------------- + +The benefits of using a dedicated system for storing the encryption root secret +include the auditing and access control infrastructure that are already in +place in such a system, and the fact that an encryption root secret stored in a +key management system (KMS) may be backed by a hardware security module (HSM) +for additional security. Another significant benefit of storing the root +encryption secret in an external KMS is that it is in this case never stored on +a disk in the Swift cluster. + +Swift supports fetching encryption root secrets from a `Barbican +`_ service or a KMIP_ service using the +``kms_keymaster`` or ``kmip_keymaster`` middleware respectively. + +.. _KMIP: https://www.oasis-open.org/committees/kmip/ + +Encryption Root Secret in a Barbican KMS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Make sure the required dependencies are installed for retrieving an encryption +root secret from an external KMS. This can be done when installing Swift (add +the ``-e`` flag to install as a development version) by changing to the Swift +directory and running the following command to install Swift together with +the ``kms_keymaster`` extra dependencies:: + + sudo pip install .[kms_keymaster] + +Another way to install the dependencies is by making sure the +following lines exist in the requirements.txt file, and installing them using +``pip install -r requirements.txt``:: + + cryptography>=1.6 # BSD/Apache-2.0 + castellan>=0.6.0 + +.. note:: + + If any of the required packages is already installed, the ``--upgrade`` + flag may be required for the ``pip`` commands in order for the required + minimum version to be installed. + +To make use of an encryption root secret stored in an external KMS, +replace the keymaster middleware with the kms_keymaster middleware in the +proxy server WSGI pipeline in `proxy-server.conf`, in the order shown in this +example:: + + kms_keymaster encryption proxy-logging proxy-server + +and add a section to the same file:: + + [filter:kms_keymaster] + use = egg:swift#kms_keymaster + keymaster_config_path = file_with_kms_keymaster_config + +Create or edit the file `file_with_kms_keymaster_config` referenced above. +For further details on the middleware configuration options, see the +`keymaster.conf-sample` file. An example of the content of this file, with +optional parameters omitted, is below:: + + [kms_keymaster] + key_id = changeme + username = swift + password = password + project_name = swift + auth_endpoint = http://keystonehost:5000/v3 + +The encryption root secret shall be created and stored in the external key +management system before it can be used by the keymaster. It shall be stored +as a symmetric key, with content type ``application/octet-stream``, +``base64`` content encoding, ``AES`` algorithm, bit length ``256``, and secret +type ``symmetric``. The mode ``ctr`` may also be stored for informational +purposes - it is not currently checked by the keymaster. 
+ +The following command can be used to store the currently configured +``encryption_root_secret`` value from the `proxy-server.conf` file +in Barbican:: + + openstack secret store --name swift_root_secret \ + --payload-content-type="application/octet-stream" \ + --payload-content-encoding="base64" --algorithm aes --bit-length 256 \ + --mode ctr --secret-type symmetric --payload + +Alternatively, the existing root secret can also be stored in Barbican using +`curl `__. + +.. note:: + + The credentials used to store the secret in Barbican shall be the same + ones that the proxy server uses to retrieve the secret, i.e., the ones + configured in the `keymaster.conf` file. For clarity reasons the commands + shown here omit the credentials - they may be specified explicitly, or in + environment variables. + +Instead of using an existing root secret, Barbican can also be asked to +generate a new 256-bit root secret, with content type +``application/octet-stream`` and algorithm ``AES`` (the ``mode`` parameter is +currently optional):: + + openstack secret order create --name swift_root_secret \ + --payload-content-type="application/octet-stream" --algorithm aes \ + --bit-length 256 --mode ctr key + +The ``order create`` creates an asynchronous request to create the actual +secret. +The order can be retrieved using ``openstack secret order get``, and once the +order completes successfully, the output will show the key id of the generated +root secret. +Keys currently stored in Barbican can be listed using the +``openstack secret list`` command. + +.. note:: + + Both the order (the asynchronous request for creating or storing a secret), + and the actual secret itself, have similar unique identifiers. Once the + order has been completed, the key id is shown in the output of the ``order + get`` command. + +The keymaster uses the explicitly configured username and password (and +project name etc.) from the `keymaster.conf` file for retrieving the encryption +root secret from an external key management system. The `Castellan library +`_ is used to communicate with +Barbican. + +For the proxy server, reading the encryption root secret directly from the +`proxy-server.conf` file, from the `keymaster.conf` file pointed to +from the `proxy-server.conf` file, or from an external key management system +such as Barbican, are all functionally equivalent. In case reading the +encryption root secret from the external key management system fails, the +proxy server will not start up. If the encryption root secret is retrieved +successfully, it is cached in memory in the proxy server. + +For further details on the configuration options, see the +`[filter:kms_keymaster]` section in the `proxy-server.conf-sample` file, and +the `keymaster.conf-sample` file. + + +Encryption Root Secret in a KMIP service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This middleware enables Swift to fetch a root secret from a KMIP_ service. The +root secret is expected to have been previously created in the KMIP_ service +and is referenced by its unique identifier. The secret should be an AES-256 +symmetric key. + +To use this middleware Swift must be installed with the extra required +dependencies:: + + sudo pip install .[kmip_keymaster] + +Add the ``-e`` flag to install as a development version. 
+
+Edit the swift `proxy-server.conf` file to insert the middleware in the WSGI
+pipeline, replacing any other keymaster middleware::
+
+    [pipeline:main]
+    pipeline = catch_errors gatekeeper healthcheck proxy-logging \
+        kmip_keymaster encryption proxy-logging proxy-server
+
+and add a new filter section::
+
+    [filter:kmip_keymaster]
+    use = egg:swift#kmip_keymaster
+    key_id =
+    host =
+    port =
+    certfile = /path/to/client/cert.pem
+    keyfile = /path/to/client/key.pem
+    ca_certs = /path/to/server/cert.pem
+    username =
+    password =
+
+Apart from ``use`` and ``key_id`` the options are as defined for a PyKMIP
+client. The authoritative definition of these options can be found at
+``_.
+
+The value of the ``key_id`` option should be the unique identifier for a secret
+that will be retrieved from the KMIP_ service.
+
+The keymaster configuration can alternatively be defined in a separate config
+file by using the ``keymaster_config_path`` option::
+
+    [filter:kmip_keymaster]
+    use = egg:swift#kmip_keymaster
+    keymaster_config_path = /etc/swift/kmip_keymaster.conf
+
+In this case, the ``filter:kmip_keymaster`` section should contain no other
+options than ``use`` and ``keymaster_config_path``. All other options should be
+defined in the separate config file in a section named ``kmip_keymaster``. For
+example::
+
+    [kmip_keymaster]
+    key_id = 1234567890
+    host = 127.0.0.1
+    port = 5696
+    certfile = /etc/swift/kmip_client.crt
+    keyfile = /etc/swift/kmip_client.key
+    ca_certs = /etc/swift/kmip_server.crt
+    username = swift
+    password = swift_password
+
+Changing the encryption root secret of external KMSs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Because the KMS and KMIP keymasters derive from the default KeyMaster they
+also have the ability to define multiple keys. The only difference is the key
+option names. Instead of using the form `encryption_root_secret_`
+both external KMSs use `key_id_`, as it is an extension of their
+existing configuration. For example::
+
+    ...
+    key_id = 1234567890
+    key_id_foo = 0987654321
+    key_id_bar = 5432106789
+    active_root_secret_id = foo
+    ...
+
+Other than that, the process is the same as :ref:`changing_the_root_secret`.
+
+Upgrade Considerations
+----------------------
+
+When upgrading an existing cluster to deploy encryption, the following sequence
+of steps is recommended:
+
+#. Upgrade all object servers
+#. Upgrade all proxy servers
+#. Add keymaster and encryption middlewares to every proxy server's middleware
+   pipeline with the encryption ``disable_encryption`` option set to ``True``
+   and the keymaster ``encryption_root_secret`` value set as described above.
+#. If required, follow the steps for :ref:`container_sync_client_config`.
+#. Finally, change the encryption ``disable_encryption`` option to ``False``
+
+Objects that existed in the cluster prior to the keymaster and encryption
+middlewares being deployed are still readable with GET and HEAD requests. The
+content of those objects will not be encrypted unless they are written again by
+a PUT or COPY request. Any user metadata of those objects will not be encrypted
+unless it is written again by a PUT, POST or COPY request.
+
+Disabling Encryption
+--------------------
+
+Once deployed, the keymaster and encryption middlewares should not be removed
+from the pipeline. Doing so will cause encrypted object data and/or metadata to
+be returned in response to GET or HEAD requests for objects that were
+previously encrypted.
+ +Encryption of inbound object data may be disabled by setting the encryption +``disable_encryption`` option to ``True``, in which case existing encrypted +objects will remain encrypted but new data written with PUT, POST or COPY +requests will not be encrypted. The keymaster and encryption middlewares should +remain in the pipeline even when encryption of new objects is not required. The +encryption middleware is needed to handle GET requests for objects that may +have been previously encrypted. The keymaster is needed to provide keys for +those requests. + +.. _container_sync_client_config: + +Container sync configuration +---------------------------- + +If container sync is being used then the keymaster and encryption middlewares +must be added to the container sync internal client pipeline. The following +configuration steps are required: + +#. Create a custom internal client configuration file for container sync (if + one is not already in use) based on the sample file + `internal-client.conf-sample`. For example, copy + `internal-client.conf-sample` to `/etc/swift/container-sync-client.conf`. +#. Modify this file to include the middlewares in the pipeline in + the same way as described above for the proxy server. +#. Modify the container-sync section of all container server config files to + point to this internal client config file using the + ``internal_client_conf_path`` option. For example:: + + internal_client_conf_path = /etc/swift/container-sync-client.conf + +.. note:: + + The ``encryption_root_secret`` value is necessary to recover any encrypted + data from the storage system, and therefore, it must be guarded against + accidental loss. Its value (and consequently, the custom internal client + configuration file) should not be stored on any disk that is in any + account, container or object ring. + +.. note:: + + These container sync configuration steps will be necessary for container + sync probe tests to pass if the encryption middlewares are included in the + proxy pipeline of a test cluster. + +-------------- +Implementation +-------------- + +Encryption scheme +----------------- + +Plaintext data is encrypted to ciphertext using the AES cipher with 256-bit +keys implemented by the python `cryptography package +`_. The cipher is used in counter +(CTR) mode so that any byte or range of bytes in the ciphertext may be +decrypted independently of any other bytes in the ciphertext. This enables very +simple handling of ranged GETs. + +In general an item of unencrypted data, ``plaintext``, is transformed to an +item of encrypted data, ``ciphertext``:: + + ciphertext = E(plaintext, k, iv) + +where ``E`` is the encryption function, ``k`` is an encryption key and ``iv`` +is a unique initialization vector (IV) chosen for each encryption context. For +example, the object body is one encryption context with a randomly chosen IV. +The IV is stored as metadata of the encrypted item so that it is available for +decryption:: + + plaintext = D(ciphertext, k, iv) + +where ``D`` is the decryption function. + +The implementation of CTR mode follows `NIST SP800-38A +`_, and the +full IV passed to the encryption or decryption function serves as the initial +counter block. 
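+
+As an illustration (not a copy of Swift's internal code), the same CTR-mode
+construction can be reproduced with the ``cryptography`` package, using the
+16-byte IV directly as the initial counter block::
+
+    import os
+
+    from cryptography.hazmat.backends import default_backend
+    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+
+    key = os.urandom(32)   # a 256-bit AES key, standing in for an object key
+    iv = os.urandom(16)    # unique IV; also serves as the initial counter block
+
+    encryptor = Cipher(algorithms.AES(key), modes.CTR(iv),
+                       backend=default_backend()).encryptor()
+    ciphertext = encryptor.update(b'plaintext object data') + encryptor.finalize()
+
+    decryptor = Cipher(algorithms.AES(key), modes.CTR(iv),
+                       backend=default_backend()).decryptor()
+    assert decryptor.update(ciphertext) + decryptor.finalize() == \
+        b'plaintext object data'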
+ +In general any encrypted item has accompanying crypto-metadata that describes +the IV and the cipher algorithm used for the encryption:: + + crypto_metadata = {"iv": <16 byte value>, + "cipher": "AES_CTR_256"} + +This crypto-metadata is stored either with the ciphertext (for user +metadata and etags) or as a separate header (for object bodies). + +Key management +-------------- + +A keymaster middleware is responsible for providing the keys required for each +encryption and decryption operation. Two keys are required when handling object +requests: a `container key` that is uniquely associated with the container path +and an `object key` that is uniquely associated with the object path. These +keys are made available to the encryption middleware via a callback function +that the keymaster installs in the WSGI request environ. + +The current keymaster implementation derives container and object keys from the +``encryption_root_secret`` in a deterministic way by constructing a SHA256 +HMAC using the ``encryption_root_secret`` as a key and the container or object +path as a message, for example:: + + object_key = HMAC(encryption_root_secret, "/a/c/o") + +Other strategies for providing object and container keys may be employed by +future implementations of alternative keymaster middleware. + +During each object PUT, a random key is generated to encrypt the object body. +This random key is then encrypted using the object key provided by the +keymaster. This makes it safe to store the encrypted random key alongside the +encrypted object data and metadata. + +This process of `key wrapping` enables more efficient re-keying events when the +object key may need to be replaced and consequently any data encrypted using +that key must be re-encrypted. Key wrapping minimizes the amount of data +encrypted using those keys to just other randomly chosen keys which can be +re-wrapped efficiently without needing to re-encrypt the larger amounts of data +that were encrypted using the random keys. + +.. note:: + + Re-keying is not currently implemented. Key wrapping is implemented + in anticipation of future re-keying operations. + + +Encryption middleware +--------------------- + +The encryption middleware is composed of an `encrypter` component and a +`decrypter` component. + +Encrypter operation +^^^^^^^^^^^^^^^^^^^ + +Custom user metadata +++++++++++++++++++++ + +The encrypter encrypts each item of custom user metadata using the object key +provided by the keymaster and an IV that is randomly chosen for that metadata +item. The encrypted values are stored as :ref:`transient_sysmeta` with +associated crypto-metadata appended to the encrypted value. For example:: + + X-Object-Meta-Private1: value1 + X-Object-Meta-Private2: value2 + +are transformed to:: + + X-Object-Transient-Sysmeta-Crypto-Meta-Private1: + E(value1, object_key, header_iv_1); swift_meta={"iv": header_iv_1, + "cipher": "AES_CTR_256"} + X-Object-Transient-Sysmeta-Crypto-Meta-Private2: + E(value2, object_key, header_iv_2); swift_meta={"iv": header_iv_2, + "cipher": "AES_CTR_256"} + +The unencrypted custom user metadata headers are removed. 
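+
+A condensed sketch of the two steps described above -- deriving an object key
+from the root secret and the object path, then encrypting one user metadata
+value -- might look like the following; the helper names are illustrative and
+not part of Swift::
+
+    import base64
+    import hashlib
+    import hmac
+    import json
+    import os
+
+    from cryptography.hazmat.backends import default_backend
+    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+
+    root_secret = os.urandom(32)  # placeholder for the decoded root secret
+
+    def derive_key(path):
+        # object_key = HMAC(encryption_root_secret, "/a/c/o")
+        return hmac.new(root_secret, path.encode('utf8'), hashlib.sha256).digest()
+
+    def encrypt_value(value, key):
+        iv = os.urandom(16)
+        encryptor = Cipher(algorithms.AES(key), modes.CTR(iv),
+                           backend=default_backend()).encryptor()
+        ciphertext = encryptor.update(value.encode('utf8')) + encryptor.finalize()
+        crypto_meta = {'iv': base64.b64encode(iv).decode('ascii'),
+                       'cipher': 'AES_CTR_256'}
+        return '%s; swift_meta=%s' % (
+            base64.b64encode(ciphertext).decode('ascii'), json.dumps(crypto_meta))
+
+    object_key = derive_key('/AUTH_test/container1/object1')
+    print(encrypt_value('value1', object_key))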
+ +Object body ++++++++++++ + +Encryption of an object body is performed using a randomly chosen body key +and a randomly chosen IV:: + + body_ciphertext = E(body_plaintext, body_key, body_iv) + +The body_key is wrapped using the object key provided by the keymaster and a +randomly chosen IV:: + + wrapped_body_key = E(body_key, object_key, body_key_iv) + +The encrypter stores the associated crypto-metadata in a system metadata +header:: + + X-Object-Sysmeta-Crypto-Body-Meta: + {"iv": body_iv, + "cipher": "AES_CTR_256", + "body_key": {"key": wrapped_body_key, + "iv": body_key_iv}} + +Note that in this case there is an extra item of crypto-metadata which stores +the wrapped body key and its IV. + +Entity tag +++++++++++ + +While encrypting the object body the encrypter also calculates the ETag (md5 +digest) of the plaintext body. This value is encrypted using the object key +provided by the keymaster and a randomly chosen IV, and saved as an item of +system metadata, with associated crypto-metadata appended to the encrypted +value:: + + X-Object-Sysmeta-Crypto-Etag: + E(md5(plaintext), object_key, etag_iv); swift_meta={"iv": etag_iv, + "cipher": "AES_CTR_256"} + +The encrypter also forces an encrypted version of the plaintext ETag to be sent +with container updates by adding an update override header to the PUT request. +The associated crypto-metadata is appended to the encrypted ETag value of this +update override header:: + + X-Object-Sysmeta-Container-Update-Override-Etag: + E(md5(plaintext), container_key, override_etag_iv); + meta={"iv": override_etag_iv, "cipher": "AES_CTR_256"} + +The container key is used for this encryption so that the decrypter is able +to decrypt the ETags in container listings when handling a container request, +since object keys may not be available in that context. + +Since the plaintext ETag value is only known once the encrypter has completed +processing the entire object body, the ``X-Object-Sysmeta-Crypto-Etag`` and +``X-Object-Sysmeta-Container-Update-Override-Etag`` headers are sent after the +encrypted object body using the proxy server's support for request footers. + +.. _conditional_requests: + +Conditional Requests +++++++++++++++++++++ + +In general, an object server evaluates conditional requests with +``If[-None]-Match`` headers by comparing values listed in an +``If[-None]-Match`` header against the ETag that is stored in the object +metadata. This is not possible when the ETag stored in object metadata has been +encrypted. The encrypter therefore calculates an HMAC using the object key and +the ETag while handling object PUT requests, and stores this under the metadata +key ``X-Object-Sysmeta-Crypto-Etag-Mac``:: + + X-Object-Sysmeta-Crypto-Etag-Mac: HMAC(object_key, md5(plaintext)) + +Like other ETag-related metadata, this is sent after the encrypted object body +using the proxy server's support for request footers. + +The encrypter similarly calculates an HMAC for each ETag value included in +``If[-None]-Match`` headers of conditional GET or HEAD requests, and appends +these to the ``If[-None]-Match`` header. The encrypter also sets the +``X-Backend-Etag-Is-At`` header to point to the previously stored +``X-Object-Sysmeta-Crypto-Etag-Mac`` metadata so that the object server +evaluates the conditional request by comparing the HMAC values included in the +``If[-None]-Match`` with the value stored under +``X-Object-Sysmeta-Crypto-Etag-Mac``. 
For example, given a conditional request +with header:: + + If-Match: match_etag + +the encrypter would transform the request headers to include:: + + If-Match: match_etag,HMAC(object_key, match_etag) + X-Backend-Etag-Is-At: X-Object-Sysmeta-Crypto-Etag-Mac + +This enables the object server to perform an encrypted comparison to check +whether the ETags match, without leaking the ETag itself or leaking information +about the object body. + +Decrypter operation +^^^^^^^^^^^^^^^^^^^ + +For each GET or HEAD request to an object, the decrypter inspects the response +for encrypted items (revealed by crypto-metadata headers), and if any are +discovered then it will: + +#. Fetch the object and container keys from the keymaster via its callback +#. Decrypt the ``X-Object-Sysmeta-Crypto-Etag`` value +#. Decrypt the ``X-Object-Sysmeta-Container-Update-Override-Etag`` value +#. Decrypt metadata header values using the object key +#. Decrypt the wrapped body key found in ``X-Object-Sysmeta-Crypto-Body-Meta`` +#. Decrypt the body using the body key + +For each GET request to a container that would include ETags in its response +body, the decrypter will: + +#. GET the response body with the container listing +#. Fetch the container key from the keymaster via its callback +#. Decrypt any encrypted ETag entries in the container listing using the + container key + + +Impact on other Swift services and features +------------------------------------------- + +Encryption has no impact on :ref:`versioned_writes` other than that any +previously unencrypted objects will be encrypted as they are copied to or from +the versions container. Keymaster and encryption middlewares should be placed +after ``versioned_writes`` in the proxy server pipeline, as described in +:ref:`encryption_deployment`. + +`Container Sync` uses an internal client to GET objects that are to be sync'd. +This internal client must be configured to use the keymaster and encryption +middlewares as described :ref:`above `. + +Encryption has no impact on the `object-auditor` service. Since the ETag +header saved with the object at rest is the md5 sum of the encrypted object +body then the auditor will verify that encrypted data is valid. + +Encryption has no impact on the `object-expirer` service. ``X-Delete-At`` and +``X-Delete-After`` headers are not encrypted. + +Encryption has no impact on the `object-replicator` and `object-reconstructor` +services. These services are unaware of the object or EC fragment data being +encrypted. + +Encryption has no impact on the `container-reconciler` service. The +`container-reconciler` uses an internal client to move objects between +different policy rings. The reconciler's pipeline *MUST NOT* have encryption +enabled. The destination object has the same URL as the source object and the +object is moved without re-encryption. + + +Considerations for developers +----------------------------- + +Developers should be aware that keymaster and encryption middlewares rely on +the path of an object remaining unchanged. The included keymaster derives keys +for containers and objects based on their paths and the +``encryption_root_secret``. The keymaster does not rely on object metadata to +inform its generation of keys for GET and HEAD requests because when handling +:ref:`conditional_requests` it is required to provide the object key before any +metadata has been read from the object. 
+ +Developers should therefore give careful consideration to any new features that +would relocate object data and metadata within a Swift cluster by means that do +not cause the object data and metadata to pass through the encryption +middlewares in the proxy pipeline and be re-encrypted. + +The crypto-metadata associated with each encrypted item does include some +`key_id` metadata that is provided by the keymaster and contains the path used +to derive keys. This `key_id` metadata is persisted in anticipation of future +scenarios when it may be necessary to decrypt an object that has been relocated +without re-encrypting, in which case the metadata could be used to derive the +keys that were used for encryption. However, this alone is not sufficient to +handle conditional requests and to decrypt container listings where objects +have been relocated, and further work will be required to solve those issues. diff --git a/doc/source/overview_erasure_code.rst b/doc/source/overview_erasure_code.rst new file mode 100644 index 0000000000..26cf7aa828 --- /dev/null +++ b/doc/source/overview_erasure_code.rst @@ -0,0 +1,921 @@ +==================== +Erasure Code Support +==================== + +******************************* +History and Theory of Operation +******************************* + +There's a lot of good material out there on Erasure Code (EC) theory, this short +introduction is just meant to provide some basic context to help the reader +better understand the implementation in Swift. + +Erasure Coding for storage applications grew out of Coding Theory as far back as +the 1960s with the Reed-Solomon codes. These codes have been used for years in +applications ranging from CDs to DVDs to general communications and, yes, even +in the space program starting with Voyager! The basic idea is that some amount +of data is broken up into smaller pieces called fragments and coded in such a +way that it can be transmitted with the ability to tolerate the loss of some +number of the coded fragments. That's where the word "erasure" comes in, if you +transmit 14 fragments and only 13 are received then one of them is said to be +"erased". The word "erasure" provides an important distinction with EC; it +isn't about detecting errors, it's about dealing with failures. Another +important element of EC is that the number of erasures that can be tolerated can +be adjusted to meet the needs of the application. + +At a high level EC works by using a specific scheme to break up a single data +buffer into several smaller data buffers then, depending on the scheme, +performing some encoding operation on that data in order to generate additional +information. So you end up with more data than you started with and that extra +data is often called "parity". Note that there are many, many different +encoding techniques that vary both in how they organize and manipulate the data +as well by what means they use to calculate parity. For example, one scheme +might rely on `Galois Field Arithmetic `_ while others may work with only XOR. The number of variations and +details about their differences are well beyond the scope of this introduction, +but we will talk more about a few of them when we get into the implementation of +EC in Swift. + +Overview of EC Support in Swift +================================ + +First and foremost, from an application perspective EC support is totally +transparent. 
There are no EC related external API; a container is simply created +using a Storage Policy defined to use EC and then interaction with the cluster +is the same as any other durability policy. + +EC is implemented in Swift as a Storage Policy, see :doc:`overview_policies` for +complete details on Storage Policies. Because support is implemented as a +Storage Policy, all of the storage devices associated with your cluster's EC +capability can be isolated. It is entirely possible to share devices between +storage policies, but for EC it may make more sense to not only use separate +devices but possibly even entire nodes dedicated for EC. + +Which direction one chooses depends on why the EC policy is being deployed. If, +for example, there is a production replication policy in place already and the +goal is to add a cold storage tier such that the existing nodes performing +replication are impacted as little as possible, adding a new set of nodes +dedicated to EC might make the most sense but also incurs the most cost. On the +other hand, if EC is being added as a capability to provide additional +durability for a specific set of applications and the existing infrastructure is +well suited for EC (sufficient number of nodes, zones for the EC scheme that is +chosen) then leveraging the existing infrastructure such that the EC ring shares +nodes with the replication ring makes the most sense. These are some of the +main considerations: + +* Layout of existing infrastructure. +* Cost of adding dedicated EC nodes (or just dedicated EC devices). +* Intended usage model(s). + +The Swift code base does not include any of the algorithms necessary to perform +the actual encoding and decoding of data; that is left to external libraries. +The Storage Policies architecture is leveraged to enable EC on a per container +basis -- the object rings are still used to determine the placement of EC data +fragments. Although there are several code paths that are unique to an operation +associated with an EC policy, an external dependency to an Erasure Code library +is what Swift counts on to perform the low level EC functions. The use of an +external library allows for maximum flexibility as there are a significant +number of options out there, each with its owns pros and cons that can vary +greatly from one use case to another. + +PyECLib: External Erasure Code Library +======================================= + +PyECLib is a Python Erasure Coding Library originally designed and written as +part of the effort to add EC support to the Swift project, however it is an +independent project. The library provides a well-defined and simple Python +interface and internally implements a plug-in architecture allowing it to take +advantage of many well-known C libraries such as: + +* Jerasure and GFComplete at http://jerasure.org. +* Intel(R) ISA-L at http://01.org/intel%C2%AE-storage-acceleration-library-open-source-version. +* Or write your own! + +PyECLib uses a C based library called liberasurecode to implement the plug in +infrastructure; liberasurecode is available at: + +* liberasurecode: https://github.com/openstack/liberasurecode + +PyECLib itself therefore allows for not only choice but further extensibility as +well. PyECLib also comes with a handy utility to help determine the best +algorithm to use based on the equipment that will be used (processors and server +configurations may vary in performance per algorithm). More on this will be +covered in the configuration section. 
PyECLib is included as a Swift +requirement. + +For complete details see `PyECLib `_ + +Storing and Retrieving Objects +============================== + +We will discuss the details of how PUT and GET work in the "Under the Hood" +section later on. The key point here is that all of the erasure code work goes +on behind the scenes; this summary is a high level information overview only. + +The PUT flow looks like this: + +#. The proxy server streams in an object and buffers up "a segment" of data + (size is configurable). +#. The proxy server calls on PyECLib to encode the data into smaller fragments. +#. The proxy streams the encoded fragments out to the storage nodes based on + ring locations. +#. Repeat until the client is done sending data. +#. The client is notified of completion when a quorum is met. + +The GET flow looks like this: + +#. The proxy server makes simultaneous requests to participating nodes. +#. As soon as the proxy has the fragments it needs, it calls on PyECLib to + decode the data. +#. The proxy streams the decoded data it has back to the client. +#. Repeat until the proxy is done sending data back to the client. + +It may sound like, from this high level overview, that using EC is going to +cause an explosion in the number of actual files stored in each node's local +file system. Although it is true that more files will be stored (because an +object is broken into pieces), the implementation works to minimize this where +possible, more details are available in the Under the Hood section. + +Handoff Nodes +============= + +In EC policies, similarly to replication, handoff nodes are a set of storage +nodes used to augment the list of primary nodes responsible for storing an +erasure coded object. These handoff nodes are used in the event that one or more +of the primaries are unavailable. Handoff nodes are still selected with an +attempt to achieve maximum separation of the data being placed. + +Reconstruction +============== + +For an EC policy, reconstruction is analogous to the process of replication for +a replication type policy -- essentially "the reconstructor" replaces "the +replicator" for EC policy types. The basic framework of reconstruction is very +similar to that of replication with a few notable exceptions: + +* Because EC does not actually replicate partitions, it needs to operate at a + finer granularity than what is provided with rsync, therefore EC leverages + much of ssync behind the scenes (you do not need to manually configure ssync). +* Once a pair of nodes has determined the need to replace a missing object + fragment, instead of pushing over a copy like replication would do, the + reconstructor has to read in enough surviving fragments from other nodes and + perform a local reconstruction before it has the correct data to push to the + other node. +* A reconstructor does not talk to all other reconstructors in the set of nodes + responsible for an EC partition, this would be far too chatty, instead each + reconstructor is responsible for sync'ing with the partition's closest two + neighbors (closest meaning left and right on the ring). + +.. note:: + + EC work (encode and decode) takes place both on the proxy nodes, for PUT/GET + operations, as well as on the storage nodes for reconstruction. As with + replication, reconstruction can be the result of rebalancing, bit-rot, drive + failure or reverting data from a hand-off node back to its primary. 
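+
+The encode and decode operations themselves are provided by PyECLib. As a
+standalone illustration (assuming PyECLib and the ``liberasurecode_rs_vand``
+backend are installed, and not showing the code path Swift itself uses), a
+10+4 scheme behaves like this::
+
+    import os
+
+    from pyeclib.ec_iface import ECDriver
+
+    # 10 data fragments + 4 parity fragments
+    ec_driver = ECDriver(k=10, m=4, ec_type='liberasurecode_rs_vand')
+
+    segment = os.urandom(1048576)          # one buffered segment
+    fragments = ec_driver.encode(segment)  # 14 fragments in total
+
+    # any 10 of the 14 fragments are sufficient to decode the segment
+    assert ec_driver.decode(fragments[:10]) == segment
+
+    # rebuild a "lost" fragment (index 3) from the surviving fragments
+    surviving = fragments[:3] + fragments[4:]
+    rebuilt = ec_driver.reconstruct(surviving, [3])
+    # the rebuilt fragment can be used for decoding just like the original
+    assert ec_driver.decode(surviving[:9] + rebuilt) == segment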
+ +************************** +Performance Considerations +************************** + +In general, EC has different performance characteristics than replicated data. +EC requires substantially more CPU to read and write data, and is more suited +for larger objects that are not frequently accessed (e.g. backups). + +Operators are encouraged to characterize the performance of various EC schemes +and share their observations with the developer community. + + +.. _using_ec_policy: + +**************************** +Using an Erasure Code Policy +**************************** + +To use an EC policy, the administrator simply needs to define an EC policy in +`swift.conf` and create/configure the associated object ring. An example of how +an EC policy can be setup is shown below:: + + [storage-policy:2] + name = ec104 + policy_type = erasure_coding + ec_type = liberasurecode_rs_vand + ec_num_data_fragments = 10 + ec_num_parity_fragments = 4 + ec_object_segment_size = 1048576 + +Let's take a closer look at each configuration parameter: + +* ``name``: This is a standard storage policy parameter. + See :doc:`overview_policies` for details. +* ``policy_type``: Set this to ``erasure_coding`` to indicate that this is an EC + policy. +* ``ec_type``: Set this value according to the available options in the selected + PyECLib back-end. This specifies the EC scheme that is to be used. For + example the option shown here selects Vandermonde Reed-Solomon encoding while + an option of ``flat_xor_hd_3`` would select Flat-XOR based HD combination + codes. See the `PyECLib `_ page for + full details. +* ``ec_num_data_fragments``: The total number of fragments that will be + comprised of data. +* ``ec_num_parity_fragments``: The total number of fragments that will be + comprised of parity. +* ``ec_object_segment_size``: The amount of data that will be buffered up before + feeding a segment into the encoder/decoder. The default value is 1048576. + +When PyECLib encodes an object, it will break it into N fragments. However, what +is important during configuration, is how many of those are data and how many +are parity. So in the example above, PyECLib will actually break an object in +14 different fragments, 10 of them will be made up of actual object data and 4 +of them will be made of parity data (calculations depending on ec_type). + +When deciding which devices to use in the EC policy's object ring, be sure to +carefully consider the performance impacts. Running some performance +benchmarking in a test environment for your configuration is highly recommended +before deployment. + +To create the EC policy's object ring, the only difference in the usage of the +``swift-ring-builder create`` command is the ``replicas`` parameter. The +``replicas`` value is the number of fragments spread across the object servers +associated with the ring; ``replicas`` must be equal to the sum of +``ec_num_data_fragments`` and ``ec_num_parity_fragments``. For example:: + + swift-ring-builder object-1.builder create 10 14 1 + +Note that in this example the ``replicas`` value of ``14`` is based on the sum of +``10`` EC data fragments and ``4`` EC parity fragments. + +Once you have configured your EC policy in `swift.conf` and created your object +ring, your application is ready to start using EC simply by creating a container +with the specified policy name and interacting as usual. + +.. 
note:: + + It's important to note that once you have deployed a policy and have created + objects with that policy, these configurations options cannot be changed. In + case a change in the configuration is desired, you must create a new policy + and migrate the data to a new container. + +.. warning:: + + Using ``isa_l_rs_vand`` with more than 4 parity fragments creates fragments + which may in some circumstances fail to reconstruct properly or (with + liberasurecode < 1.3.1) reconstruct corrupted data. New policies that need + large numbers of parity fragments should consider using ``isa_l_rs_cauchy``. + Any existing affected policies must be marked deprecated, and data in + containers with that policy should be migrated to a new policy. + +Migrating Between Policies +========================== + +A common usage of EC is to migrate less commonly accessed data from a more +expensive but lower latency policy such as replication. When an application +determines that it wants to move data from a replication policy to an EC policy, +it simply needs to move the data from the replicated container to an EC +container that was created with the target durability policy. + + +********* +Global EC +********* + +The following recommendations are made when deploying an EC policy that spans +multiple regions in a :doc:`Global Cluster `: + +* The global EC policy should use :ref:`ec_duplication` in conjunction with a + :ref:`Composite Ring `, as described below. +* Proxy servers should be :ref:`configured to use read affinity + ` to prefer reading from their local region for + the global EC policy. :ref:`proxy_server_per_policy_config` allows this to be + configured for individual policies. + +.. note:: + + Before deploying a Global EC policy, consideration should be given to the + :ref:`global_ec_known_issues`, in particular the relatively poor + performance anticipated from the object-reconstructor. + +.. _ec_duplication: + +EC Duplication +============== + +EC Duplication enables Swift to make duplicated copies of fragments of erasure +coded objects. If an EC storage policy is configured with a non-default +``ec_duplication_factor`` of ``N > 1``, then the policy will create ``N`` +duplicates of each unique fragment that is returned from the configured EC +engine. + +Duplication of EC fragments is optimal for Global EC storage policies, which +require dispersion of fragment data across failure domains. Without fragment +duplication, common EC parameters will not distribute enough unique fragments +between large failure domains to allow for a rebuild using fragments from any +one domain. For example a uniformly distributed ``10+4`` EC policy schema +would place 7 fragments in each of two failure domains, which is less in each +failure domain than the 10 fragments needed to rebuild a missing fragment. + +Without fragment duplication, an EC policy schema must be adjusted to include +additional parity fragments in order to guarantee the number of fragments in +each failure domain is greater than the number required to rebuild. For +example, a uniformly distributed ``10+18`` EC policy schema would place 14 +fragments in each of two failure domains, which is more than sufficient in each +failure domain to rebuild a missing fragment. However, empirical testing has +shown encoding a schema with ``num_parity > num_data`` (such as ``10+18``) is +less efficient than using duplication of fragments. 
EC fragment duplication +enables Swift's Global EC to maintain more independence between failure domains +without sacrificing efficiency on read/write or rebuild! + +The ``ec_duplication_factor`` option may be configured in `swift.conf` in each +``storage-policy`` section. The option may be omitted - the default value is +``1`` (i.e. no duplication):: + + [storage-policy:2] + name = ec104 + policy_type = erasure_coding + ec_type = liberasurecode_rs_vand + ec_num_data_fragments = 10 + ec_num_parity_fragments = 4 + ec_object_segment_size = 1048576 + ec_duplication_factor = 2 + +.. warning:: + + EC duplication is intended for use with Global EC policies. To ensure + independent availability of data in all regions, the + ``ec_duplication_factor`` option should only be used in conjunction with + :ref:`composite_rings`, as described in this document. + +In this example, a ``10+4`` schema and a duplication factor of ``2`` will +result in ``(10+4)x2 = 28`` fragments being stored (we will use the shorthand +``10+4x2`` to denote that policy configuration) . The ring for this policy +should be configured with 28 replicas (i.e. ``(ec_num_data_fragments + +ec_num_parity_fragments) * ec_duplication_factor``). A ``10+4x2`` schema +**can** allow a multi-region deployment to rebuild an object to full durability +even when *more* than 14 fragments are unavailable. This is advantageous with +respect to a ``10+18`` configuration not only because reads from data fragments +will be more common and more efficient, but also because a ``10+4x2`` can grow +into a ``10+4x3`` to expand into another region. + +EC duplication with composite rings +----------------------------------- + +It is recommended that EC Duplication is used with :ref:`composite_rings` in +order to disperse duplicate fragments across regions. + +When EC duplication is used, it is highly desirable to have one duplicate of +each fragment placed in each region. This ensures that a set of +``ec_num_data_fragments`` unique fragments (the minimum needed to reconstruct +an object) can always be assembled from a single region. This in turn means +that objects are robust in the event of an entire region becoming unavailable. + +This can be achieved by using a :ref:`composite ring ` with +the following properties: + +* The number of component rings in the composite ring is equal to the + ``ec_duplication_factor`` for the policy. +* Each *component* ring has a number of ``replicas`` that is equal to the sum + of ``ec_num_data_fragments`` and ``ec_num_parity_fragments``. +* Each component ring is populated with devices in a unique region. + +This arrangement results in each component ring in the composite ring, and +therefore each region, having one copy of each fragment. + +For example, consider a Swift cluster with two regions, ``region1`` and +``region2`` and a ``4+2x2`` EC policy schema. This policy should use a +composite ring with two component rings, ``ring1`` and ``ring2``, having +devices exclusively in regions ``region1`` and ``region2`` respectively. Each +component ring should have ``replicas = 6``. As a result, the first 6 +fragments for an object will always be placed in ``ring1`` (i.e. in +``region1``) and the second 6 duplicate fragments will always be placed in +``ring2`` (i.e. in ``region2``). 
+ +Conversely, a conventional ring spanning the two regions may give a suboptimal +distribution of duplicates across the regions; it is possible for duplicates of +the same fragment to be placed in the same region, and consequently for another +region to have no copies of that fragment. This may make it impossible to +assemble a set of ``ec_num_data_fragments`` unique fragments from a single +region. For example, the conventional ring could have a pathologically +sub-optimal placement such as:: + + r1 + #0#d.data + #0#d.data + #2#d.data + #2#d.data + #4#d.data + #4#d.data + r2 + #1#d.data + #1#d.data + #3#d.data + #3#d.data + #5#d.data + #5#d.data + +In this case, the object cannot be reconstructed from a single region; +``region1`` has only the fragments with index ``0, 2, 4`` and ``region2`` has +the other 3 indexes, but we need 4 unique indexes to be able to rebuild an +object. + +Node Selection Strategy for Reads +--------------------------------- + +Proxy servers require a set of *unique* fragment indexes to decode the original +object when handling a GET request to an EC policy. With a conventional EC +policy, this is very likely to be the outcome of reading fragments from a +random selection of backend nodes. With an EC Duplication policy it is +significantly more likely that responses from a *random* selection of backend +nodes might include some duplicated fragments. + +For this reason it is strongly recommended that EC Duplication always be +deployed in combination with :ref:`composite_rings` and :ref:`proxy server read +affinity `. + +Under normal conditions with the recommended deployment, read affinity will +cause a proxy server to first attempt to read fragments from nodes in its local +region. These fragments are guaranteed to be unique with respect to each other. +Even if there are a small number of local failures, unique local parity +fragments will make up the difference. However, should enough local primary +storage nodes fail, such that sufficient unique fragments are not available in +the local region, a global EC cluster will proceed to read fragments from the +other region(s). Random reads from the remote region are not guaranteed to +return unique fragments; with EC Duplication there is a significantly high +probability that the proxy server will encounter a fragment that is a duplicate +of one it has already found in the local region. The proxy server will ignore +these and make additional requests until it accumulates the required set of +unique fragments, potentially searching all the primary and handoff locations +in the local and remote regions before ultimately failing the read. + +A global EC deployment configured as recommended is therefore extremely +resilient. However, under extreme failure conditions read handling can be +inefficient because nodes in other regions are guaranteed to have some +fragments which are duplicates of those the proxy server has already received. +Work is in progress to improve the proxy server node selection strategy such +that when it is necessary to read from other regions, nodes that are likely to +have useful fragments are preferred over those that are likely to return a +duplicate. + +.. _global_ec_known_issues: + +Known Issues +============ + +Efficient Cross Region Rebuild +------------------------------ + +Work is also in progress to improve the object-reconstructor efficiency for +Global EC policies. Unlike the proxy server, the reconstructor does not apply +any read affinity settings when gathering fragments. 
It is therefore likely to +receive duplicated fragments (i.e. make wasted backend GET requests) while +performing *every* fragment reconstruction. + +Additionally, other reconstructor optimisations for Global EC are under +investigation: + +* Since fragments are duplicated between regions it may in some cases be more + attractive to restore failed fragments from their duplicates in another + region instead of rebuilding them from other fragments in the local region. + +* Conversely, to avoid WAN transfer it may be more attractive to rebuild + fragments from local parity. + +* During rebalance it will always be more attractive to revert a fragment from + it's old-primary to it's new primary rather than rebuilding or transferring a + duplicate from the remote region. + + +************** +Under the Hood +************** + +Now that we've explained a little about EC support in Swift and how to +configure and use it, let's explore how EC fits in at the nuts-n-bolts level. + +Terminology +=========== + +The term 'fragment' has been used already to describe the output of the EC +process (a series of fragments) however we need to define some other key terms +here before going any deeper. Without paying special attention to using the +correct terms consistently, it is very easy to get confused in a hurry! + +* **chunk**: HTTP chunks received over wire (term not used to describe any EC + specific operation). +* **segment**: Not to be confused with SLO/DLO use of the word, in EC we call a + segment a series of consecutive HTTP chunks buffered up before performing an + EC operation. +* **fragment**: Data and parity 'fragments' are generated when erasure coding + transformation is applied to a segment. +* **EC archive**: A concatenation of EC fragments; to a storage node this looks + like an object. +* **ec_ndata**: Number of EC data fragments. +* **ec_nparity**: Number of EC parity fragments. + +Middleware +========== + +Middleware remains unchanged. For most middleware (e.g., SLO/DLO) the fact that +the proxy is fragmenting incoming objects is transparent. For list endpoints, +however, it is a bit different. A caller of list endpoints will get back the +locations of all of the fragments. The caller will be unable to re-assemble the +original object with this information, however the node locations may still +prove to be useful information for some applications. + +On Disk Storage +=============== + +EC archives are stored on disk in their respective objects-N directory based on +their policy index. See :doc:`overview_policies` for details on per policy +directory information. + +In addition to the object timestamp, the filenames of EC archives encode other +information related to the archive: + +* The fragment archive index. This is required for a few reasons. For one, it + allows us to store fragment archives of different indexes on the same storage + node which is not typical however it is possible in many circumstances. + Without unique filenames for the different EC archive files in a set, we + would be at risk of overwriting one archive of index `n` with another of + index `m` in some scenarios. + + The index is appended to the filename just before the ``.data`` extension. + For example, the filename for a fragment archive storing the 5th fragment + would be:: + + 1418673556.92690#5.data + +* The durable state of the archive. 
The meaning of this will be described in + more detail later, but a fragment archive that is considered durable has an + additional ``#d`` string included in its filename immediately before the + ``.data`` extension. For example:: + + 1418673556.92690#5#d.data + +A policy-specific transformation function is therefore used to build the +archive filename. These functions are implemented in the diskfile module as +methods of policy specific sub classes of ``BaseDiskFileManager``. + +The transformation function for the replication policy is simply a NOP. + +.. note:: + + In older versions the durable state of an archive was represented by an + additional file called the ``.durable`` file instead of the ``#d`` + substring in the ``.data`` filename. The ``.durable`` for the example above + would be:: + + 1418673556.92690.durable + + +Proxy Server +============ + +High Level +---------- + +The Proxy Server handles Erasure Coding in a different manner than replication, +therefore there are several code paths unique to EC policies either though sub +classing or simple conditionals. Taking a closer look at the PUT and the GET +paths will help make this clearer. But first, a high level overview of how an +object flows through the system: + +.. image:: images/ec_overview.png + +Note how: + +* Incoming objects are buffered into segments at the proxy. +* Segments are erasure coded into fragments at the proxy. +* The proxy stripes fragments across participating nodes such that the on-disk + stored files that we call a fragment archive is appended with each new + fragment. + +This scheme makes it possible to minimize the number of on-disk files given our +segmenting and fragmenting. + +Multi_Phase Conversation +------------------------ + +Multi-part MIME document support is used to allow the proxy to engage in a +handshake conversation with the storage node for processing PUT requests. This +is required for a few different reasons. + +#. From the perspective of the storage node, a fragment archive is really just + another object, we need a mechanism to send down the original object etag + after all fragment archives have landed. +#. Without introducing strong consistency semantics, the proxy needs a mechanism + to know when a quorum of fragment archives have actually made it to disk + before it can inform the client of a successful PUT. + +MIME supports a conversation between the proxy and the storage nodes for every +PUT. This provides us with the ability to handle a PUT in one connection and +assure that we have the essence of a 2 phase commit, basically having the proxy +communicate back to the storage nodes once it has confirmation that a quorum of +fragment archives in the set have been written. + +For the first phase of the conversation the proxy requires a quorum of +`ec_ndata + 1` fragment archives to be successfully put to storage nodes. This +ensures that the object could still be reconstructed even if one of the +fragment archives becomes unavailable. As described above, each fragment +archive file is named:: + + #.data + +where ``ts`` is the timestamp and ``frag_index`` is the fragment archive index. + +During the second phase of the conversation the proxy communicates a +confirmation to storage nodes that the fragment archive quorum has been +achieved. 
This causes each storage node to rename the fragment archive written +in the first phase of the conversation to include the substring ``#d`` in its +name:: + + ##d.data + +This indicates to the object server that this fragment archive is `durable` and +that there is a set of data files that are durable at timestamp ``ts``. + +For the second phase of the conversation the proxy requires a quorum of +`ec_ndata + 1` successful commits on storage nodes. This ensures that there are +sufficient committed fragment archives for the object to be reconstructed even +if one becomes unavailable. The reconstructor ensures that the durable state is +replicated on storage nodes where it may be missing. + +Note that the completion of the commit phase of the conversation +is also a signal for the object server to go ahead and immediately delete older +timestamp files for this object. This is critical as we do not want to delete +the older object until the storage node has confirmation from the proxy, via the +multi-phase conversation, that the other nodes have landed enough for a quorum. + +The basic flow looks like this: + +#. The Proxy Server erasure codes and streams the object fragments + (ec_ndata + ec_nparity) to the storage nodes. +#. The storage nodes store objects as EC archives and upon finishing object + data/metadata write, send a 1st-phase response to proxy. +#. Upon quorum of storage nodes responses, the proxy initiates 2nd-phase by + sending commit confirmations to object servers. +#. Upon receipt of commit message, object servers rename ``.data`` files to + include the ``#d`` substring, indicating successful PUT, and send a final + response to the proxy server. +#. The proxy waits for `ec_ndata + 1` object servers to respond with a + success (2xx) status before responding to the client with a successful + status. + +Here is a high level example of what the conversation looks like:: + + proxy: PUT /p/a/c/o + Transfer-Encoding': 'chunked' + Expect': '100-continue' + X-Backend-Obj-Multiphase-Commit: yes + obj: 100 Continue + X-Obj-Multiphase-Commit: yes + proxy: --MIMEboundary + X-Document: object body + + --MIMEboundary + X-Document: object metadata + Content-MD5: + + --MIMEboundary + #.data file> + obj: 100 Continue + + proxy: X-Document: put commit + commit_confirmation + --MIMEboundary-- + #.data to ##d.data> + obj: 20x + =2 2xx responses> + proxy: 2xx -> client + +A few key points on the durable state of a fragment archive: + +* A durable fragment archive means that there exist sufficient other fragment + archives elsewhere in the cluster (durable and/or non-durable) to reconstruct + the object. +* When a proxy does a GET, it will require at least one object server to + respond with a fragment archive is durable before reconstructing and + returning the object to the client. + +Partial PUT Failures +-------------------- + +A partial PUT failure has a few different modes. In one scenario the Proxy +Server is alive through the entire PUT conversation. This is a very +straightforward case. The client will receive a good response if and only if a +quorum of fragment archives were successfully landed on their storage nodes. +In this case the Reconstructor will discover the missing fragment archives, +perform a reconstruction and deliver those fragment archives to their nodes. + +The more interesting case is what happens if the proxy dies in the middle of a +conversation. 
If it turns out that a quorum had been met and the commit phase +of the conversation finished, its as simple as the previous case in that the +reconstructor will repair things. However, if the commit didn't get a chance to +happen then some number of the storage nodes have .data files on them (fragment +archives) but none of them knows whether there are enough elsewhere for the +entire object to be reconstructed. In this case the client will not have +received a 2xx response so there is no issue there, however, it is left to the +storage nodes to clean up the stale fragment archives. Work is ongoing in this +area to enable the proxy to play a role in reviving these fragment archives, +however, for the current release, a proxy failure after the start of a +conversation but before the commit message will simply result in a PUT failure. + +GET +--- + +The GET for EC is different enough from replication that subclassing the +`BaseObjectController` to the `ECObjectController` enables an efficient way to +implement the high level steps described earlier: + +#. The proxy server makes simultaneous requests to `ec_ndata` primary object + server nodes with goal of finding a set of `ec_ndata` distinct EC archives + at the same timestamp, and an indication from at least one object server + that a durable fragment archive exists for that timestamp. If this goal is + not achieved with the first `ec_ndata` requests then the proxy server + continues to issue requests to the remaining primary nodes and then handoff + nodes. +#. As soon as the proxy server has found a usable set of `ec_ndata` EC + archives, it starts to call PyECLib to decode fragments as they are returned + by the object server nodes. +#. The proxy server creates Etag and content length headers for the client + response since each EC archive's metadata is valid only for that archive. +#. The proxy streams the decoded data it has back to the client. + +Note that the proxy does not require all objects servers to have a durable +fragment archive to return in response to a GET. The proxy will be satisfied if +just one object server has a durable fragment archive at the same timestamp as +EC archives returned from other object servers. This means that the proxy can +successfully GET an object that had missing durable state on some nodes when it +was PUT (i.e. a partial PUT failure occurred). + +Note also that an object server may inform the proxy server that it has more +than one EC archive for different timestamps and/or fragment indexes, which may +cause the proxy server to issue multiple requests for distinct EC archives to +that object server. (This situation can temporarily occur after a ring +rebalance when a handoff node storing an archive has become a primary node and +received its primary archive but not yet moved the handoff archive to its +primary node.) + +The proxy may receive EC archives having different timestamps, and may +receive several EC archives having the same index. The proxy therefore +ensures that it has sufficient EC archives with the same timestamp +and distinct fragment indexes before considering a GET to be successful. + +Object Server +============= + +The Object Server, like the Proxy Server, supports MIME conversations as +described in the proxy section earlier. This includes processing of the commit +message and decoding various sections of the MIME document to extract the footer +which includes things like the entire object etag. 
+ +DiskFile +-------- + +Erasure code policies use subclassed ``ECDiskFile``, ``ECDiskFileWriter``, +``ECDiskFileReader`` and ``ECDiskFileManager`` to implement EC specific +handling of on disk files. This includes things like file name manipulation to +include the fragment index and durable state in the filename, construction of +EC specific ``hashes.pkl`` file to include fragment index information, etc. + +Metadata +^^^^^^^^ + +There are few different categories of metadata that are associated with EC: + +System Metadata: EC has a set of object level system metadata that it +attaches to each of the EC archives. The metadata is for internal use only: + +* ``X-Object-Sysmeta-EC-Etag``: The Etag of the original object. +* ``X-Object-Sysmeta-EC-Content-Length``: The content length of the original + object. +* ``X-Object-Sysmeta-EC-Frag-Index``: The fragment index for the object. +* ``X-Object-Sysmeta-EC-Scheme``: Description of the EC policy used to encode + the object. +* ``X-Object-Sysmeta-EC-Segment-Size``: The segment size used for the object. + +User Metadata: User metadata is unaffected by EC, however, a full copy of the +user metadata is stored with every EC archive. This is required as the +reconstructor needs this information and each reconstructor only communicates +with its closest neighbors on the ring. + +PyECLib Metadata: PyECLib stores a small amount of metadata on a per fragment +basis. This metadata is not documented here as it is opaque to Swift. + +Database Updates +================ + +As account and container rings are not associated with a Storage Policy, there +is no change to how these database updates occur when using an EC policy. + +The Reconstructor +================= + +The Reconstructor performs analogous functions to the replicator: + +#. Recovering from disk drive failure. +#. Moving data around because of a rebalance. +#. Reverting data back to a primary from a handoff. +#. Recovering fragment archives from bit rot discovered by the auditor. + +However, under the hood it operates quite differently. The following are some +of the key elements in understanding how the reconstructor operates. + +Unlike the replicator, the work that the reconstructor does is not always as +easy to break down into the 2 basic tasks of synchronize or revert (move data +from handoff back to primary) because of the fact that one storage node can +house fragment archives of various indexes and each index really \"belongs\" to +a different node. So, whereas when the replicator is reverting data from a +handoff it has just one node to send its data to, the reconstructor can have +several. Additionally, it is not always the case that the processing of a +particular suffix directory means one or the other job type for the entire +directory (as it does for replication). The scenarios that create these mixed +situations can be pretty complex so we will just focus on what the +reconstructor does here and not a detailed explanation of why. + +Job Construction and Processing +------------------------------- + +Because of the nature of the work it has to do as described above, the +reconstructor builds jobs for a single job processor. The job itself contains +all of the information needed for the processor to execute the job which may be +a synchronization or a data reversion. There may be a mix of jobs that +perform both of these operations on the same suffix directory. + +Jobs are constructed on a per-partition basis and then per-fragment-index basis. 
+That is, there will be one job for every fragment index in a partition. +Performing this construction \"up front\" like this helps minimize the +interaction between nodes collecting hashes.pkl information. + +Once a set of jobs for a partition has been constructed, those jobs are sent off +to threads for execution. The single job processor then performs the necessary +actions, working closely with ssync to carry out its instructions. For data +reversion, the actual objects themselves are cleaned up via the ssync module and +once that partition's set of jobs is complete, the reconstructor will attempt to +remove the relevant directory structures. + +Job construction must account for a variety of scenarios, including: + +#. A partition directory with all fragment indexes matching the local node + index. This is the case where everything is where it belongs and we just + need to compare hashes and sync if needed. Here we simply sync with our + partners. +#. A partition directory with at least one local fragment index and mix of + others. Here we need to sync with our partners where fragment indexes + matches the local_id, all others are sync'd with their home nodes and then + deleted. +#. A partition directory with no local fragment index and just one or more of + others. Here we sync with just the home nodes for the fragment indexes that + we have and then all the local archives are deleted. This is the basic + handoff reversion case. + +.. note:: + A \"home node\" is the node where the fragment index encoded in the + fragment archive's filename matches the node index of a node in the primary + partition list. + +Node Communication +------------------ + +The replicators talk to all nodes who have a copy of their object, typically +just 2 other nodes. For EC, having each reconstructor node talk to all nodes +would incur a large amount of overhead as there will typically be a much larger +number of nodes participating in the EC scheme. Therefore, the reconstructor is +built to talk to its adjacent nodes on the ring only. These nodes are typically +referred to as partners. + +Reconstruction +-------------- + +Reconstruction can be thought of sort of like replication but with an extra step +in the middle. The reconstructor is hard-wired to use ssync to determine what is +missing and desired by the other side. However, before an object is sent over +the wire it needs to be reconstructed from the remaining fragments as the local +fragment is just that - a different fragment index than what the other end is +asking for. + +Thus, there are hooks in ssync for EC based policies. One case would be for +basic reconstruction which, at a high level, looks like this: + +* Determine which nodes need to be contacted to collect other EC archives needed + to perform reconstruction. +* Update the etag and fragment index metadata elements of the newly constructed + fragment archive. +* Establish a connection to the target nodes and give ssync a DiskFileLike class + from which it can stream data. + +The reader in this class gathers fragments from the nodes and uses PyECLib to +reconstruct each segment before yielding data back to ssync. Essentially what +this means is that data is buffered, in memory, on a per segment basis at the +node performing reconstruction and each segment is dynamically reconstructed and +delivered to ``ssync_sender`` where the ``send_put()`` method will ship them on +over. The sender is then responsible for deleting the objects as they are sent +in the case of data reversion. 
+ +The Auditor +=========== + +Because the auditor already operates on a per storage policy basis, there are no +specific auditor changes associated with EC. Each EC archive looks like, and is +treated like, a regular object from the perspective of the auditor. Therefore, +if the auditor finds bit-rot in an EC archive, it simply quarantines it and the +reconstructor will take care of the rest just as the replicator does for +replication policies. diff --git a/doc/source/overview_expiring_objects.rst b/doc/source/overview_expiring_objects.rst index ba91570963..9fe0fefe5e 100644 --- a/doc/source/overview_expiring_objects.rst +++ b/doc/source/overview_expiring_objects.rst @@ -2,42 +2,187 @@ Expiring Object Support ======================= -The ``swift-object-expirer`` offers scheduled deletion of objects. The Swift client would use the ``X-Delete-At`` or ``X-Delete-After`` headers during an object ``PUT`` or ``POST`` and the cluster would automatically quit serving that object at the specified time and would shortly thereafter remove the object from the system. +The ``swift-object-expirer`` offers scheduled deletion of objects. The Swift +client would use the ``X-Delete-At`` or ``X-Delete-After`` headers during an +object ``PUT`` or ``POST`` and the cluster would automatically quit serving +that object at the specified time and would shortly thereafter remove the +object from the system. -The ``X-Delete-At`` header takes a Unix Epoch timestamp, in integer form; for example: ``1317070737`` represents ``Mon Sep 26 20:58:57 2011 UTC``. +The ``X-Delete-At`` header takes a Unix Epoch timestamp, in integer form; for +example: ``1317070737`` represents ``Mon Sep 26 20:58:57 2011 UTC``. -The ``X-Delete-After`` header takes a integer number of seconds. The proxy server that receives the request will convert this header into an ``X-Delete-At`` header using its current time plus the value given. +The ``X-Delete-After`` header takes a positive integer number of seconds. The +proxy server that receives the request will convert this header into an +``X-Delete-At`` header using the request timestamp plus the value given. -As expiring objects are added to the system, the object servers will record the expirations in a hidden ``.expiring_objects`` account for the ``swift-object-expirer`` to handle later. +If both the ``X-Delete-At`` and ``X-Delete-After`` headers are sent with a +request then the ``X-Delete-After`` header will take precedence. -Just one instance of the ``swift-object-expirer`` daemon needs to run for a cluster. This isn't exactly automatic failover high availability, but if this daemon doesn't run for a few hours it should not be any real issue. The expired-but-not-yet-deleted objects will still ``404 Not Found`` if someone tries to ``GET`` or ``HEAD`` them and they'll just be deleted a bit later when the daemon is restarted. +As expiring objects are added to the system, the object servers will record the +expirations in a hidden ``.expiring_objects`` account for the +``swift-object-expirer`` to handle later. -The daemon uses the ``/etc/swift/object-expirer.conf`` by default, and here is a quick sample conf file:: +Usually, just one instance of the ``swift-object-expirer`` daemon needs to run +for a cluster. This isn't exactly automatic failover high availability, but if +this daemon doesn't run for a few hours it should not be any real issue. 
The +expired-but-not-yet-deleted objects will still ``404 Not Found`` if someone +tries to ``GET`` or ``HEAD`` them and they'll just be deleted a bit later when +the daemon is restarted. + +By default, the ``swift-object-expirer`` daemon will run with a concurrency of +1. Increase this value to get more concurrency. A concurrency of 1 may not be +enough to delete expiring objects in a timely fashion for a particular Swift +cluster. + +It is possible to run multiple daemons to do different parts of the work if a +single process with a concurrency of more than 1 is not enough (see the sample +config file for details). + +To run the ``swift-object-expirer`` as multiple processes, set ``processes`` to +the number of processes (either in the config file or on the command line). +Then run one process for each part. Use ``process`` to specify the part of the +work to be done by a process using the command line or the config. So, for +example, if you'd like to run three processes, set ``processes`` to 3 and run +three processes with ``process`` set to 0, 1, and 2 for the three processes. +If multiple processes are used, it's necessary to run one for each part of the +work or that part of the work will not be done. + +By default the daemon looks for two different config files. When launching, +the process searches for the ``[object-expirer]`` section in the + +``/etc/swift/object-server.conf`` config. If the section or the config is missing +it will then look for and use the ``/etc/swift/object-expirer.conf`` config. +The latter config file is considered deprecated and is searched for to aid +in cluster upgrades. + +Delay Reaping of Objects from Disk +---------------------------------- + +Swift's expiring object ``x-delete-at`` feature can be used to have the cluster +reap user's objects automatically from disk on their behalf when they no longer +want them stored in their account. In some cases it may be necessary to +"intervene" in the expected expiration process to prevent accidental or +premature data loss if an object marked for expiration should NOT be deleted +immediately when it expires for whatever reason. In these cases +``swift-object-expirer`` offers configuration of a ``delay_reaping`` value +on accounts and containers, which provides a delay between when an object +is marked for deletion, or expired, and when it is actually reaped from disk. +When this is set in the object expirer config the object expirer leaves expired +objects on disk (and in container listings) for the ``delay_reaping`` time. +After this delay has passed objects will be reaped as normal. + +The ``delay_reaping`` value can be set either at an account level or a +container level. When set at an account level, the object expirer will +only reap objects within the account after the delay. A container level +``delay_reaping`` works similarly for containers and overrides an account +level ``delay_reaping`` value. + +The ``delay_reaping`` values are set in the ``[object-expirer]`` section in +either the object-server or object-expirer config files. They are configured +with dynamic config option names prefixed with ``delay_reaping_`` +at the account level and ``delay_reaping_/`` at the container +level, with the ``delay_reaping`` value in seconds. 
+ +Here is an example of ``delay_reaping`` configs in the``object-expirer`` +section in the ``object-server.conf``:: - [DEFAULT] - # swift_dir = /etc/swift - # user = swift - # You can specify default log routing here if you want: - # log_name = swift - # log_facility = LOG_LOCAL0 - # log_level = INFO - [object-expirer] - interval = 300 - - [pipeline:main] - pipeline = catch_errors cache proxy-server - - [app:proxy-server] - use = egg:swift#proxy - # See proxy-server.conf-sample for options - - [filter:cache] - use = egg:swift#memcache - # See proxy-server.conf-sample for options - - [filter:catch_errors] - use = egg:swift#catch_errors - # See proxy-server.conf-sample for options - -The daemon needs to run on a machine with access to all the backend servers in the cluster, but does not need proxy server or public access. The daemon will use its own internal proxy code instance to access the backend servers. + delay_reaping_AUTH_test = 300.0 + delay_reaping_AUTH_test2 = 86400.0 + delay_reaping_AUTH_test/test = 0.0 + delay_reaping_AUTH_test/test2 = 600.0 + +.. note:: + A container level ``delay_reaping`` value does not require an account level + ``delay_reaping`` value but overrides the account level value for the same + account if it exists. By default, no ``delay_reaping`` value is configured + for any accounts or containers. + +Accessing Objects After Expiration +---------------------------------- + +By default, objects that expire become inaccessible, even to the account owner. +The object may not have been deleted, but any GET/HEAD/POST client request for +the object will respond 404 Not Found after the ``x-delete-at`` timestamp +has passed. + +The ``swift-proxy-server`` offers the ability to globally configure a flag to +allow requests to access expired objects that have not yet been deleted. +When this flag is enabled, a user can make a GET, HEAD, or POST request with +the header ``x-open-expired`` set to true to access the expired object. + +The global configuration is an opt-in flag that can be set in the +``[proxy-server]`` section of the ``proxy-server.conf`` file. It is configured +with a single flag ``allow_open_expired`` set to true or false. By default, +this flag is set to false. + +Here is an example in the ``proxy-server`` section in ``proxy-server.conf``:: + + [proxy-server] + allow_open_expired = false + +To discover whether this flag is set, you can send a **GET** request to the +``/info`` :ref:`discoverability ` path. This will return +configuration data in JSON format where the value of ``allow_open_expired`` is +exposed. + +When using a temporary URL to access the object, this feature is not enabled. +This means that adding the header will not allow requests to temporary URLs +to access expired objects. + +Upgrading impact: General Task Queue vs Legacy Queue +---------------------------------------------------- + +The expirer daemon will be moving to a new general task-queue based design that +will divide the work across all object servers, as such only expirers defined +in the object-server config will be able to use the new system. + +The legacy object expirer config is documented in +``etc/object-expirer.conf-sample``. The alternative object-server config +section is documented in ``etc/object-server.conf-sample``. 
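For illustration, a minimal ``[object-expirer]`` section added to
``object-server.conf`` might look like the following (the values shown are
placeholders; see the sample config files for the full set of options, and see
below for the ``dequeue_from_legacy`` option)::

    [object-expirer]
    interval = 300
    concurrency = 4
    processes = 3
    process = 0
    dequeue_from_legacy = false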
+ +The parameters in both files are identical except for a new option in the +object-server ``[object-expirer]`` section, ``dequeue_from_legacy`` +which when set to ``True`` will tell the expirer that in addition to using +the new task queueing system to also check the legacy (soon to be deprecated) +queue. + +.. note:: + The new task-queue system has not been completed yet. So an expirer's with + ``dequeue_from_legacy`` set to ``False`` will currently do nothing. + +By default ``dequeue_from_legacy`` will be ``False``, it is necessary to +be set to ``True`` explicitly while migrating from the old expiring queue. + +Any expirer using the old config ``/etc/swift/object-expirer.conf`` will not +use the new general task queue. It'll ignore the ``dequeue_from_legacy`` +and will only check the legacy queue. Meaning it'll run as a legacy expirer. + +Why is this important? If you are currently running object-expirers on nodes +that are not object storage nodes, then for the time being they will still +work but only by dequeuing from the old queue. +When the new general task queue is introduced, expirers will be required to +run on the object servers so that any new objects added can be removed. +If you're in this situation, you can safely setup the new expirer +section in the ``object-server.conf`` to deal with the new queue and leave the +legacy expirers running elsewhere. + +However, if your old expirers are running on the object-servers, the most +common topology, then you would add the new section to all object servers, to +deal the new queue. In order to maintain the same number of expirers checking +the legacy queue, pick the same number of nodes as you previously had and turn +on ``dequeue_from_legacy`` on those nodes only. Also note on these nodes +you'd need to keep the legacy ``process`` and ``processes`` options to maintain +the concurrency level for the legacy queue. + +.. note:: + Be careful not to enable ``dequeue_from_legacy`` on too many expirers as + all legacy tasks are stored in a single hidden account and the same hidden + containers. On a large cluster one may inadvertently overload the + acccount/container servers handling the legacy expirer queue. + +.. note:: + When running legacy expirers, the daemon needs to run on a machine with + access to all the backend servers in the cluster, but does not need proxy + server or public access. The daemon will use its own internal proxy code + instance to access the backend servers. diff --git a/doc/source/overview_global_cluster.rst b/doc/source/overview_global_cluster.rst new file mode 100644 index 0000000000..2f1c40bf88 --- /dev/null +++ b/doc/source/overview_global_cluster.rst @@ -0,0 +1,149 @@ +=============== +Global Clusters +=============== + +-------- +Overview +-------- + +Swift's default configuration is currently designed to work in a +single region, where a region is defined as a group of machines with +high-bandwidth, low-latency links between them. However, configuration +options exist that make running a performant multi-region Swift +cluster possible. + +For the rest of this section, we will assume a two-region Swift +cluster: region 1 in San Francisco (SF), and region 2 in New York +(NY). Each region shall contain within it 3 zones, numbered 1, 2, and +3, for a total of 6 zones. + +.. _configuring_global_clusters: + +--------------------------- +Configuring Global Clusters +--------------------------- + +.. 
note:: + + The proxy-server configuration options described below can be given generic + settings in the ``[app:proxy-server]`` configuration section and/or given + specific settings for individual policies using + :ref:`proxy_server_per_policy_config`. + +~~~~~~~~~~~~~ +read_affinity +~~~~~~~~~~~~~ + +This setting, combined with sorting_method setting, makes the proxy +server prefer local backend servers for GET and HEAD requests over +non-local ones. For example, it is preferable for an SF proxy server +to service object GET requests by talking to SF object servers, as the +client will receive lower latency and higher throughput. + +By default, Swift randomly chooses one of the three replicas to give +to the client, thereby spreading the load evenly. In the case of a +geographically-distributed cluster, the administrator is likely to +prioritize keeping traffic local over even distribution of results. +This is where the read_affinity setting comes in. + +Example:: + + [app:proxy-server] + sorting_method = affinity + read_affinity = r1=100 + +This will make the proxy attempt to service GET and HEAD requests from +backends in region 1 before contacting any backends in region 2. +However, if no region 1 backends are available (due to replica +placement, failed hardware, or other reasons), then the proxy will +fall back to backend servers in other regions. + +Example:: + + [app:proxy-server] + sorting_method = affinity + read_affinity = r1z1=100, r1=200 + +This will make the proxy attempt to service GET and HEAD requests from +backends in region 1 zone 1, then backends in region 1, then any other +backends. If a proxy is physically close to a particular zone or +zones, this can provide bandwidth savings. For example, if a zone +corresponds to servers in a particular rack, and the proxy server is +in that same rack, then setting read_affinity to prefer reads from +within the rack will result in less traffic between the top-of-rack +switches. + +The read_affinity setting may contain any number of region/zone +specifiers; the priority number (after the equals sign) determines the +ordering in which backend servers will be contacted. A lower number +means higher priority. + +Note that read_affinity only affects the ordering of primary nodes +(see ring docs for definition of primary node), not the ordering of +handoff nodes. + +~~~~~~~~~~~~~~ +write_affinity +~~~~~~~~~~~~~~ + +This setting makes the proxy server prefer local backend servers for +object PUT requests over non-local ones. For example, it may be +preferable for an SF proxy server to service object PUT requests +by talking to SF object servers, as the client will receive lower +latency and higher throughput. However, if this setting is used, note +that a NY proxy server handling a GET request for an object that was +PUT using write affinity may have to fetch it across the WAN link, as +the object won't immediately have any replicas in NY. However, +replication will move the object's replicas to their proper homes in +both SF and NY. + +One potential issue with write_affinity is, end user may get 404 error when +deleting objects before replication. The write_affinity_handoff_delete_count +setting is used together with write_affinity in order to solve that issue. +With its default configuration, Swift will calculate the proper number of +handoff nodes to send requests to. 
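For example, to override the calculated default with an explicit handoff
count (the value here is purely illustrative)::

    [app:proxy-server]
    write_affinity = r1
    write_affinity_handoff_delete_count = 2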
+ +Note that only object PUT/DELETE requests are affected by the write_affinity +setting; POST, GET, HEAD, OPTIONS, and account/container PUT requests are +not affected. + +This setting lets you trade data distribution for throughput. If +write_affinity is enabled, then object replicas will initially be +stored all within a particular region or zone, thereby decreasing the +quality of the data distribution, but the replicas will be distributed +over fast WAN links, giving higher throughput to clients. Note that +the replicators will eventually move objects to their proper, +well-distributed homes. + +The write_affinity setting is useful only when you don't typically +read objects immediately after writing them. For example, consider a +workload of mainly backups: if you have a bunch of machines in NY that +periodically write backups to Swift, then odds are that you don't then +immediately read those backups in SF. If your workload doesn't look +like that, then you probably shouldn't use write_affinity. + +The write_affinity_node_count setting is only useful in conjunction +with write_affinity; it governs how many local object servers will be +tried before falling back to non-local ones. + +Example:: + + [app:proxy-server] + write_affinity = r1 + write_affinity_node_count = 2 * replicas + +Assuming 3 replicas, this configuration will make object PUTs try +storing the object's replicas on up to 6 disks ("2 * replicas") in +region 1 ("r1"). Proxy server tries to find 3 devices for storing the +object. While a device is unavailable, it queries the ring for the 4th +device and so on until 6th device. If the 6th disk is still unavailable, +the last replica will be sent to other region. It doesn't mean there'll +have 6 replicas in region 1. + + +You should be aware that, if you have data coming into SF faster than +your replicators are transferring it to NY, then your cluster's data +distribution will get worse and worse over time as objects pile up in SF. +If this happens, it is recommended to disable write_affinity and simply let +object PUTs traverse the WAN link, as that will naturally limit the +object growth rate to what your WAN link can handle. diff --git a/doc/source/overview_large_objects.rst b/doc/source/overview_large_objects.rst index 52176172e9..e96c87015f 100644 --- a/doc/source/overview_large_objects.rst +++ b/doc/source/overview_large_objects.rst @@ -1,3 +1,5 @@ +.. _large-objects: + ==================== Large Object Support ==================== @@ -13,88 +15,39 @@ special manifest file is created that, when downloaded, sends all the segments concatenated as a single object. This also offers much greater upload speed with the possibility of parallel uploads of the segments. -------------------------------------- -Using ``swift`` for Segmented Objects -------------------------------------- - -The quickest way to try out this feature is use the ``swift`` Swift Tool -included with the `python-swiftclient`_ library. You can use the ``-S`` -option to specify the segment size to use when splitting a large file. For -example:: +.. _dynamic-large-objects: - swift upload test_container -S 1073741824 large_file +.. _dlo-doc: -This would split the large_file into 1G segments and begin uploading those -segments in parallel. Once all the segments have been uploaded, ``swift`` will -then create the manifest file so the segments can be downloaded as one. 
+--------------------- +Dynamic Large Objects +--------------------- -So now, the following ``swift`` command would download the entire large object:: +.. automodule:: swift.common.middleware.dlo + :members: + :show-inheritance: - swift download test_container large_file +.. _static-large-objects: -``swift`` uses a strict convention for its segmented object -support. In the above example it will upload all the segments into a -second container named test_container_segments. These segments will -have names like large_file/1290206778.25/21474836480/00000000, -large_file/1290206778.25/21474836480/00000001, etc. +.. _slo-doc: -The main benefit for using a separate container is that the main container -listings will not be polluted with all the segment names. The reason for using -the segment name format of /// is so that an -upload of a new file with the same name won't overwrite the contents of the -first until the last moment when the manifest file is updated. +-------------------- +Static Large Objects +-------------------- -``swift`` will manage these segment files for you, deleting old segments on -deletes and overwrites, etc. You can override this behavior with the -``--leave-segments`` option if desired; this is useful if you want to have -multiple versions of the same large object available. - -.. _`python-swiftclient`: http://github.com/openstack/python-swiftclient +.. automodule:: swift.common.middleware.slo + :members: + :show-inheritance: ---------- Direct API ---------- -You can also work with the segments and manifests directly with HTTP -requests instead of having ``swift`` do that for you. You can just -upload the segments like you would any other object and the manifest -is just a zero-byte file with an extra ``X-Object-Manifest`` header. - -All the object segments need to be in the same container, have a common object -name prefix, and their names sort in the order they should be concatenated. -They don't have to be in the same container as the manifest file will be, which -is useful to keep container listings clean as explained above with ``swift``. - -The manifest file is simply a zero-byte file with the extra -``X-Object-Manifest: /`` header, where ```` is -the container the object segments are in and ```` is the common prefix -for all the segments. - -It is best to upload all the segments first and then create or update the -manifest. In this way, the full object won't be available for downloading until -the upload is complete. Also, you can upload a new set of segments to a second -location and then update the manifest to point to this new location. During the -upload of the new segments, the original manifest will still be available to -download the first set of segments. - -Here's an example using ``curl`` with tiny 1-byte segments:: - - # First, upload the segments - curl -X PUT -H 'X-Auth-Token: ' \ - http:///container/myobject/1 --data-binary '1' - curl -X PUT -H 'X-Auth-Token: ' \ - http:///container/myobject/2 --data-binary '2' - curl -X PUT -H 'X-Auth-Token: ' \ - http:///container/myobject/3 --data-binary '3' - - # Next, create the manifest file - curl -X PUT -H 'X-Auth-Token: ' \ - -H 'X-Object-Manifest: container/myobject/' \ - http:///container/myobject --data-binary '' - - # And now we can download the segments as a single object - curl -H 'X-Auth-Token: ' \ - http:///container/myobject +SLO support centers around the user generated manifest file. After the user +has uploaded the segments into their account a manifest file needs to be +built and uploaded. 
All object segments, must be at least 1 byte +in size. Please see the SLO docs for :ref:`slo-doc` further +details. ---------------- Additional Notes @@ -104,6 +57,10 @@ Additional Notes /`` header will be returned with the concatenated object so you can tell where it's getting its segments from. +* When updating a manifest object using a POST request, a + ``X-Object-Manifest`` header must be included for the object to + continue to behave as a manifest object. + * The response's ``Content-Length`` for a ``GET`` or ``HEAD`` on the manifest file will be the sum of all the segments in the ``/`` listing, dynamically. So, uploading additional segments after the manifest is @@ -117,11 +74,11 @@ Additional Notes * The response's ``ETag`` for a ``GET`` or ``HEAD`` on the manifest file will be the MD5 sum of the concatenated string of ETags for each of the segments - in the ``/`` listing, dynamically. Usually in Swift the - ETag is the MD5 sum of the contents of the object, and that holds true for - each segment independently. But, it's not feasible to generate such an ETag - for the manifest itself, so this method was chosen to at least offer change - detection. + in the manifest (for DLO, from the listing ``/``). + Usually in Swift the ETag is the MD5 sum of the contents of the object, and + that holds true for each segment independently. But it's not meaningful to + generate such an ETag for the manifest itself so this method was chosen to + at least offer change detection. .. note:: @@ -134,10 +91,10 @@ Additional Notes History ------- -Large object support has gone through various iterations before settling on -this implementation. +Dynamic large object support has gone through various iterations before +settling on this implementation. -The primary factor driving the limitation of object size in swift is +The primary factor driving the limitation of object size in Swift is maintaining balance among the partitions of the ring. To maintain an even dispersion of disk usage throughout the cluster the obvious storage pattern was to simply split larger objects into smaller segments, which could then be @@ -168,20 +125,32 @@ The current "user manifest" design was chosen in order to provide a transparent download of large objects to the client and still provide the uploading client a clean API to support segmented uploads. -Alternative "explicit" user manifest options were discussed which would have -required a pre-defined format for listing the segments to "finalize" the -segmented upload. While this may offer some potential advantages, it was -decided that pushing an added burden onto the client which could potentially -limit adoption should be avoided in favor of a simpler "API" (essentially just -the format of the 'X-Object-Manifest' header). - -During development it was noted that this "implicit" user manifest approach -which is based on the path prefix can be potentially affected by the eventual -consistency window of the container listings, which could theoretically cause -a GET on the manifest object to return an invalid whole object for that short -term. In reality you're unlikely to encounter this scenario unless you're -running very high concurrency uploads against a small testing environment -which isn't running the object-updaters or container-replicators. - -Like all of swift, Large Object Support is living feature which will continue -to improve and may change over time. +To meet an many use cases as possible Swift supports two types of large +object manifests. 
Dynamic and static large object manifests both support +the same idea of allowing the user to upload many segments to be later +downloaded as a single file. + +Dynamic large objects rely on a container listing to provide the manifest. +This has the advantage of allowing the user to add/removes segments from the +manifest at any time. It has the disadvantage of relying on eventually +consistent container listings. All three copies of the container dbs must +be updated for a complete list to be guaranteed. Also, all segments must +be in a single container, which can limit concurrent upload speed. + +Static large objects rely on a user provided manifest file. A user can +upload objects into multiple containers and then reference those objects +(segments) in a self generated manifest file. Future GETs to that file will +download the concatenation of the specified segments. This has the advantage of +being able to immediately download the complete object once the manifest has +been successfully PUT. Being able to upload segments into separate containers +also improves concurrent upload speed. It has the disadvantage that the +manifest is finalized once PUT. Any changes to it means it has to be replaced. + +Between these two methods the user has great flexibility in how (s)he chooses +to upload and retrieve large objects to Swift. Swift does not, however, stop +the user from harming themselves. In both cases the segments are deletable by +the user at any time. If a segment was deleted by mistake, a dynamic large +object, having no way of knowing it was ever there, would happily ignore the +deleted file and the user will get an incomplete file. A static large object +would, when failing to retrieve the object specified in the manifest, drop the +connection and the user would receive partial results. diff --git a/doc/source/overview_object_versioning.rst b/doc/source/overview_object_versioning.rst deleted file mode 100644 index c9355a9b51..0000000000 --- a/doc/source/overview_object_versioning.rst +++ /dev/null @@ -1,77 +0,0 @@ -================= -Object Versioning -================= - --------- -Overview --------- - -Object versioning in swift is implemented by setting a flag on the container -to tell swift to version all objects in the container. The flag is the -``X-Versions-Location`` header on the container, and its value is the -container where the versions are stored. It is recommended to use a different -``X-Versions-Location`` container for each container that is being versioned. - -When data is ``PUT`` into a versioned container (a container with the -versioning flag turned on), the existing data in the file is redirected to a -new object and the data in the ``PUT`` request is saved as the data for the -versioned object. The new object name (for the previous version) is -``//``, where ``length`` -is the 3-character zero-padded hexidecimal length of the ```` and -```` is the timestamp of when the previous version was created. - -A ``GET`` to a versioned object will return the current version of the object -without having to do any request redirects or metadata lookups. - -A ``POST`` to a versioned object will update the object metadata as normal, -but will not create a new version of the object. In other words, new versions -are only created when the content of the object changes. - -A ``DELETE`` to a versioned object will only remove the current version of the -object. If you have 5 total versions of the object, you must delete the -object 5 times to completely remove the object. 
- -Note: A large object manifest file cannot be versioned, but a large object -manifest may point to versioned segments. - --------------------------------------------------- -How to Enable Object Versioning in a Swift Cluster --------------------------------------------------- - -Set ``allow_versions`` to ``True`` in the container server config. - ------------------------ -Examples Using ``curl`` ------------------------ - -First, create a container with the ``X-Versions-Location`` header or add the -header to an existing container. Also make sure the container referenced by -the ``X-Versions-Location`` exists. In this example, the name of that -container is "versions":: - - curl -i -XPUT -H "X-Auth-Token: " \ - -H "X-Versions-Location: versions" http:///container - curl -i -XPUT -H "X-Auth-Token: " http:///versions - -Create an object (the first version):: - - curl -i -XPUT --data-binary 1 -H "X-Auth-Token: " \ - http:///container/myobject - -Now create a new version of that object:: - - curl -i -XPUT --data-binary 2 -H "X-Auth-Token: " \ - http:///container/myobject - -See a listing of the older versions of the object:: - - curl -i -H "X-Auth-Token: " \ - http:///versions?prefix=008myobject/ - -Now delete the current version of the object and see that the older version is -gone:: - - curl -i -XDELETE -H "X-Auth-Token: " \ - http:///container/myobject - curl -i -H "X-Auth-Token: " \ - http:///versions?prefix=008myobject/ diff --git a/doc/source/overview_policies.rst b/doc/source/overview_policies.rst new file mode 100644 index 0000000000..822db5037e --- /dev/null +++ b/doc/source/overview_policies.rst @@ -0,0 +1,680 @@ +================ +Storage Policies +================ + +Storage Policies allow for some level of segmenting the cluster for various +purposes through the creation of multiple object rings. The Storage Policies +feature is implemented throughout the entire code base so it is an important +concept in understanding Swift architecture. + +As described in :doc:`overview_ring`, Swift uses modified hashing rings to +determine where data should reside in the cluster. There is a separate ring for +account databases, container databases, and there is also one object ring per +storage policy. Each object ring behaves exactly the same way and is maintained +in the same manner, but with policies, different devices can belong to different +rings. By supporting multiple object rings, Swift allows the application and/or +deployer to essentially segregate the object storage within a single cluster. +There are many reasons why this might be desirable: + +* Different levels of durability: If a provider wants to offer, for example, + 2x replication and 3x replication but doesn't want to maintain 2 separate + clusters, they would setup a 2x and a 3x replication policy and assign the + nodes to their respective rings. Furthermore, if a provider wanted to offer a + cold storage tier, they could create an erasure coded policy. + +* Performance: Just as SSDs can be used as the exclusive members of an account + or database ring, an SSD-only object ring can be created as well and used to + implement a low-latency/high performance policy. + +* Collecting nodes into group: Different object rings may have different + physical servers so that objects in specific storage policies are always + placed in a particular data center or geography. 
+ +* Different Storage implementations: Another example would be to collect + together a set of nodes that use a different Diskfile (e.g., Kinetic, + GlusterFS) and use a policy to direct traffic just to those nodes. + +* Different read and write affinity settings: proxy-servers can be configured + to use different read and write affinity options for each policy. See + :ref:`proxy_server_per_policy_config` for more details. + +.. note:: + + Today, Swift supports two different policy types: Replication and Erasure + Code. See :doc:`overview_erasure_code` for details. + + Also note that Diskfile refers to backend object storage plug-in + architecture. See :doc:`development_ondisk_backends` for details. + +----------------------- +Containers and Policies +----------------------- + +Policies are implemented at the container level. There are many advantages to +this approach, not the least of which is how easy it makes life on +applications that want to take advantage of them. It also ensures that +Storage Policies remain a core feature of Swift independent of the auth +implementation. Policies were not implemented at the account/auth layer +because it would require changes to all auth systems in use by Swift +deployers. Each container has a new special immutable metadata element called +the storage policy index. Note that internally, Swift relies on policy +indexes and not policy names. Policy names exist for human readability and +translation is managed in the proxy. When a container is created, one new +optional header is supported to specify the policy name. If no name is +specified, the default policy is used (and if no other policies defined, +Policy-0 is considered the default). We will be covering the difference +between default and Policy-0 in the next section. + +Policies are assigned when a container is created. Once a container has been +assigned a policy, it cannot be changed (unless it is deleted/recreated). The +implications on data placement/movement for large datasets would make this a +task best left for applications to perform. Therefore, if a container has an +existing policy of, for example 3x replication, and one wanted to migrate that +data to an Erasure Code policy, the application would create another container +specifying the other policy parameters and then simply move the data from one +container to the other. Policies apply on a per container basis allowing for +minimal application awareness; once a container has been created with a specific +policy, all objects stored in it will be done so in accordance with that policy. +If a container with a specific name is deleted (requires the container be empty) +a new container may be created with the same name without any restriction on +storage policy enforced by the deleted container which previously shared the +same name. + +Containers have a many-to-one relationship with policies meaning that any number +of containers can share one policy. There is no limit to how many containers +can use a specific policy. + +The notion of associating a ring with a container introduces an interesting +scenario: What would happen if 2 containers of the same name were created with +different Storage Policies on either side of a network outage at the same time? +Furthermore, what would happen if objects were placed in those containers, a +whole bunch of them, and then later the network outage was restored? Well, +without special care it would be a big problem as an application could end up +using the wrong ring to try and find an object. 
Luckily there is a solution for +this problem, a daemon known as the Container Reconciler works tirelessly to +identify and rectify this potential scenario. + +-------------------- +Container Reconciler +-------------------- + +Because atomicity of container creation cannot be enforced in a +distributed eventually consistent system, object writes into the wrong +storage policy must be eventually merged into the correct storage policy +by an asynchronous daemon. Recovery from object writes during a network +partition which resulted in a split brain container created with +different storage policies are handled by the +`swift-container-reconciler` daemon. + +The container reconciler works off a queue similar to the +object-expirer. The queue is populated during container-replication. +It is never considered incorrect to enqueue an object to be evaluated by +the container-reconciler because if there is nothing wrong with the location +of the object the reconciler will simply dequeue it. The +container-reconciler queue is an indexed log for the real location of an +object for which a discrepancy in the storage policy of the container was +discovered. + +To determine the correct storage policy of a container, it is necessary +to update the status_changed_at field in the container_stat table when a +container changes status from deleted to re-created. This transaction +log allows the container-replicator to update the correct storage policy +both when replicating a container and handling REPLICATE requests. + +Because each object write is a separate distributed transaction it is +not possible to determine the correctness of the storage policy for each +object write with respect to the entire transaction log at a given +container database. As such, container databases will always record the +object write regardless of the storage policy on a per object row basis. +Object byte and count stats are tracked per storage policy in each +container and reconciled using normal object row merge semantics. + +The object rows are ensured to be fully durable during replication using +the normal container replication. After the container +replicator pushes its object rows to available primary nodes any +misplaced object rows are bulk loaded into containers based off the +object timestamp under the ``.misplaced_objects`` system account. The +rows are initially written to a handoff container on the local node, and +at the end of the replication pass the ``.misplaced_objects`` containers are +replicated to the correct primary nodes. + +The container-reconciler processes the ``.misplaced_objects`` containers in +descending order and reaps its containers as the objects represented by +the rows are successfully reconciled. The container-reconciler will +always validate the correct storage policy for enqueued objects using +direct container HEAD requests which are accelerated via caching. + +Because failure of individual storage nodes in aggregate is assumed to +be common at scale, the container-reconciler will make forward progress +with a simple quorum majority. During a combination of failures and +rebalances it is possible that a quorum could provide an incomplete +record of the correct storage policy - so an object write may have to be +applied more than once. Because storage nodes and container databases +will not process writes with an ``X-Timestamp`` less than or equal to +their existing record when objects writes are re-applied their timestamp +is slightly incremented. 
In order for this increment to be applied +transparently to the client a second vector of time has been added to +Swift for internal use. See :class:`~swift.common.utils.Timestamp`. + +As the reconciler applies object writes to the correct storage policy it +cleans up writes which no longer apply to the incorrect storage policy +and removes the rows from the ``.misplaced_objects`` containers. After all +rows have been successfully processed it sleeps and will periodically +check for newly enqueued rows to be discovered during container +replication. + +.. _default-policy: + +------------------------- +Default versus 'Policy-0' +------------------------- + +Storage Policies is a versatile feature intended to support both new and +pre-existing clusters with the same level of flexibility. For that reason, we +introduce the ``Policy-0`` concept which is not the same as the "default" +policy. As you will see when we begin to configure policies, each policy has +a single name and an arbitrary number of aliases (human friendly, +configurable) as well as an index (or simply policy number). Swift reserves +index 0 to map to the object ring that's present in all installations +(e.g., ``/etc/swift/object.ring.gz``). You can name this policy anything you +like, and if no policies are defined it will report itself as ``Policy-0``, +however you cannot change the index as there must always be a policy with +index 0. + +Another important concept is the default policy which can be any policy +in the cluster. The default policy is the policy that is automatically +chosen when a container creation request is sent without a storage +policy being specified. :ref:`configure-policy` describes how to set the +default policy. The difference from ``Policy-0`` is subtle but +extremely important. ``Policy-0`` is what is used by Swift when +accessing pre-storage-policy containers which won't have a policy - in +this case we would not use the default as it might not have the same +policy as legacy containers. When no other policies are defined, Swift +will always choose ``Policy-0`` as the default. + +In other words, default means "create using this policy if nothing else is +specified" and ``Policy-0`` means "use the legacy policy if a container doesn't +have one" which really means use ``object.ring.gz`` for lookups. + +.. note:: + + With the Storage Policy based code, it's not possible to create a + container that doesn't have a policy. If nothing is provided, Swift will + still select the default and assign it to the container. For containers + created before Storage Policies were introduced, the legacy Policy-0 will + be used. + +.. _deprecate-policy: + +-------------------- +Deprecating Policies +-------------------- + +There will be times when a policy is no longer desired; however simply +deleting the policy and associated rings would be problematic for existing +data. In order to ensure that resources are not orphaned in the cluster (left +on disk but no longer accessible) and to provide proper messaging to +applications when a policy needs to be retired, the notion of deprecation is +used. :ref:`configure-policy` describes how to deprecate a policy. 
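+
+As a quick preview of what this looks like (the policy name and index below
+are only examples), a policy is deprecated by adding a single option to its
+existing section in ``/etc/swift/swift.conf``::
+
+    [storage-policy:1]
+    name = silver
+    deprecated = yes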
+
+Swift's behavior with deprecated policies is as follows:
+
+* The deprecated policy will not appear in /info
+* PUT/GET/DELETE/POST/HEAD are still allowed on the pre-existing containers
+  created with a deprecated policy
+* Clients will get a ``400 Bad Request`` error when trying to create a new
+  container using the deprecated policy
+* Clients still have access to policy statistics via HEAD on pre-existing
+  containers
+
+.. note::
+
+    A policy cannot be both the default and deprecated. If you deprecate the
+    default policy, you must specify a new default.
+
+You can also use the deprecated feature to roll out new policies. If you
+want to test a new storage policy before making it generally available
+you could deprecate the policy when you initially roll the new
+configuration and rings out to all nodes. Being deprecated will render it
+unusable for new containers. To test it you will need to create a
+container with that storage policy, which will require a single proxy
+instance (or a set of proxy-servers which are only internally
+accessible) that has been one-off configured with the new policy NOT
+marked deprecated. Once the container has been created with the new
+storage policy any client authorized to use that container will be able
+to add and access data stored in that container in the new storage
+policy. When satisfied you can roll out a new ``swift.conf`` which does
+not mark the policy as deprecated to all nodes.
+
+.. _configure-policy:
+
+--------------------
+Configuring Policies
+--------------------
+
+.. note::
+
+    See :doc:`policies_saio` for a step by step guide on adding a policy to the
+    SAIO setup.
+
+It is important that the deployer have a solid understanding of the semantics
+for configuring policies. Configuring a policy is a three-step process:
+
+#. Edit your ``/etc/swift/swift.conf`` file to define your new policy.
+#. Create the corresponding policy object ring file.
+#. (Optional) Create policy-specific proxy-server configuration settings.
+
+Defining a policy
+-----------------
+
+Each policy is defined by a section in the ``/etc/swift/swift.conf`` file. The
+section name must be of the form ``[storage-policy:<N>]`` where ``<N>`` is the
+policy index. There's no reason other than readability that policy indexes be
+sequential but the following rules are enforced:
+
+* If a policy with index ``0`` is not declared and no other policies are
+  defined, Swift will create a default policy with index ``0``.
+* The policy index must be a non-negative integer.
+* Policy indexes must be unique.
+
+.. warning::
+
+    The index of a policy should never be changed once a policy has been
+    created and used. Changing a policy index may cause loss of access to data.
+
+Each policy section contains the following options:
+
+* ``name = <policy_name>`` (required)
+  - The primary name of the policy.
+  - Policy names are case insensitive.
+  - Policy names must contain only letters, digits or a dash.
+  - Policy names must be unique.
+  - Policy names can be changed.
+  - The name ``Policy-0`` can only be used for the policy with
+    index ``0``.
+  - To avoid confusion with policy indexes it is strongly recommended that
+    policy names are not numbers (e.g. '1'). However, for backwards
+    compatibility, names that are numbers are supported.
+* ``aliases = [<alias-a>[, <alias-b>, ...]]`` (optional)
+  - A comma-separated list of alternative names for the policy.
+  - The default value is an empty list (i.e. no aliases).
+  - All alias names must follow the rules for the ``name`` option.
+ - Aliases can be added to and removed from the list. + - Aliases can be useful to retain support for old primary names if the + primary name is changed. +* ``default = [true|false]`` (optional) + - If ``true`` then this policy will be used when the client does not + specify a policy. + - The default value is ``false``. + - The default policy can be changed at any time, by setting + ``default = true`` in the desired policy section. + - If no policy is declared as the default and no other policies are + defined, the policy with index ``0`` is set as the default; + - Otherwise, exactly one policy must be declared default. + - Deprecated policies cannot be declared the default. + - See :ref:`default-policy` for more information. +* ``deprecated = [true|false]`` (optional) + - If ``true`` then new containers cannot be created using this policy. + - The default value is ``false``. + - Any policy may be deprecated by adding the ``deprecated`` option to + the desired policy section. However, a deprecated policy may not also + be declared the default. Therefore, since there must always be a + default policy, there must also always be at least one policy which + is not deprecated. + - See :ref:`deprecate-policy` for more information. +* ``policy_type = [replication|erasure_coding]`` (optional) + - The option ``policy_type`` is used to distinguish between different + policy types. + - The default value is ``replication``. + - When defining an EC policy use the value ``erasure_coding``. +* ``diskfile_module = `` (optional) + - The option ``diskfile_module`` is used to load an alternate backend + object storage plug-in architecture. + - The default value is ``egg:swift#replication.fs`` or + ``egg:swift#erasure_coding.fs`` depending on the policy type. The scheme + and package name are optionals and default to ``egg`` and ``swift``. + +The EC policy type has additional required options. See +:ref:`using_ec_policy` for details. + +The following is an example of a properly configured ``swift.conf`` file. See +:doc:`policies_saio` for full instructions on setting up an all-in-one with +this example configuration.:: + + [swift-hash] + # random unique strings that can never change (DO NOT LOSE) + # Use only printable chars (python -c "import string; print(string.printable)") + swift_hash_path_prefix = changeme + swift_hash_path_suffix = changeme + + [storage-policy:0] + name = gold + aliases = yellow, orange + policy_type = replication + default = yes + + [storage-policy:1] + name = silver + policy_type = replication + diskfile_module = replication.fs + deprecated = yes + + +Creating a ring +--------------- + +Once ``swift.conf`` is configured for a new policy, a new ring must be created. +The ring tools are not policy name aware so it's critical that the correct +policy index be used when creating the new policy's ring file. Additional +object rings are created using ``swift-ring-builder`` in the same manner as the +legacy ring except that ``-N`` is appended after the word ``object`` in the +builder file name, where ``N`` matches the policy index used in ``swift.conf``. +So, to create the ring for policy index ``1``:: + + swift-ring-builder object-1.builder create 10 3 1 + +Continue to use the same naming convention when using ``swift-ring-builder`` to +add devices, rebalance etc. This naming convention is also used in the pattern +for per-policy storage node data directories. + +.. 
note:: + + The same drives can indeed be used for multiple policies and the details + of how that's managed on disk will be covered in a later section, it's + important to understand the implications of such a configuration before + setting one up. Make sure it's really what you want to do, in many cases + it will be, but in others maybe not. + + +Proxy server configuration (optional) +------------------------------------- + +The :ref:`proxy-server` configuration options related to read and write +affinity may optionally be overridden for individual storage policies. See +:ref:`proxy_server_per_policy_config` for more details. + + +-------------- +Using Policies +-------------- + +Using policies is very simple - a policy is only specified when a container is +initially created. There are no other API changes. Creating a container can +be done without any special policy information:: + + curl -v -X PUT -H 'X-Auth-Token: ' \ + http://127.0.0.1:8080/v1/AUTH_test/myCont0 + +Which will result in a container created that is associated with the +policy name 'gold' assuming we're using the swift.conf example from +above. It would use 'gold' because it was specified as the default. +Now, when we put an object into this container, it will get placed on +nodes that are part of the ring we created for policy 'gold'. + +If we wanted to explicitly state that we wanted policy 'gold' the command +would simply need to include a new header as shown below:: + + curl -v -X PUT -H 'X-Auth-Token: ' \ + -H 'X-Storage-Policy: gold' http://127.0.0.1:8080/v1/AUTH_test/myCont0 + +And that's it! The application does not need to specify the policy name ever +again. There are some illegal operations however: + +* If an invalid (typo, non-existent) policy is specified: 400 Bad Request +* if you try to change the policy either via PUT or POST: 409 Conflict + +If you'd like to see how the storage in the cluster is being used, simply HEAD +the account and you'll see not only the cumulative numbers, as before, but +per policy statistics as well. In the example below there's 3 objects total +with two of them in policy 'gold' and one in policy 'silver':: + + curl -i -X HEAD -H 'X-Auth-Token: ' \ + http://127.0.0.1:8080/v1/AUTH_test + +and your results will include (some output removed for readability):: + + X-Account-Container-Count: 3 + X-Account-Object-Count: 3 + X-Account-Bytes-Used: 21 + X-Storage-Policy-Gold-Object-Count: 2 + X-Storage-Policy-Gold-Bytes-Used: 14 + X-Storage-Policy-Silver-Object-Count: 1 + X-Storage-Policy-Silver-Bytes-Used: 7 + +-------------- +Under the Hood +-------------- + +Now that we've explained a little about what Policies are and how to +configure/use them, let's explore how Storage Policies fit in at the +nuts-n-bolts level. + +Parsing and Configuring +----------------------- + +The module, :ref:`storage_policy`, is responsible for parsing the +``swift.conf`` file, validating the input, and creating a global collection of +configured policies via class :class:`.StoragePolicyCollection`. This +collection is made up of policies of class :class:`.StoragePolicy`. The +collection class includes handy functions for getting to a policy either by +name or by index , getting info about the policies, etc. There's also one +very important function, :meth:`~.StoragePolicyCollection.get_object_ring`. +Object rings are members of the :class:`.StoragePolicy` class and are +actually not instantiated until the :meth:`~.StoragePolicy.load_ring` +method is called. 
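+
+A rough sketch of how a caller typically reaches a ring through this
+collection (the policy index and ``swift_dir`` value below are only
+illustrative)::
+
+    from swift.common.storage_policy import POLICIES
+
+    # Ring for policy index 1; load_ring() runs lazily the first time
+    # this policy's ring is requested.
+    object_ring = POLICIES.get_object_ring(1, '/etc/swift')
+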
Any caller anywhere in the code base that needs to access +an object ring must use the :data:`.POLICIES` global singleton to access the +:meth:`~.StoragePolicyCollection.get_object_ring` function and provide the +policy index which will call :meth:`~.StoragePolicy.load_ring` if +needed; however, when starting request handling services such as the +:ref:`proxy-server` rings are proactively loaded to provide moderate +protection against a mis-configuration resulting in a run time error. The +global is instantiated when Swift starts and provides a mechanism to patch +policies for the test code. + +Middleware +---------- + +Middleware can take advantage of policies through the :data:`.POLICIES` global +and by importing :func:`.get_container_info` to gain access to the policy index +associated with the container in question. From the index it can then use the +:data:`.POLICIES` singleton to grab the right ring. For example, +:ref:`list_endpoints` is policy aware using the means just described. Another +example is :ref:`recon` which will report the md5 sums for all of the rings. + +Proxy Server +------------ + +The :ref:`proxy-server` module's role in Storage Policies is essentially to make +sure the correct ring is used as its member element. Before policies, the one +object ring would be instantiated when the :class:`.Application` class was +instantiated and could be overridden by test code via init parameter. With +policies, however, there is no init parameter and the :class:`.Application` +class instead depends on the :data:`.POLICIES` global singleton to retrieve the +ring which is instantiated the first time it's needed. So, instead of an object +ring member of the :class:`.Application` class, there is an accessor function, +:meth:`~.Application.get_object_ring`, that gets the ring from +:data:`.POLICIES`. + +In general, when any module running on the proxy requires an object ring, it +does so via first getting the policy index from the cached container info. The +exception is during container creation where it uses the policy name from the +request header to look up policy index from the :data:`.POLICIES` global. Once +the proxy has determined the policy index, it can use the +:meth:`~.Application.get_object_ring` method described earlier to gain access to +the correct ring. It then has the responsibility of passing the index +information, not the policy name, on to the back-end servers via the header ``X +-Backend-Storage-Policy-Index``. Going the other way, the proxy also strips the +index out of headers that go back to clients, and makes sure they only see the +friendly policy names. + +On Disk Storage +--------------- + +Policies each have their own directories on the back-end servers and are +identified by their storage policy indexes. Organizing the back-end directory +structures by policy index helps keep track of things and also allows for +sharing of disks between policies which may or may not make sense depending on +the needs of the provider. 
More on this later, but for now be aware of the +following directory naming convention: + +* ``/objects`` maps to objects associated with Policy-0 +* ``/objects-N`` maps to storage policy index #N +* ``/async_pending`` maps to async pending update for Policy-0 +* ``/async_pending-N`` maps to async pending update for storage policy index #N +* ``/tmp`` maps to the DiskFile temporary directory for Policy-0 +* ``/tmp-N`` maps to the DiskFile temporary directory for policy index #N +* ``/quarantined/objects`` maps to the quarantine directory for Policy-0 +* ``/quarantined/objects-N`` maps to the quarantine directory for policy index #N + +Note that these directory names are actually owned by the specific Diskfile +implementation, the names shown above are used by the default Diskfile. + +Object Server +------------- + +The :ref:`object-server` is not involved with selecting the storage policy +placement directly. However, because of how back-end directory structures are +setup for policies, as described earlier, the object server modules do play a +role. When the object server gets a :class:`.Diskfile`, it passes in the +policy index and leaves the actual directory naming/structure mechanisms to +:class:`.Diskfile`. By passing in the index, the instance of +:class:`.Diskfile` being used will assure that data is properly located in the +tree based on its policy. + +For the same reason, the :ref:`object-updater` also is policy aware. As +previously described, different policies use different async pending directories +so the updater needs to know how to scan them appropriately. + +The :ref:`object-replicator` is policy aware in that, depending on the policy, +it may have to do drastically different things, or maybe not. For example, the +difference in handling a replication job for 2x versus 3x is trivial; however, +the difference in handling replication between 3x and erasure code is most +definitely not. In fact, the term 'replication' really isn't appropriate for +some policies like erasure code; however, the majority of the framework for +collecting and processing jobs is common. Thus, those functions in the +replicator are leveraged for all policies and then there is policy specific code +required for each policy, added when the policy is defined if needed. + +The ssync functionality is policy aware for the same reason. Some of the +other modules may not obviously be affected, but the back-end directory +structure owned by :class:`.Diskfile` requires the policy index +parameter. Therefore ssync being policy aware really means passing the +policy index along. See :class:`~swift.obj.ssync_sender` and +:class:`~swift.obj.ssync_receiver` for more information on ssync. + +For :class:`.Diskfile` itself, being policy aware is all about managing the +back-end structure using the provided policy index. In other words, callers who +get a :class:`.Diskfile` instance provide a policy index and +:class:`.Diskfile`'s job is to keep data separated via this index (however it +chooses) such that policies can share the same media/nodes if desired. The +included implementation of :class:`.Diskfile` lays out the directory structure +described earlier but that's owned within :class:`.Diskfile`; external modules +have no visibility into that detail. A common function is provided to map +various directory names and/or strings based on their policy index. 
For example +:class:`.Diskfile` defines :func:`~swift.obj.diskfile.get_data_dir` which builds +off of a generic :func:`.get_policy_string` to consistently build policy aware +strings for various usage. + +Container Server +---------------- + +The :ref:`container-server` plays a very important role in Storage Policies, it +is responsible for handling the assignment of a policy to a container and the +prevention of bad things like changing policies or picking the wrong policy to +use when nothing is specified (recall earlier discussion on Policy-0 versus +default). + +The :ref:`container-updater` is policy aware, however its job is very simple, to +pass the policy index along to the :ref:`account-server` via a request header. + +The :ref:`container-backend` is responsible for both altering existing DB +schema as well as assuring new DBs are created with a schema that supports +storage policies. The "on-demand" migration of container schemas allows Swift +to upgrade without downtime (sqlite's alter statements are fast regardless of +row count). To support rolling upgrades (and downgrades) the incompatible +schema changes to the ``container_stat`` table are made to a +``container_info`` table, and the ``container_stat`` table is replaced with a +view that includes an ``INSTEAD OF UPDATE`` trigger which makes it behave like +the old table. + +The policy index is stored here for use in reporting information +about the container as well as managing split-brain scenario induced +discrepancies between containers and their storage policies. Furthermore, +during split-brain, containers must be prepared to track object updates from +multiple policies so the object table also includes a +``storage_policy_index`` column. Per-policy object counts and bytes are +updated in the ``policy_stat`` table using ``INSERT`` and ``DELETE`` triggers +similar to the pre-policy triggers that updated ``container_stat`` directly. + +The :ref:`container-replicator` daemon will pro-actively migrate legacy +schemas as part of its normal consistency checking process when it updates the +``reconciler_sync_point`` entry in the ``container_info`` table. This ensures +that read heavy containers which do not encounter any writes will still get +migrated to be fully compatible with the post-storage-policy queries without +having to fall back and retry queries with the legacy schema to service +container read requests. + +The :ref:`container-sync-daemon` functionality only needs to be policy aware in +that it accesses the object rings. Therefore, it needs to pull the policy index +out of the container information and use it to select the appropriate object +ring from the :data:`.POLICIES` global. + +Account Server +-------------- + +The :ref:`account-server`'s role in Storage Policies is really limited to +reporting. When a HEAD request is made on an account (see example provided +earlier), the account server is provided with the storage policy index and +builds the ``object_count`` and ``byte_count`` information for the client on a +per policy basis. + +The account servers are able to report per-storage-policy object and byte +counts because of some policy specific DB schema changes. A policy specific +table, ``policy_stat``, maintains information on a per policy basis (one row +per policy) in the same manner in which the ``account_stat`` table does. The +``account_stat`` table still serves the same purpose and is not replaced by +``policy_stat``, it holds the total account stats whereas ``policy_stat`` just +has the break downs. 
The backend is also responsible for migrating +pre-storage-policy accounts by altering the DB schema and populating the +``policy_stat`` table for Policy-0 with current ``account_stat`` data at that +point in time. + +The per-storage-policy object and byte counts are not updated with each object +PUT and DELETE request, instead container updates to the account server are +performed asynchronously by the ``swift-container-updater``. + +.. _upgrade-policy: + +Upgrading and Confirming Functionality +-------------------------------------- + +Upgrading to a version of Swift that has Storage Policy support is not +difficult, in fact, the cluster administrator isn't required to make any special +configuration changes to get going. Swift will automatically begin using the +existing object ring as both the default ring and the Policy-0 ring. Adding the +declaration of policy 0 is totally optional and in its absence, the name given +to the implicit policy 0 will be 'Policy-0'. Let's say for testing purposes +that you wanted to take an existing cluster that already has lots of data on it +and upgrade to Swift with Storage Policies. From there you want to go ahead and +create a policy and test a few things out. All you need to do is: + +#. Upgrade all of your Swift nodes to a policy-aware version of Swift +#. Define your policies in ``/etc/swift/swift.conf`` +#. Create the corresponding object rings +#. Create containers and objects and confirm their placement is as expected + +For a specific example that takes you through these steps, please see +:doc:`policies_saio` + +.. note:: + + If you downgrade from a Storage Policy enabled version of Swift to an + older version that doesn't support policies, you will not be able to + access any data stored in policies other than the policy with index 0 but + those objects WILL appear in container listings (possibly as duplicates if + there was a network partition and un-reconciled objects). It is EXTREMELY + important that you perform any necessary integration testing on the + upgraded deployment before enabling an additional storage policy to ensure + a consistent API experience for your clients. DO NOT downgrade to a + version of Swift that does not support storage policies once you expose + multiple storage policies. diff --git a/doc/source/overview_reaper.rst b/doc/source/overview_reaper.rst index 0488a92863..7e791c31f0 100644 --- a/doc/source/overview_reaper.rst +++ b/doc/source/overview_reaper.rst @@ -18,7 +18,7 @@ account-server.conf to delay the actual deletion of data. At this time, there is no utility to undelete an account; one would have to update the account database replicas directly, setting the status column to an empty string and updating the put_timestamp to be greater than the delete_timestamp. (On the -TODO list is writing a utility to perform this task, preferably through a ReST +TODO list is writing a utility to perform this task, preferably through a REST call.) The account reaper runs on each account server and scans the server @@ -40,6 +40,12 @@ troublesome spot. The account reaper will keep trying to delete an account until it eventually becomes empty, at which point the database reclaim process within the db_replicator will eventually remove the database files. +Sometimes a persistent error state can prevent some object or container +from being deleted. If this happens, you will see a message such as "Account + has not been reaped since " in the log. 
You can control when +this is logged with the reap_warn_after value in the [account-reaper] section +of the account-server.conf file. By default this is 30 days. + ------- History ------- @@ -47,7 +53,7 @@ History At first, a simple approach of deleting an account through completely external calls was considered as it required no changes to the system. All data would simply be deleted in the same way the actual user would, through the public -ReST API. However, the downside was that it would use proxy resources and log +REST API. However, the downside was that it would use proxy resources and log everything when it didn't really need to. Also, it would likely need a dedicated server or two, just for issuing the delete requests. diff --git a/doc/source/overview_replication.rst b/doc/source/overview_replication.rst index ab2b2c1523..ad9d78fc01 100644 --- a/doc/source/overview_replication.rst +++ b/doc/source/overview_replication.rst @@ -2,39 +2,168 @@ Replication =========== -Since each replica in swift functions independently, and clients generally require only a simple majority of nodes responding to consider an operation successful, transient failures like network partitions can quickly cause replicas to diverge. These differences are eventually reconciled by asynchronous, peer-to-peer replicator processes. The replicator processes traverse their local filesystems, concurrently performing operations in a manner that balances load across physical disks. - -Replication uses a push model, with records and files generally only being copied from local to remote replicas. This is important because data on the node may not belong there (as in the case of handoffs and ring changes), and a replicator can't know what data exists elsewhere in the cluster that it should pull in. It's the duty of any node that contains data to ensure that data gets to where it belongs. Replica placement is handled by the ring. - -Every deleted record or file in the system is marked by a tombstone, so that deletions can be replicated alongside creations. These tombstones are cleaned up by the replication process after a period of time referred to as the consistency window, which is related to replication duration and how long transient failures can remove a node from the cluster. Tombstone cleanup must be tied to replication to reach replica convergence. - -If a replicator detects that a remote drive is has failed, it will use the ring's "get_more_nodes" interface to choose an alternate node to synchronize with. The replicator can maintain desired levels of replication in the face of disk failures, though some replicas may not be in an immediately usable location. Note that the replicator doesn't maintain desired levels of replication in the case of other failures (e.g. entire node failures) because the most of such failures are transient. - -Replication is an area of active development, and likely rife with potential improvements to speed and correctness. - -There are two major classes of replicator - the db replicator, which replicates accounts and containers, and the object replicator, which replicates object data. - +Because each replica in Swift functions independently, and clients generally +require only a simple majority of nodes responding to consider an operation +successful, transient failures like network partitions can quickly cause +replicas to diverge. These differences are eventually reconciled by +asynchronous, peer-to-peer replicator processes. 
The replicator processes +traverse their local filesystems, concurrently performing operations in a +manner that balances load across physical disks. + +Replication uses a push model, with records and files generally only being +copied from local to remote replicas. This is important because data on the +node may not belong there (as in the case of handoffs and ring changes), and a +replicator can't know what data exists elsewhere in the cluster that it should +pull in. It's the duty of any node that contains data to ensure that data gets +to where it belongs. Replica placement is handled by the ring. + +Every deleted record or file in the system is marked by a tombstone, so that +deletions can be replicated alongside creations. The replication process cleans +up tombstones after a time period known as the consistency window. +The consistency window encompasses replication duration and how long transient +failure can remove a node from the cluster. Tombstone cleanup must +be tied to replication to reach replica convergence. + +If a replicator detects that a remote drive has failed, the replicator uses +the get_more_nodes interface for the ring to choose an alternate node with +which to synchronize. The replicator can maintain desired levels of replication +in the face of disk failures, though some replicas may not be in an immediately +usable location. Note that the replicator doesn't maintain desired levels of +replication when other failures, such as entire node failures, occur because +most failure are transient. + +Replication is an area of active development, and likely rife with potential +improvements to speed and correctness. + +There are two major classes of replicator - the db replicator, which +replicates accounts and containers, and the object replicator, which +replicates object data. -------------- DB Replication -------------- -The first step performed by db replication is a low-cost hash comparison to find out whether or not two replicas already match. Under normal operation, this check is able to verify that most databases in the system are already synchronized very quickly. If the hashes differ, the replicator brings the databases in sync by sharing records added since the last sync point. +The first step performed by db replication is a low-cost hash comparison to +determine whether two replicas already match. Under normal operation, +this check is able to verify that most databases in the system are already +synchronized very quickly. If the hashes differ, the replicator brings the +databases in sync by sharing records added since the last sync point. -This sync point is a high water mark noting the last record at which two databases were known to be in sync, and is stored in each database as a tuple of the remote database id and record id. Database ids are unique amongst all replicas of the database, and record ids are monotonically increasing integers. After all new records have been pushed to the remote database, the entire sync table of the local database is pushed, so the remote database knows it's now in sync with everyone the local database has previously synchronized with. +This sync point is a high water mark noting the last record at which two +databases were known to be in sync, and is stored in each database as a tuple +of the remote database id and record id. Database ids are unique amongst all +replicas of the database, and record ids are monotonically increasing +integers. 
After all new records have been pushed to the remote database, the +entire sync table of the local database is pushed, so the remote database +can guarantee that it is in sync with everything with which the local database +has previously synchronized. -If a replica is found to be missing entirely, the whole local database file is transmitted to the peer using rsync(1) and vested with a new unique id. +If a replica is found to be missing entirely, the whole local database file is +transmitted to the peer using rsync(1) and vested with a new unique id. -In practice, DB replication can process hundreds of databases per concurrency setting per second (up to the number of available CPUs or disks) and is bound by the number of DB transactions that must be performed. +In practice, DB replication can process hundreds of databases per concurrency +setting per second (up to the number of available CPUs or disks) and is bound +by the number of DB transactions that must be performed. ------------------ Object Replication ------------------ -The initial implementation of object replication simply performed an rsync to push data from a local partition to all remote servers it was expected to exist on. While this performed adequately at small scale, replication times skyrocketed once directory structures could no longer be held in RAM. We now use a modification of this scheme in which a hash of the contents for each suffix directory is saved to a per-partition hashes file. The hash for a suffix directory is invalidated when the contents of that suffix directory are modified. - -The object replication process reads in these hash files, calculating any invalidated hashes. It then transmits the hashes to each remote server that should hold the partition, and only suffix directories with differing hashes on the remote server are rsynced. After pushing files to the remote server, the replication process notifies it to recalculate hashes for the rsynced suffix directories. - -Performance of object replication is generally bound by the number of uncached directories it has to traverse, usually as a result of invalidated suffix directory hashes. Using write volume and partition counts from our running systems, it was designed so that around 2% of the hash space on a normal node will be invalidated per day, which has experimentally given us acceptable replication speeds. - +The initial implementation of object replication simply performed an rsync to +push data from a local partition to all remote servers it was expected to +exist on. While this performed adequately at small scale, replication times +skyrocketed once directory structures could no longer be held in RAM. We now +use a modification of this scheme in which a hash of the contents for each +suffix directory is saved to a per-partition hashes file. The hash for a +suffix directory is invalidated when the contents of that suffix directory are +modified. + +The object replication process reads in these hash files, calculating any +invalidated hashes. It then transmits the hashes to each remote server that +should hold the partition, and only suffix directories with differing hashes +on the remote server are rsynced. After pushing files to the remote server, +the replication process notifies it to recalculate hashes for the rsynced +suffix directories. + +Performance of object replication is generally bound by the number of uncached +directories it has to traverse, usually as a result of invalidated suffix +directory hashes. 
Using write volume and partition counts from our running +systems, it was designed so that around 2% of the hash space on a normal node +will be invalidated per day, which has experimentally given us acceptable +replication speeds. + +.. _ssync: + +Work continues with a new ssync method where rsync is not used at all and +instead all-Swift code is used to transfer the objects. At first, this ssync +will just strive to emulate the rsync behavior. Once deemed stable it will open +the way for future improvements in replication since we'll be able to easily +add code in the replication path instead of trying to alter the rsync code +base and distributing such modifications. + +One of the first improvements planned is an "index.db" that will replace the +hashes.pkl. This will allow quicker updates to that data as well as more +streamlined queries. Quite likely we'll implement a better scheme than the +current one hashes.pkl uses (hash-trees, that sort of thing). + +Another improvement planned all along the way is separating the local disk +structure from the protocol path structure. This separation will allow ring +resizing at some point, or at least ring-doubling. + +Note that for objects being stored with an Erasure Code policy, the replicator +daemon is not involved. Instead, the reconstructor is used by Erasure Code +policies and is analogous to the replicator for Replication type policies. +See :doc:`overview_erasure_code` for complete information on both Erasure Code +support as well as the reconstructor. + +---------- +Hashes.pkl +---------- + +The hashes.pkl file is a key element for both replication and reconstruction +(for Erasure Coding). Both daemons use this file to determine if any kind of +action is required between nodes that are participating in the durability +scheme. The file itself is a pickled dictionary with slightly different +formats depending on whether the policy is Replication or Erasure Code. In +either case, however, the same basic information is provided between the +nodes. The dictionary contains a dictionary where the key is a suffix +directory name and the value is the MD5 hash of the directory listing for +that suffix. In this manner, the daemon can quickly identify differences +between local and remote suffix directories on a per partition basis as the +scope of any one hashes.pkl file is a partition directory. + +For Erasure Code policies, there is a little more information required. An +object's hash directory may contain multiple fragments of a single object in +the event that the node is acting as a handoff or perhaps if a rebalance is +underway. Each fragment of an object is stored with a fragment index, so +the hashes.pkl for an Erasure Code partition will still be a dictionary +keyed on the suffix directory name, however, the value is another dictionary +keyed on the fragment index with subsequent MD5 hashes for each one as +values. Some files within an object hash directory don't require a fragment +index so None is used to represent those. Below are examples of what these +dictionaries might look like. 
+ +Replication hashes.pkl:: + + {'a43': '72018c5fbfae934e1f56069ad4425627', + 'b23': '12348c5fbfae934e1f56069ad4421234'} + +Erasure Code hashes.pkl:: + + {'a43': {None: '72018c5fbfae934e1f56069ad4425627', + 2: 'b6dd6db937cb8748f50a5b6e4bc3b808'}, + 'b23': {None: '12348c5fbfae934e1f56069ad4421234', + 1: '45676db937cb8748f50a5b6e4bc34567'}} + + + + + +----------------------------- +Dedicated replication network +----------------------------- + +Swift has support for using dedicated network for replication traffic. +For more information see :ref:`Overview of dedicated replication network +`. diff --git a/doc/source/overview_ring.rst b/doc/source/overview_ring.rst index 9af54959db..d5415c2c7c 100644 --- a/doc/source/overview_ring.rst +++ b/doc/source/overview_ring.rst @@ -4,48 +4,67 @@ The Rings The rings determine where data should reside in the cluster. There is a separate ring for account databases, container databases, and individual -objects but each ring works in the same way. These rings are externally -managed, in that the server processes themselves do not modify the rings, they -are instead given new rings modified by other tools. - -The ring uses a configurable number of bits from a path's MD5 hash as a -partition index that designates a device. The number of bits kept from the hash -is known as the partition power, and 2 to the partition power indicates the -partition count. Partitioning the full MD5 hash ring allows other parts of the -cluster to work in batches of items at once which ends up either more efficient -or at least less complex than working with each item separately or the entire -cluster all at once. - -Another configurable value is the replica count, which indicates how many of -the partition->device assignments comprise a single ring. For a given partition -number, each replica's device will not be in the same zone as any other -replica's device. Zones can be used to group devices based on physical -locations, power separations, network separations, or any other attribute that -would lessen multiple replicas being unavailable at the same time. +object storage policies but each ring works in the same way. These rings are +externally managed. The server processes themselves do not modify the +rings; they are instead given new rings modified by other tools. + +The ring uses a configurable number of bits from the MD5 hash of an item's path +as a partition index that designates the device(s) on which that item should +be stored. The number of bits kept from the hash is known as the partition +power, and 2 to the partition power indicates the partition count. Partitioning +the full MD5 hash ring allows the cluster components to process resources in +batches. This ends up either more efficient or at least less complex than +working with each item separately or the entire cluster all at once. + +Another configurable value is the replica count, which indicates how many +devices to assign for each partition in the ring. By having multiple devices +responsible for each partition, the cluster can recover from drive or network +failures. + +Devices are added to the ring to describe the capacity available for +partition replica assignments. Devices are placed into failure domains +consisting of region, zone, and server. Regions can be used to describe +geographical systems characterized by lower bandwidth or higher latency between +machines in different regions. Many rings will consist of only a single +region. 
Zones can be used to group devices based on physical locations, power +separations, network separations, or any other attribute that would lessen +multiple replicas being unavailable at the same time. + +Devices are given a weight which describes the relative storage capacity +contributed by the device in comparison to other devices. + +When building a ring, replicas for each partition will be assigned to devices +according to the devices' weights. Additionally, each replica of a partition +will preferentially be assigned to a device whose failure domain does not +already have a replica for that partition. Only a single replica of a +partition may be assigned to each device - you must have at least as many +devices as replicas. + +.. _ring_builder: ------------ Ring Builder ------------ The rings are built and managed manually by a utility called the ring-builder. -The ring-builder assigns partitions to devices and writes an optimized Python -structure to a gzipped, serialized file on disk for shipping out to the servers. -The server processes just check the modification time of the file occasionally -and reload their in-memory copies of the ring structure as needed. Because of -how the ring-builder manages changes to the ring, using a slightly older ring -usually just means one of the three replicas for a subset of the partitions -will be incorrect, which can be easily worked around. - -The ring-builder also keeps its own builder file with the ring information and -additional data required to build future rings. It is very important to keep -multiple backup copies of these builder files. One option is to copy the -builder files out to every server while copying the ring files themselves. -Another is to upload the builder files into the cluster itself. Complete loss -of a builder file will mean creating a new ring from scratch, nearly all -partitions will end up assigned to different devices, and therefore nearly all -data stored will have to be replicated to new locations. So, recovery from a -builder file loss is possible, but data will definitely be unreachable for an -extended time. +The ring-builder assigns partitions to devices and writes an optimized +structure to a gzipped, serialized file on disk for shipping out to the +servers. The server processes check the modification time of the file +occasionally and reload their in-memory copies of the ring structure as needed. +Because of how the ring-builder manages changes to the ring, using a slightly +older ring usually just means that for a subset of the partitions the device +for one of the replicas will be incorrect, which can be easily worked around. + +The ring-builder also keeps a separate builder file which includes the ring +information as well as additional data required to build future rings. It is +very important to keep multiple backup copies of these builder files. One +option is to copy the builder files out to every server while copying the ring +files themselves. Another is to upload the builder files into the cluster +itself. Complete loss of a builder file will mean creating a new ring from +scratch, nearly all partitions will end up assigned to different devices, and +therefore nearly all data stored will have to be replicated to new locations. +So, recovery from a builder file loss is possible, but data will definitely be +unreachable for an extended time. ------------------- Ring Data Structure @@ -60,128 +79,316 @@ to calculate the partition for the hash. 
List of Devices *************** -The list of devices is known internally to the Ring class as devs. Each item in -the list of devices is a dictionary with the following keys: - -====== ======= ============================================================== -id integer The index into the list devices. -zone integer The zone the devices resides in. -weight float The relative weight of the device in comparison to other - devices. This usually corresponds directly to the amount of - disk space the device has compared to other devices. For - instance a device with 1 terabyte of space might have a weight - of 100.0 and another device with 2 terabytes of space might - have a weight of 200.0. This weight can also be used to bring - back into balance a device that has ended up with more or less - data than desired over time. A good average weight of 100.0 - allows flexibility in lowering the weight later if necessary. -ip string The IP address of the server containing the device. -port int The TCP port the listening server process uses that serves - requests for the device. -device string The on disk name of the device on the server. - For example: sdb1 -meta string A general-use field for storing additional information for the - device. This information isn't used directly by the server - processes, but can be useful in debugging. For example, the - date and time of installation and hardware manufacturer could - be stored here. -====== ======= ============================================================== - -Note: The list of devices may contain holes, or indexes set to None, for -devices that have been removed from the cluster. Generally, device ids are not -reused. Also, some devices may be temporarily disabled by setting their weight -to 0.0. To obtain a list of active devices (for uptime polling, for example) -the Python code would look like: ``devices = [device for device in self.devs if -device and device['weight']]`` +The list of devices is known internally to the Ring class as ``devs``. Each +item in the list of devices is a dictionary with the following keys: + +.. table:: + :widths: 10 10 80 + + ====== ======= ============================================================== + id integer The index into the list of devices. + zone integer The zone in which the device resides. + region integer The region in which the zone resides. + weight float The relative weight of the device in comparison to other + devices. This usually corresponds directly to the amount of + disk space the device has compared to other devices. For + instance a device with 1 terabyte of space might have a weight + of 100.0 and another device with 2 terabytes of space might + have a weight of 200.0. This weight can also be used to bring + back into balance a device that has ended up with more or less + data than desired over time. A good average weight of 100.0 + allows flexibility in lowering the weight later if necessary. + ip string The IP address or hostname of the server containing the device. + port int The TCP port on which the server process listens to serve + requests for the device. + device string The on-disk name of the device on the server. + For example: ``sdb1`` + meta string A general-use field for storing additional information for the + device. This information isn't used directly by the server + processes, but can be useful in debugging. For example, the + date and time of installation and hardware manufacturer could + be stored here. 
+ ====== ======= ============================================================== + +.. note:: + The list of devices may contain holes, or indexes set to ``None``, for + devices that have been removed from the cluster. However, device ids are + reused. Device ids are reused to avoid potentially running out of device id + slots when there are available slots (from prior removal of devices). A + consequence of this device id reuse is that the device id (integer value) + does not necessarily correspond with the chronology of when the device was + added to the ring. Also, some devices may be temporarily disabled by + setting their weight to ``0.0``. To obtain a list of active devices (for + uptime polling, for example) the Python code would look like:: + + devices = list(self._iter_devs()) ************************* Partition Assignment List ************************* -This is a list of array('I') of devices ids. The outermost list contains an -array('I') for each replica. Each array('I') has a length equal to the -partition count for the ring. Each integer in the array('I') is an index into -the above list of devices. The partition list is known internally to the Ring -class as _replica2part2dev_id. +The partition assignment list is known internally to the Ring class as +``_replica2part2dev_id``. This is a list of ``array('H')``\s, one for each +replica. Each ``array('H')`` has a length equal to the partition count for the +ring. Each integer in the ``array('H')`` is an index into the above list of +devices. So, to create a list of device dictionaries assigned to a partition, the Python -code would look like: ``devices = [self.devs[part2dev_id[partition]] for -part2dev_id in self._replica2part2dev_id]`` +code would look like:: -array('I') is used for memory conservation as there may be millions of + devices = [self.devs[part2dev_id[partition]] + for part2dev_id in self._replica2part2dev_id] + +``array('H')`` is used for memory conservation as there may be millions of partitions. ********************* Partition Shift Value ********************* -The partition shift value is known internally to the Ring class as _part_shift. -This value used to shift an MD5 hash to calculate the partition on which the -data for that hash should reside. Only the top four bytes of the hash is used -in this process. For example, to compute the partition for the path -/account/container/object the Python code might look like: ``partition = -unpack_from('>I', md5('/account/container/object').digest())[0] >> -self._part_shift`` +The partition shift value is known internally to the Ring class as +``_part_shift``. This value is used to shift an MD5 hash of an item's path to +calculate the partition on which the data for that item should reside. Only the +top four bytes of the hash are used in this process. For example, to compute +the partition for the path ``/account/container/object``, the Python code might +look like:: + + objhash = md5('/account/container/object').digest() + partition = struct.unpack_from('>I', objhash)[0] >> self._part_shift + +For a ring generated with partition power ``P``, the partition shift value is +``32 - P``. + +******************* +Fractional Replicas +******************* + +A ring is not restricted to having an integer number of replicas. In order to +support the gradual changing of replica counts, the ring is able to have a real +number of replicas. 
+ +When the number of replicas is not an integer, the last element of +``_replica2part2dev_id`` will have a length that is less than the partition +count for the ring. This means that some partitions will have more replicas +than others. For example, if a ring has ``3.25`` replicas, then 25% of its +partitions will have four replicas, while the remaining 75% will have just +three. + +.. _ring_dispersion: + +********** +Dispersion +********** + +With each rebalance, the ring builder calculates a dispersion metric. This is +the percentage of partitions in the ring that have too many replicas within a +particular failure domain. + +For example, if you have three servers in a cluster but two replicas for a +partition get placed onto the same server, that partition will count towards +the dispersion metric. + +A lower dispersion value is better, and the value can be used to find the +proper value for "overload". + +.. _ring_overload: + +******** +Overload +******** + +The ring builder tries to keep replicas as far apart as possible while +still respecting device weights. When it can't do both, the overload +factor determines what happens. Each device may take some extra +fraction of its desired partitions to allow for replica dispersion; +once that extra fraction is exhausted, replicas will be placed closer +together than is optimal for durability. + +Essentially, the overload factor lets the operator trade off replica +dispersion (durability) against device balance (uniform disk usage). + +The default overload factor is ``0``, so device weights will be strictly +followed. + +With an overload factor of ``0.1``, each device will accept 10% more +partitions than it otherwise would, but only if needed to maintain +dispersion. + +Example: Consider a 3-node cluster of machines with equal-size disks; +let node A have 12 disks, node B have 12 disks, and node C have only +11 disks. Let the ring have an overload factor of ``0.1`` (10%). + +Without the overload, some partitions would end up with replicas only +on nodes A and B. However, with the overload, every device is willing +to accept up to 10% more partitions for the sake of dispersion. The +missing disk in C means there is one disk's worth of partitions that +would like to spread across the remaining 11 disks, which gives each +disk in C an extra 9.09% load. Since this is less than the 10% +overload, there is one replica of each partition on each node. + +However, this does mean that the disks in node C will have more data +on them than the disks in nodes A and B. If 80% full is the warning +threshold for the cluster, node C's disks will reach 80% full while A +and B's disks are only 72.7% full. + +------------------------------- +Partition & Replica Terminology +------------------------------- + +All descriptions of consistent hashing describe the process of breaking the +keyspace up into multiple ranges (vnodes, buckets, etc.) - many more than the +number of "nodes" to which keys in the keyspace must be assigned. Swift calls +these ranges `partitions` - they are partitions of the total keyspace. + +Each partition will have multiple replicas. Every replica of each partition +must be assigned to a device in the ring. When describing a specific replica +of a partition (like when it's assigned a device) it is described as a +`part-replica` in that it is a specific `replica` of the specific `partition`. 
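To make the term concrete, here is a minimal sketch (purely illustrative; the part power and replica count are invented values, not taken from any real ring) that enumerates every part-replica in a small ring::

    # Sketch: every replica of every partition is a "part-replica" that
    # must be assigned to some device in the ring.
    part_power = 10
    replica_count = 3
    part_count = 2 ** part_power

    part_replicas = [(part, replica)
                     for replica in range(replica_count)
                     for part in range(part_count)]
    # len(part_replicas) == replica_count * part_count == 3072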
+A single device will likely be assigned different replicas from many +partitions, but it may not be assigned multiple replicas of a single partition. + +The total number of partitions in a ring is calculated as ``2 ** +<part-power>``. The total number of part-replicas in a ring is calculated as +``<replica-count> * 2 ** <part-power>``. + +When considering a device's `weight` it is useful to describe the number of +part-replicas it would like to be assigned. A single device, regardless of +weight, will never hold more than ``2 ** <part-power>`` part-replicas because +it can not have more than one replica of any partition assigned. The number of +part-replicas a device can take by weights is calculated as its `parts-wanted`. +The true number of part-replicas assigned to a device can be compared to its +parts-wanted similarly to a calculation of percentage error - this deviation in +the observed result from the idealized target is called a device's `balance`. + +When considering a device's `failure domain` it is useful to describe the number +of part-replicas it would like to be assigned. The number of part-replicas +wanted in a failure domain of a tier is the sum of the part-replicas wanted in +the failure domains of its sub-tier. However, collectively when the total +number of part-replicas in a failure domain exceeds or is equal to ``2 ** +<part-power>`` it is most obvious that it's no longer sufficient to consider +only the number of total part-replicas, but rather the fraction of each +replica's partitions. Consider for example a ring with 3 replicas and 3 +servers: while dispersion requires that each server hold only ⅓ of the total +part-replicas, placement is additionally constrained to require ``1.0`` replica +of *each* partition per server. It would not be sufficient to satisfy +dispersion if two devices on one of the servers each held a replica of a single +partition, while another server held none. By considering a decimal fraction +of one replica's worth of partitions in a failure domain we can derive the +total part-replicas wanted in a failure domain (``1.0 * 2 ** <part-power>``). +Additionally we infer more about `which` part-replicas must go in the failure +domain. Consider a ring with three replicas and two zones, each with two +servers (four servers total). The three replicas worth of partitions will be +assigned into two failure domains at the zone tier. Each zone must hold more +than one replica of some partitions. We represent this improper fraction of a +replica's worth of partitions in decimal form as ``1.5`` (``3.0 / 2``). This +tells us not only the *number* of total partitions (``1.5 * 2 ** +<part-power>``) but also that *each* partition must have `at least` one replica +in this failure domain (in fact ``0.5`` of the partitions will have 2 +replicas). Within each zone the two servers will hold ``0.75`` of a replica's +worth of partitions - this is equal both to "the fraction of a replica's worth +of partitions assigned to each zone (``1.5``) divided evenly among the number +of failure domains in its sub-tier (2 servers in each zone, i.e. ``1.5 / 2``)" +but *also* "the total number of replicas (``3.0``) divided evenly among the +total number of failure domains in the server tier (2 servers × 2 zones = 4, +i.e. ``3.0 / 4``)". It is useful to consider that each server in this ring +will hold only ``0.75`` of a replica's worth of partitions which tells us that any +server should have `at most` one replica of a given partition assigned.
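The arithmetic in the example above can be sketched directly (illustrative only, assuming equal device weights; the real builder derives these values from the weights and topology recorded in the ring)::

    # Sketch: spread 3 replicas across 2 zones, each with 2 servers.
    def replicanths_per_tier(replica_count, zone_count, servers_per_zone):
        per_zone = replica_count / zone_count      # 3.0 / 2 = 1.5
        per_server = per_zone / servers_per_zone   # 1.5 / 2 = 0.75
        return per_zone, per_server

    per_zone, per_server = replicanths_per_tier(3.0, 2, 2)
    # per_zone == 1.5    -> each partition needs at least one replica per zone
    # per_server == 0.75 -> no server should hold two replicas of one partition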
In the +interests of brevity, some variable names will often refer to the concept +representing the fraction of a replica's worth of partitions in decimal form as +*replicanths* - this is meant to invoke connotations similar to ordinal numbers +as applied to fractions, but generalized to a replica instead of a four\*th* or +a fif\*th*. The "n" was probably thrown in because of Blade Runner. ----------------- Building the Ring ----------------- -The initial building of the ring first calculates the number of partitions that -should ideally be assigned to each device based the device's weight. For -example, if the partition power of 20 the ring will have 1,048,576 partitions. -If there are 1,000 devices of equal weight they will each desire 1,048.576 -partitions. The devices are then sorted by the number of partitions they desire -and kept in order throughout the initialization process. - -Then, the ring builder assigns each replica of each partition to the device -that desires the most partitions at that point while keeping it as far away as -possible from other replicas. The ring builder prefers to assign a replica to a -device in a zone that has no replicas already; should there be no such zone -available, the ring builder will try to find a device on a different server; -failing that, it will just look for a device that has no replicas; finally, if -all other options are exhausted, the ring builder will assign the replica to -the device that has the fewest replicas already assigned. - -When building a new ring based on an old ring, the desired number of partitions -each device wants is recalculated. Next the partitions to be reassigned are -gathered up. Any removed devices have all their assigned partitions unassigned -and added to the gathered list. Any partition replicas that (due to the -addition of new devices) can be spread out for better durability are unassigned -and added to the gathered list. Any devices that have more partitions than they -now desire have random partitions unassigned from them and added to the -gathered list. Lastly, the gathered partitions are then reassigned to devices -using a similar method as in the initial assignment described above. +First the ring builder calculates the replicanths wanted at each tier in the +ring's topology based on weight. + +Then the ring builder calculates the replicanths wanted at each tier in the +ring's topology based on dispersion. + +Then the ring builder calculates the maximum deviation on a single device +between its weighted replicanths and wanted replicanths. + +Next we interpolate between the two replicanth values (weighted & wanted) at +each tier using the specified overload (up to the maximum required overload). +It's a linear interpolation, similar to solving for a point on a line between +two points - we calculate the slope across the max required overload and then +calculate the intersection of the line with the desired overload. This +becomes the target. + +From the target we calculate the minimum and maximum number of replicas any +partition may have in a tier. This becomes the `replica-plan`. + +Finally, we calculate the number of partitions that should ideally be assigned +to each device based on the replica-plan.
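The interpolation step above can be sketched as follows (a simplified illustration, not the actual RingBuilder code; the per-tier bookkeeping is omitted and the numbers are invented)::

    # Sketch: move from the weighted replicanths toward the dispersion-wanted
    # replicanths in proportion to overload / max_required_overload.
    def target_replicanths(weighted, wanted, overload, max_required_overload):
        if max_required_overload <= 0:
            # No overload is required: weighted placement already disperses.
            return wanted
        overload = min(overload, max_required_overload)
        return weighted + (wanted - weighted) * (overload / max_required_overload)

    # A tier weighted for 0.9 replicanths that wants 1.0 for dispersion,
    # with overload 0.05 against a maximum required overload of 0.1:
    print(target_replicanths(0.9, 1.0, 0.05, 0.1))  # 0.95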
+ +On initial balance (i.e., the first time partitions are placed to generate a +ring) we must assign each replica of each partition to the device that desires +the most partitions excluding any devices that already have their maximum +number of replicas of that partition assigned to some parent tier of that +device's failure domain. + +When building a new ring based on an old ring, the desired number of +partitions each device wants is recalculated from the current replica-plan. +Next the partitions to be reassigned are gathered up. Any removed devices have +all their assigned partitions unassigned and added to the gathered list. Any +partition replicas that (due to the addition of new devices) can be spread out +for better durability are unassigned and added to the gathered list. Any +devices that have more partitions than they now desire have random partitions +unassigned from them and added to the gathered list. Lastly, the gathered +partitions are then reassigned to devices using a similar method as in the +initial assignment described above. Whenever a partition has a replica reassigned, the time of the reassignment is recorded. This is taken into account when gathering partitions to reassign so that no partition is moved twice in a configurable amount of time. This configurable amount of time is known internally to the RingBuilder class as -min_part_hours. This restriction is ignored for replicas of partitions on -devices that have been removed, as removing a device only happens on device +``min_part_hours``. This restriction is ignored for replicas of partitions on +devices that have been removed, as device removal should only happen on device failure and there's no choice but to make a reassignment. The above processes don't always perfectly rebalance a ring due to the random nature of gathering partitions for reassignment. To help reach a more balanced -ring, the rebalance process is repeated until near perfect (less 1% off) or -when the balance doesn't improve by at least 1% (indicating we probably can't -get perfect balance due to wildly imbalanced zones or too many partitions -recently moved). +ring, the rebalance process is repeated a fixed number of times until the +replica-plan is fulfilled or unable to be fulfilled (indicating we probably +can't get perfect balance due to too many partitions recently moved). + + +.. _composite_rings: --------------- Composite Rings --------------- See :ref:`composite_builder`. ********************************** swift-ring-composer (Experimental) ********************************** .. automodule:: swift.cli.ringcomposer --------------------- Ring Builder Analyzer --------------------- .. automodule:: swift.cli.ring_builder_analyzer ------- History ------- The ring code went through many iterations before arriving at what it is now -and while it has been stable for a while now, the algorithm may be tweaked or -perhaps even fundamentally changed if new ideas emerge. This section will try +and while it has largely been stable, the algorithm has seen a few tweaks or +perhaps even fundamental changes as new ideas emerge. This section will try to describe the previous ideas attempted and attempt to explain why they were discarded. A "live ring" option was considered where each server could maintain its own copy of the ring and the servers would use a gossip protocol to communicate the changes they made. This was discarded as too complex and error prone to code -correctly in the project time span available.
One bug could easily gossip bad +correctly in the project timespan available. One bug could easily gossip bad data out to the entire cluster and be difficult to recover from. Having an externally managed ring simplifies the process, allows full validation of data before it's shipped out to the servers, and guarantees each server is using a @@ -195,18 +402,18 @@ like the current process but where servers could submit change requests to the ring server to have a new ring built and shipped back out to the servers. This was discarded due to project time constraints and because ring changes are currently infrequent enough that manual control was sufficient. However, lack -of quick automatic ring changes did mean that other parts of the system had to -be coded to handle devices being unavailable for a period of hours until +of quick automatic ring changes did mean that other components of the system +had to be coded to handle devices being unavailable for a period of hours until someone could manually update the ring. The current ring process has each replica of a partition independently assigned to a device. A version of the ring that used a third of the memory was tried, where the first replica of a partition was directly assigned and the other two were determined by "walking" the ring until finding additional devices in other -zones. This was discarded as control was lost as to how many replicas for a -given partition moved at once. Keeping each replica independent allows for +zones. This was discarded due to the loss of control over how many replicas for +a given partition moved at once. Keeping each replica independent allows for moving only one partition replica within a given time window (except due to -device failures). Using the additional memory was deemed a good tradeoff for +device failures). Using the additional memory was deemed a good trade-off for moving data around the cluster much less often. Another ring design was tried where the partition to device assignments weren't @@ -219,16 +426,16 @@ add up. In the end, the memory savings wasn't that great and more processing power was used, so the idea was discarded. A completely non-partitioned ring was also tried but discarded as the -partitioning helps many other parts of the system, especially replication. +partitioning helps many other components of the system, especially replication. Replication can be attempted and retried in a partition batch with the other replicas rather than each data item independently attempted and retried. Hashes of directory structures can be calculated and compared with other replicas to reduce directory walking and network traffic. Partitioning and independently assigning partition replicas also allowed for -the best balanced cluster. The best of the other strategies tended to give -+-10% variance on device balance with devices of equal weight and +-15% with -devices of varying weights. The current strategy allows us to get +-3% and +-8% +the best-balanced cluster. The best of the other strategies tended to give +±10% variance on device balance with devices of equal weight and ±15% with +devices of varying weights. The current strategy allows us to get ±3% and ±8% respectively. Various hashing algorithms were tried. SHA offers better security, but the ring @@ -237,3 +444,19 @@ faster, but MD5 was built-in and hash computation is a small percentage of the overall request handling time. 
In all, once it was decided the servers wouldn't be maintaining the rings themselves anyway and only doing hash lookups, MD5 was chosen for its general availability, good distribution, and adequate speed. + +The placement algorithm has seen a number of behavioral changes for +unbalanceable rings. The ring builder wants to keep replicas as far apart as +possible while still respecting device weights. In most cases, the ring +builder can achieve both, but sometimes they conflict. At first, the behavior +was to keep the replicas far apart and ignore device weight, but that made it +impossible to gradually go from one region to two, or from two to three. Then +it was changed to favor device weight over dispersion, but that wasn't so good +for rings that were close to balanceable, like 3 machines with 60TB, 60TB, and +57TB of disk space; operators were expecting one replica per machine, but +didn't always get it. After that, overload was added to the ring builder so +that operators could choose a balance between dispersion and device weights. +In time the overload concept was improved and made more accurate. + +For more background on consistent hashing rings, please see +:doc:`ring_background`. diff --git a/doc/source/overview_ring_format.rst b/doc/source/overview_ring_format.rst new file mode 100644 index 0000000000..7f108c3d8c --- /dev/null +++ b/doc/source/overview_ring_format.rst @@ -0,0 +1,253 @@ +================= +Ring File Formats +================= + +The ring is the most important data structure in Swift. How this data structure +has been serialized to disk has changed over the years. + +Initially, ring files contained three key pieces of information: + +* the part_power value (often stored as ``part_shift := 32 - part_power``) + + * which determines how many partitions are in the ring, + +* the device list + + * which includes all the disks participating in the ring, and + +* the replica-to-part-to-device table + + * which has all ``replica_count * (2 ** part_power)`` partition assignments. + +But the desire to add more data structures +to the serialized ring has meant that a new ring v2 format has been created. + +Ring files have always been gzipped when serialized, though the inner, +raw format has evolved over the years. + +Ring v0 +------- + +Initially, rings were simply pickle dumps of the RingData object. `With +Swift 1.3.0 `__, this +changed to pickling a pure-stdlib data structure, but the core concept +was the same. + +.. note:: + + Swift 2.36.0 dropped support for v0 rings. + +Ring v1 +------- + +Pickle presented some problems, however. While `there are security +concerns `__ around unpickling +untrusted data, security boundaries are generally drawn such that rings are +assumed to be trusted. Ultimately, what pushed us to a new format were +`performance considerations `__. + +Starting in `Swift 1.7.0 `__, +Swift began using a new format (while still being willing to read the old one). +The new format starts with some magic so we may identify it as such:: + + +---------------+-------+ + |'R' '1' 'N' 'G'| | + +---------------+-------+ + +where ``<version>`` is a network-order two-byte version number (which is always 1). +After that, a JSON object is serialized as:: + + +---------------+-------...---+ + | | | + +---------------+-------...---+ + +where ``<json_len>`` is the network-order four-byte length (in bytes) of +``<json>``, which is the ASCII-encoded JSON-serialized object.
This object +has at minimum three keys: + +* ``devs`` for the device list +* ``part_shift`` (i.e., ``32 - part_power``) +* ``replica_count`` for the integer number of part-to-device rows to read + +The replica-to-part-to-device table then follows:: + + +-------+-------+...+-------+-------+ + | | |...| | | + +-------+-------+...+-------+-------+ + | | |...| | | + +-------+-------+...+-------+-------+ + | ... | + +-------+-------+...+-------+-------+ + | | |...| + +-------+-------+...+ + +Each ``<dev_id>`` is a host-order two-byte index into the ``devs`` list. Every row +except the last has exactly ``2 ** part_power`` entries; the last row may +have the same or fewer. + +The metadata object has proven quite versatile: new keys have been added +to provide additional information while remaining backwards-compatible. +In order, the following new fields have been added: + +* ``byteorder`` specifies whether the host-order for the + replica-to-part-to-device table is "big" or "little" endian. Added in + `Swift 2.12.0 `__, + this allows rings written on big-endian machines to be read on + little-endian machines and vice-versa. +* ``next_part_power`` indicates whether a partition-power increase is in + progress. Added in `Swift 2.15.0 `__, + this will have one of two values, if present: the ring's current + ``part_power``, indicating that there may be hardlinks to clean up, + or ``part_power + 1`` indicating that hardlinks may need to be created. + See :ref:`the documentation` + for more information. +* ``version`` specifies the version number of the ring-builder that was used + to write this ring. Added in `Swift 2.24.0 `__, + this allows comparing rings from different machines to determine + which is newer. + +Ring v2 +------- + +The way that v1 rings dealt with fractional replicas made it impossible +to reliably serialize additional large data structures after the +replica-to-part-to-device table. The v2 format has been designed to be +extensible. + +The new format starts with magic similar to v1:: + + +---------------+-------+ + |'R' '1' 'N' 'G'| | + +---------------+-------+ + +where ``<version>`` is again a network-order two-byte version number (which is now 2). +By bumping the version number, we ensure that old versions of Swift refuse to +read the ring, rather than misinterpret the content. + +After that, a series of BLOBs are serialized, each as:: + + +-------------------------------+-------...---+ + | | | + +-------------------------------+-------...---+ + +where ``<blob_len>`` is the network-order eight-byte length (in bytes) of +``<blob>``. Each BLOB is preceded by a ``Z_FULL_FLUSH`` to allow it to be +decompressed without reading the whole file. + +The order of the BLOBs isn't important, although they do tend to be written +in the order Swift will read them while loading. This reduces the disk seeks +necessary to load the ring. + +The final BLOB is an index: a JSON object mapping named sections to an array +of offsets within the file, like + +.. code:: + + { + section: [ + compressed start, + uncompressed start, + compressed end, + uncompressed end, + checksum method, + checksum value + ], + ... + } + +Section names may be arbitrary strings, but the "swift/" prefix is reserved +for upstream use. The start/end values mark the beginning and ending of the +section's BLOB. Note that some end values may be ``null`` if they were not +known when the index was written -- in particular, this *will* be true for +the index itself.
The checksum method should be one of ``"md5"``, ``"sha1"``, +``"sha256"``, or ``"sha512"``; other values will be ignored in anticipation +of a need to support further algorithms. The checksum value will be the +hex-encoded digest of the uncompressed section's bytes. Like end values, +checksum data may be ``null`` if not known when the index is written. + +Finally, a "tail" is written: + +* the gzip stream is flushed with another ``Z_FULL_FLUSH``, +* the stream is switched to uncompressed, +* the eight-byte offset of the uncompressed start of the index is written, +* the gzip stream is flushed with another ``Z_FULL_FLUSH``, +* the eight-byte offset of the compressed start of the index is written, +* the gzip stream is flushed with another ``Z_FULL_FLUSH``, and +* the gzip stream is closed; this involves: + + * flushing the underlying deflate stream with ``Z_FINISH`` + * writing ``CRC32`` (of the full uncompressed data) + * writing ``ISIZE`` (the length of the full uncompressed data ``mod 2 ** 32``) + +By switching to uncompressed, we can know exactly how many bytes will be +written in the tail, so that when reading we can quickly seek to and read the +index offset, seek to the index start, and read the index. From there we +can do similar things for any other section. + +In full, to locate and read the index, a reader can: + +* Seek to the end of the file +* Go back 31 bytes in the underlying file; this should leave us at the start of + the deflate block containing the offset for the compressed start +* Decompress 8 bytes from the deflate stream to get the location of the + compressed start of the index BLOB +* Seek to that location +* Read/decompress the size of the index BLOB +* Read/decompress the json serialized index. + +.. note:: This 31 bytes is the deflate block containing the 8 byte location, + a ``Z_FULL_FLUSH`` block, the ``Z_FINISH`` block, and the ``CRC32`` and + ``ISIZE``. For more information, see `RFC 1951`_ (for the deflate stream) + and `RFC 1952`_ (for the gzip format). + +The currently defined sections and section names upstream are as follows: + +* ``swift/index`` - The swift index +* ``swift/ring/metadata`` - Ring metadata serialized as JSON +* ``swift/ring/devices`` - The JSON-serialized device list. + + * This has been separated from the ring metadata structure in v1 as it + gets large + +* ``swift/ring/assignments`` - The ring replica2part2dev_id data structure + +.. note:: + Third parties may find it useful to add their own sections; however, + the ``swift/`` prefix is reserved for future upstream enhancements. + +swift/ring/metadata +~~~~~~~~~~~~~~~~~~~ +This BLOB is an ASCII-encoded JSON object full of metadata, similar +to v1 rings. It has the following required keys: + +* ``part_shift`` +* ``dev_id_bytes`` specifies the number of bytes used for each ``<dev_id>`` in the + replica-to-part-to-device table; will be one of 2, 4, or 8 + +Additionally, there are several optional keys which may be present: + +* ``next_part_power`` +* ``version`` + +Notice that two keys are no longer present: ``replica_count`` is no longer +needed as the size of the replica-to-part-to-device table is explicit, and +``byteorder`` is not needed as all data in v2 rings should be written using +network-order. + +swift/ring/devices +~~~~~~~~~~~~~~~~~~ +This BLOB contains a list of Swift device dictionaries. It was separated out +from the metadata BLOB as it can become a large structure in its own right. + +swift/ring/assignments +~~~~~~~~~~~~~~~~~~~~~~ +This BLOB is the replica-to-part-to-device table.
Its length will be +``replicas * (2 ** part_power) * dev_id_bytes``, where ``replicas`` is the exact +(potentially fractional) replica count for the ring. Unlike in v1, each +``<dev_id>`` is written using network-order. + +Note that this is why we increased the size of ``<dev_id>`` as compared to +the v1 format -- otherwise, we may not be able to represent rings with both +high ``replica_count`` and high ``part_power``. + +.. _RFC 1952: https://rfc-editor.org/rfc/rfc1952 +.. _RFC 1951: https://rfc-editor.org/rfc/rfc1951 diff --git a/doc/source/overview_wsgi_management.rst b/doc/source/overview_wsgi_management.rst new file mode 100644 index 0000000000..ab1098afe5 --- /dev/null +++ b/doc/source/overview_wsgi_management.rst @@ -0,0 +1,87 @@ +WSGI Server Process Management +============================== + +Graceful Shutdowns with ``SIGHUP`` +---------------------------------- + +Swift has always supported graceful WSGI server shutdown via ``SIGHUP``. +This causes the manager process to fall out of its +ensure-all-workers-are-running loop, close all workers' listen sockets, +and exit. Closing the listen sockets causes all new ``accept`` calls to +fail, but does not impact any established connections. + +The workers are re-parented, likely to PID 1, and are discoverable with +``swift-orphans``. When the ``accept`` call fails, the worker waits for the +connection-handling ``GreenPool`` to complete, then exits. Each worker +continues processing the current request, then closes the connection. +Note that clients will get connection errors if they try to re-use a +connection for further requests. + +Prior to the introduction of seamless reloads (see below), a common +reload strategy was to perform a graceful shutdown followed by a fresh +service start. + +Seamless Reloads with ``SIGUSR1`` +--------------------------------- + +Beginning with Swift 2.24.0, WSGI servers support seamless reloads via +``SIGUSR1``. This allows servers to restart to pick up configuration or +code changes while being minimally disruptive to clients. The process +is as follows: + +.. image:: images/reload_process_tree_1.svg + +1. Manager process receives ``USR1`` signal. This causes the process to fall + out of its loop ensuring that all workers are running and instead begin + reloading. The workers continue servicing client requests as long as + their listen sockets remain open. + +.. image:: images/reload_process_tree_2.svg + +2. Manager process forks. The new child knows about all the existing + workers and their listen sockets; it will be responsible for closing + the old worker listen sockets so they stop accepting new connections. + +.. image:: images/reload_process_tree_3.svg + +3. Manager process re-exec's itself. It picks up new configuration and + code while maintaining the same PID as the old manager process. At + this point only the socket-closer is tracking the old workers, but + everything (including old workers) remains a child of the new manager + process. As a result, old workers are *not* discoverable with + ``swift-orphans``; ``swift-oldies`` may be useful, but will also find + the manager process. + +.. image:: images/reload_process_tree_4.svg + +4. New manager process forks off new workers, each with its own listen + socket. Once all workers have started and can accept new connections, + the manager notifies the socket-closer via a pipe. The socket-closer + closes the old worker listen sockets so they stop accepting new + connections, passes the list of old workers to the new manager, + then exits. + +.. 
image:: images/reload_process_tree_5.svg + +5. Old workers continue servicing any in-progress connections, while new + connections are picked up by new workers. Once an old worker completes + all of its outstanding requests, it exits. Beginning with Swift 2.35.0, + if any workers persist beyond ``stale_worker_timeout``, the new manager + will clean them up with ``KILL`` signals. + +.. image:: images/reload_process_tree_6.svg + +6. All old workers have now exited. Only new code and configs are in use. + +``swift-reload`` +---------------- + +Beginning with Swift 2.33.0, a new ``swift-reload`` helper is included +to help validate the reload process. Given a PID, it will + +1. Validate that the PID seems to belong to a Swift WSGI server manager + process, +2. Check that the config file used by that PID is currently valid, +3. Send the ``USR1`` signal to initiate a reload, and +4. Wait for the new workers to come up (indicating the reload is complete) + before exiting. diff --git a/doc/source/policies_saio.rst b/doc/source/policies_saio.rst new file mode 100644 index 0000000000..9a8f6ce2e8 --- /dev/null +++ b/doc/source/policies_saio.rst @@ -0,0 +1,177 @@ +=========================================== +Adding Storage Policies to an Existing SAIO +=========================================== + +Depending on when you downloaded your SAIO environment, it may already +be prepared with two storage policies that enable some basic functional +tests. In the event that you are adding a storage policy to an existing +installation, however, the following section will walk you through the +steps for setting up Storage Policies. Note that configuring more than +one storage policy on your development environment is recommended but +optional. Enabling multiple Storage Policies is very easy regardless of +whether you are working with an existing installation or starting a +brand new one. + +Now we will create two policies - the first one will be a standard triple +replication policy that we will also explicitly set as the default and +the second will be set up for reduced replication using a factor of 2x. +We will call the first one 'gold' and the second one 'silver'. In this +example both policies map to the same devices because it's important +for this sample implementation to be simple and easy +to understand; adding a bunch of new devices isn't really required +to implement a usable set of policies. + +1. To define your policies, add the following to your ``/etc/swift/swift.conf`` + file: + + .. code:: ini + + [storage-policy:0] + name = gold + aliases = yellow, orange + default = yes + + [storage-policy:1] + name = silver + + See :doc:`overview_policies` for detailed information on ``swift.conf`` policy + options. + +2. To create the object ring for the silver policy (index 1), add the following + to your ``bin/remakerings`` script and re-run it (your script may already have + these changes): + + .. code:: shell + + swift-ring-builder object-1.builder create 10 2 1 + swift-ring-builder object-1.builder add r1z1-127.0.0.1:6210/sdb1 1 + swift-ring-builder object-1.builder add r1z2-127.0.0.1:6220/sdb2 1 + swift-ring-builder object-1.builder add r1z3-127.0.0.1:6230/sdb3 1 + swift-ring-builder object-1.builder add r1z4-127.0.0.1:6240/sdb4 1 + swift-ring-builder object-1.builder rebalance + + Note that the reduced replication of the silver policy is only a function + of the replication parameter in the ``swift-ring-builder create`` command + and is not specified in ``/etc/swift/swift.conf``. + +3. 
Copy ``etc/container-reconciler.conf-sample`` to + ``/etc/swift/container-reconciler.conf`` and fix the user option: + + .. code:: shell + + cp etc/container-reconciler.conf-sample /etc/swift/container-reconciler.conf + sed -i "s/# user.*/user = $USER/g" /etc/swift/container-reconciler.conf + +------------------ +Using Policies +------------------ + +Setting up Storage Policies was very simple, and using them is even +simpler. In this section, we will run some commands to create a few +containers with different policies and store objects in them and see how +Storage Policies affect placement of data in Swift. + +1. We will be using the list_endpoints middleware to confirm object locations, + so enable that now in your ``proxy-server.conf`` file by adding it to the pipeline + and including the filter section as shown below (be sure to restart your proxy + after making these changes): + + .. code:: ini + + pipeline = catch_errors gatekeeper healthcheck proxy-logging cache bulk \ + slo dlo ratelimit crossdomain list-endpoints tempurl tempauth staticweb \ + container-quotas account-quotas proxy-logging proxy-server + + [filter:list-endpoints] + use = egg:swift#list_endpoints + +2. Check to see that your policies are reported via /info: + + .. code:: shell + + swift -A http://127.0.0.1:8080/auth/v1.0 -U test:tester -K testing info + + You should see this: (only showing the policy output here): + + .. code:: none + + policies: [{'aliases': 'gold, yellow, orange', 'default': True, + 'name': 'gold'}, {'aliases': 'silver', 'name': 'silver'}] + +3. Now create a container without specifying a policy; it will use the + default, 'gold'. Then put a test object in it (create the file ``file0.txt`` + with your favorite editor and add some content): + + .. code:: shell + + curl -v -X PUT -H 'X-Auth-Token: <token>' \ + http://127.0.0.1:8080/v1/AUTH_test/myCont0 + curl -X PUT -v -T file0.txt -H 'X-Auth-Token: <token>' \ + http://127.0.0.1:8080/v1/AUTH_test/myCont0/file0.txt + +4. Now confirm placement of the object with the :ref:`list_endpoints` middleware: + + .. code:: shell + + curl -X GET -v http://127.0.0.1:8080/endpoints/AUTH_test/myCont0/file0.txt + + You should see this: (note placement on expected devices): + + .. code:: json + + ["http://127.0.0.1:6230/sdb3/761/AUTH_test/myCont0/file0.txt", + "http://127.0.0.1:6210/sdb1/761/AUTH_test/myCont0/file0.txt", + "http://127.0.0.1:6220/sdb2/761/AUTH_test/myCont0/file0.txt"] + +5. Create a container using policy 'silver' and put a different file in it: + + .. code:: shell + + curl -v -X PUT -H 'X-Auth-Token: <token>' -H \ + "X-Storage-Policy: silver" \ + http://127.0.0.1:8080/v1/AUTH_test/myCont1 + curl -X PUT -v -T file1.txt -H 'X-Auth-Token: <token>' \ + http://127.0.0.1:8080/v1/AUTH_test/myCont1/ + +6. Confirm placement of the object for policy 'silver': + + .. code:: shell + + curl -X GET -v http://127.0.0.1:8080/endpoints/AUTH_test/myCont1/file1.txt + + You should see this: (note placement on expected devices): + + .. code:: json + + ["http://127.0.0.1:6210/sdb1/32/AUTH_test/myCont1/file1.txt", + "http://127.0.0.1:6240/sdb4/32/AUTH_test/myCont1/file1.txt"] + +7. Confirm account information with HEAD; make sure that your container-updater + service is running and has executed once since you performed the PUTs or the + account database won't be updated yet: + + .. code:: shell + + curl -i -X HEAD -H 'X-Auth-Token: <token>' \ + http://127.0.0.1:8080/v1/AUTH_test + + You should see something like this (note that total and per policy stats + object sizes will vary): + + .. 
code:: none + + HTTP/1.1 204 No Content + Content-Length: 0 + X-Account-Object-Count: 2 + X-Account-Bytes-Used: 174 + X-Account-Container-Count: 2 + X-Account-Storage-Policy-Gold-Object-Count: 1 + X-Account-Storage-Policy-Gold-Bytes-Used: 84 + X-Account-Storage-Policy-Silver-Object-Count: 1 + X-Account-Storage-Policy-Silver-Bytes-Used: 90 + X-Timestamp: 1397230339.71525 + Content-Type: text/plain; charset=utf-8 + Accept-Ranges: bytes + X-Trans-Id: tx96e7496b19bb44abb55a3-0053482c75 + X-Openstack-Request-Id: tx96e7496b19bb44abb55a3-0053482c75 + Date: Fri, 11 Apr 2014 17:55:01 GMT diff --git a/doc/source/proxy.rst b/doc/source/proxy.rst index 210480d7eb..405d06280d 100644 --- a/doc/source/proxy.rst +++ b/doc/source/proxy.rst @@ -4,6 +4,43 @@ Proxy ***** +.. _proxy-controllers: + +Proxy Controllers +================= + +Base +~~~~ + +.. automodule:: swift.proxy.controllers.base + :members: + :undoc-members: + :show-inheritance: + +Account +~~~~~~~ + +.. automodule:: swift.proxy.controllers.account + :members: + :undoc-members: + :show-inheritance: + +Container +~~~~~~~~~ + +.. automodule:: swift.proxy.controllers.container + :members: + :undoc-members: + :show-inheritance: + +Object +~~~~~~ + +.. automodule:: swift.proxy.controllers.obj + :members: + :undoc-members: + :show-inheritance: + .. _proxy-server: Proxy Server diff --git a/doc/source/ratelimit.rst b/doc/source/ratelimit.rst index 3b5f95bd03..81832f2bf2 100644 --- a/doc/source/ratelimit.rst +++ b/doc/source/ratelimit.rst @@ -1,8 +1,10 @@ +.. _ratelimit: + ============= Rate Limiting ============= -Rate limiting in swift is implemented as a pluggable middleware. Rate +Rate limiting in Swift is implemented as a pluggable middleware. Rate limiting is performed on requests that result in database writes to the account and container sqlite dbs. It uses memcached and is dependent on the proxy servers having highly synchronized time. The rate limits are @@ -15,38 +17,44 @@ Configuration All configuration is optional. If no account or container limits are provided there will be no rate limiting. Configuration available: -======================== ========= =========================================== -Option Default Description ------------------------- --------- ------------------------------------------- -clock_accuracy 1000 Represents how accurate the proxy servers' - system clocks are with each other. 1000 - means that all the proxies' clock are - accurate to each other within 1 - millisecond. No ratelimit should be - higher than the clock accuracy. -max_sleep_time_seconds 60 App will immediately return a 498 response - if the necessary sleep time ever exceeds - the given max_sleep_time_seconds. -log_sleep_time_seconds 0 To allow visibility into rate limiting set - this value > 0 and all sleeps greater than - the number will be logged. -rate_buffer_seconds 5 Number of seconds the rate counter can - drop and be allowed to catch up (at a - faster than listed rate). A larger number - will result in larger spikes in rate but - better average accuracy. -account_ratelimit 0 If set, will limit PUT and DELETE requests - to /account_name/container_name. - Number is in requests per second. -account_whitelist '' Comma separated lists of account names that - will not be rate limited. -account_blacklist '' Comma separated lists of account names that - will not be allowed. Returns a 497 response. -container_ratelimit_size '' When set with container_limit_x = r: - for containers of size x, limit requests - per second to r. 
Will limit PUT, DELETE, - and POST requests to /a/c/o. -======================== ========= =========================================== +================================ ======= ====================================== +Option Default Description +-------------------------------- ------- -------------------------------------- +clock_accuracy 1000 Represents how accurate the proxy + servers' system clocks are with each + other. 1000 means that all the + proxies' clock are accurate to each + other within 1 millisecond. No + ratelimit should be higher than the + clock accuracy. +max_sleep_time_seconds 60 App will immediately return a 498 + response if the necessary sleep time + ever exceeds the given + max_sleep_time_seconds. +log_sleep_time_seconds 0 To allow visibility into rate limiting + set this value > 0 and all sleeps + greater than the number will be + logged. +rate_buffer_seconds 5 Number of seconds the rate counter can + drop and be allowed to catch up (at a + faster than listed rate). A larger + number will result in larger spikes in + rate but better average accuracy. +account_ratelimit 0 If set, will limit PUT and DELETE + requests to + /account_name/container_name. Number + is in requests per second. +container_ratelimit_size '' When set with container_ratelimit_x = + r: for containers of size x, limit + requests per second to r. Will limit + PUT, DELETE, and POST requests to + /a/c/o. +container_listing_ratelimit_size '' When set with + container_listing_ratelimit_x = r: for + containers of size x, limit listing + requests per second to r. Will limit + GET requests to /a/c. +================================ ======= ====================================== The container rate limits are linearly interpolated from the values given. A sample container rate limiting could be: @@ -70,3 +78,35 @@ Container Size Rate Limit ================ ============ +----------------------------- +Account Specific Ratelimiting +----------------------------- + + +The above ratelimiting is to prevent the "many writes to a single container" +bottleneck from causing a problem. There could also be a problem where a single +account is just using too much of the cluster's resources. In this case, the +container ratelimits may not help because the customer could be doing thousands +of reqs/sec to distributed containers each getting a small fraction of the +total so those limits would never trigger. If a system administrator notices +this, he/she can set the X-Account-Sysmeta-Global-Write-Ratelimit on an account +and that will limit the total number of write requests (PUT, POST, DELETE, +COPY) that account can do for the whole account. This limit will be in addition +to the applicable account/container limits from above. This header will be +hidden from the user, because of the gatekeeper middleware, and can only be set +using a direct client to the account nodes. It accepts a float value and will +only limit requests if the value is > 0. + +------------------- +Black/White-listing +------------------- + +To blacklist or whitelist an account set: + +X-Account-Sysmeta-Global-Write-Ratelimit: BLACKLIST + +or + +X-Account-Sysmeta-Global-Write-Ratelimit: WHITELIST + +in the account headers. diff --git a/doc/source/replication_network.rst b/doc/source/replication_network.rst new file mode 100644 index 0000000000..6740ec8250 --- /dev/null +++ b/doc/source/replication_network.rst @@ -0,0 +1,527 @@ +.. 
_Dedicated-replication-network: + +============================= +Dedicated replication network +============================= + +------- +Summary +------- + +Swift's replication process is essential for consistency and availability of +data. By default, replication activity will use the same network interface as +other cluster operations. However, if a replication interface is set in the +ring for a node, that node will send replication traffic on its designated +separate replication network interface. Replication traffic includes REPLICATE +requests and rsync traffic. + +To separate the cluster-internal replication traffic from client traffic, +separate replication servers can be used. These replication servers are based +on the standard storage servers, but they listen on the replication IP and +only respond to REPLICATE requests. Storage servers can serve REPLICATE +requests, so an operator can transition to using a separate replication +network with no cluster downtime. + +Replication IP and port information is stored in the ring on a per-node basis. +These parameters will be used if they are present, but they are not required. +If this information does not exist or is empty for a particular node, the +node's standard IP and port will be used for replication. + +-------------------- +For SAIO replication +-------------------- + +#. Create new script in ``~/bin/`` (for example: ``remakerings_new``):: + + #!/bin/bash + set -e + cd /etc/swift + rm -f *.builder *.ring.gz backups/*.builder backups/*.ring.gz + swift-ring-builder object.builder create 10 3 1 + swift-ring-builder object.builder add z1-127.0.0.1:6210R127.0.0.1:6250/sdb1 1 + swift-ring-builder object.builder add z2-127.0.0.1:6220R127.0.0.1:6260/sdb2 1 + swift-ring-builder object.builder add z3-127.0.0.1:6230R127.0.0.1:6270/sdb3 1 + swift-ring-builder object.builder add z4-127.0.0.1:6240R127.0.0.1:6280/sdb4 1 + swift-ring-builder object.builder rebalance + swift-ring-builder object-1.builder create 10 2 1 + swift-ring-builder object-1.builder add z1-127.0.0.1:6210R127.0.0.1:6250/sdb1 1 + swift-ring-builder object-1.builder add z2-127.0.0.1:6220R127.0.0.1:6260/sdb2 1 + swift-ring-builder object-1.builder add z3-127.0.0.1:6230R127.0.0.1:6270/sdb3 1 + swift-ring-builder object-1.builder add z4-127.0.0.1:6240R127.0.0.1:6280/sdb4 1 + swift-ring-builder object-1.builder rebalance + swift-ring-builder object-2.builder create 10 6 1 + swift-ring-builder object-2.builder add z1-127.0.0.1:6210R127.0.0.1:6250/sdb1 1 + swift-ring-builder object-2.builder add z1-127.0.0.1:6210R127.0.0.1:6250/sdb5 1 + swift-ring-builder object-2.builder add z2-127.0.0.1:6220R127.0.0.1:6260/sdb2 1 + swift-ring-builder object-2.builder add z2-127.0.0.1:6220R127.0.0.1:6260/sdb6 1 + swift-ring-builder object-2.builder add z3-127.0.0.1:6230R127.0.0.1:6270/sdb3 1 + swift-ring-builder object-2.builder add z3-127.0.0.1:6230R127.0.0.1:6270/sdb7 1 + swift-ring-builder object-2.builder add z4-127.0.0.1:6240R127.0.0.1:6280/sdb4 1 + swift-ring-builder object-2.builder add z4-127.0.0.1:6240R127.0.0.1:6280/sdb8 1 + swift-ring-builder object-2.builder rebalance + swift-ring-builder container.builder create 10 3 1 + swift-ring-builder container.builder add z1-127.0.0.1:6211R127.0.0.1:6251/sdb1 1 + swift-ring-builder container.builder add z2-127.0.0.1:6221R127.0.0.1:6261/sdb2 1 + swift-ring-builder container.builder add z3-127.0.0.1:6231R127.0.0.1:6271/sdb3 1 + swift-ring-builder container.builder add z4-127.0.0.1:6241R127.0.0.1:6281/sdb4 1 + swift-ring-builder 
container.builder rebalance + swift-ring-builder account.builder create 10 3 1 + swift-ring-builder account.builder add z1-127.0.0.1:6212R127.0.0.1:6252/sdb1 1 + swift-ring-builder account.builder add z2-127.0.0.1:6222R127.0.0.1:6262/sdb2 1 + swift-ring-builder account.builder add z3-127.0.0.1:6232R127.0.0.1:6272/sdb3 1 + swift-ring-builder account.builder add z4-127.0.0.1:6242R127.0.0.1:6282/sdb4 1 + swift-ring-builder account.builder rebalance + + .. note:: + Syntax of adding device has been changed: ``R:`` + was added between ``z-:`` and ``/_ ``. + Added devices will use and for replication activities. + +#. Add next rows in ``/etc/rsyncd.conf``:: + + [account6252] + max connections = 25 + path = /srv/1/node/ + read only = false + lock file = /var/lock/account6252.lock + + [account6262] + max connections = 25 + path = /srv/2/node/ + read only = false + lock file = /var/lock/account6262.lock + + [account6272] + max connections = 25 + path = /srv/3/node/ + read only = false + lock file = /var/lock/account6272.lock + + [account6282] + max connections = 25 + path = /srv/4/node/ + read only = false + lock file = /var/lock/account6282.lock + + + [container6251] + max connections = 25 + path = /srv/1/node/ + read only = false + lock file = /var/lock/container6251.lock + + [container6261] + max connections = 25 + path = /srv/2/node/ + read only = false + lock file = /var/lock/container6261.lock + + [container6271] + max connections = 25 + path = /srv/3/node/ + read only = false + lock file = /var/lock/container6271.lock + + [container6281] + max connections = 25 + path = /srv/4/node/ + read only = false + lock file = /var/lock/container6281.lock + + + [object6250] + max connections = 25 + path = /srv/1/node/ + read only = false + lock file = /var/lock/object6250.lock + + [object6260] + max connections = 25 + path = /srv/2/node/ + read only = false + lock file = /var/lock/object6260.lock + + [object6270] + max connections = 25 + path = /srv/3/node/ + read only = false + lock file = /var/lock/object6270.lock + + [object6280] + max connections = 25 + path = /srv/4/node/ + read only = false + lock file = /var/lock/object6280.lock + +#. Restart rsync daemon:: + + service rsync restart + +#. Update configuration files in directories: + + * /etc/swift/object-server(files: 1.conf, 2.conf, 3.conf, 4.conf) + * /etc/swift/container-server(files: 1.conf, 2.conf, 3.conf, 4.conf) + * /etc/swift/account-server(files: 1.conf, 2.conf, 3.conf, 4.conf) + + delete all configuration options in section ``[<*>-replicator]`` + +#. 
Add configuration files for object-server, in ``/etc/swift/object-server/`` + + * 5.conf:: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + disable_fallocate = true + bind_port = 6250 + user = swift + log_facility = LOG_LOCAL2 + recon_cache_path = /var/cache/swift + + [pipeline:main] + pipeline = recon object-server + + [app:object-server] + use = egg:swift#object + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [object-replicator] + rsync_module = {replication_ip}::object{replication_port} + + * 6.conf:: + + [DEFAULT] + devices = /srv/2/node + mount_check = false + disable_fallocate = true + bind_port = 6260 + user = swift + log_facility = LOG_LOCAL3 + recon_cache_path = /var/cache/swift2 + + [pipeline:main] + pipeline = recon object-server + + [app:object-server] + use = egg:swift#object + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [object-replicator] + rsync_module = {replication_ip}::object{replication_port} + + * 7.conf:: + + [DEFAULT] + devices = /srv/3/node + mount_check = false + disable_fallocate = true + bind_port = 6270 + user = swift + log_facility = LOG_LOCAL4 + recon_cache_path = /var/cache/swift3 + + [pipeline:main] + pipeline = recon object-server + + [app:object-server] + use = egg:swift#object + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [object-replicator] + rsync_module = {replication_ip}::object{replication_port} + + * 8.conf:: + + [DEFAULT] + devices = /srv/4/node + mount_check = false + disable_fallocate = true + bind_port = 6280 + user = swift + log_facility = LOG_LOCAL5 + recon_cache_path = /var/cache/swift4 + + [pipeline:main] + pipeline = recon object-server + + [app:object-server] + use = egg:swift#object + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [object-replicator] + rsync_module = {replication_ip}::object{replication_port} + +#. 
Add configuration files for container-server, in ``/etc/swift/container-server/`` + + * 5.conf:: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + disable_fallocate = true + bind_port = 6251 + user = swift + log_facility = LOG_LOCAL2 + recon_cache_path = /var/cache/swift + + [pipeline:main] + pipeline = recon container-server + + [app:container-server] + use = egg:swift#container + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [container-replicator] + rsync_module = {replication_ip}::container{replication_port} + + * 6.conf:: + + [DEFAULT] + devices = /srv/2/node + mount_check = false + disable_fallocate = true + bind_port = 6261 + user = swift + log_facility = LOG_LOCAL3 + recon_cache_path = /var/cache/swift2 + + [pipeline:main] + pipeline = recon container-server + + [app:container-server] + use = egg:swift#container + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [container-replicator] + rsync_module = {replication_ip}::container{replication_port} + + * 7.conf:: + + [DEFAULT] + devices = /srv/3/node + mount_check = false + disable_fallocate = true + bind_port = 6271 + user = swift + log_facility = LOG_LOCAL4 + recon_cache_path = /var/cache/swift3 + + [pipeline:main] + pipeline = recon container-server + + [app:container-server] + use = egg:swift#container + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [container-replicator] + rsync_module = {replication_ip}::container{replication_port} + + * 8.conf:: + + [DEFAULT] + devices = /srv/4/node + mount_check = false + disable_fallocate = true + bind_port = 6281 + user = swift + log_facility = LOG_LOCAL5 + recon_cache_path = /var/cache/swift4 + + [pipeline:main] + pipeline = recon container-server + + [app:container-server] + use = egg:swift#container + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [container-replicator] + rsync_module = {replication_ip}::container{replication_port} + +#. 
Add configuration files for account-server, in ``/etc/swift/account-server/`` + + * 5.conf:: + + [DEFAULT] + devices = /srv/1/node + mount_check = false + disable_fallocate = true + bind_port = 6252 + user = swift + log_facility = LOG_LOCAL2 + recon_cache_path = /var/cache/swift + + [pipeline:main] + pipeline = recon account-server + + [app:account-server] + use = egg:swift#account + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [account-replicator] + rsync_module = {replication_ip}::account{replication_port} + + * 6.conf:: + + [DEFAULT] + devices = /srv/2/node + mount_check = false + disable_fallocate = true + bind_port = 6262 + user = swift + log_facility = LOG_LOCAL3 + recon_cache_path = /var/cache/swift2 + + [pipeline:main] + pipeline = recon account-server + + [app:account-server] + use = egg:swift#account + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [account-replicator] + rsync_module = {replication_ip}::account{replication_port} + + * 7.conf:: + + [DEFAULT] + devices = /srv/3/node + mount_check = false + disable_fallocate = true + bind_port = 6272 + user = swift + log_facility = LOG_LOCAL4 + recon_cache_path = /var/cache/swift3 + + [pipeline:main] + pipeline = recon account-server + + [app:account-server] + use = egg:swift#account + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [account-replicator] + rsync_module = {replication_ip}::account{replication_port} + + * 8.conf:: + + [DEFAULT] + devices = /srv/4/node + mount_check = false + disable_fallocate = true + bind_port = 6282 + user = swift + log_facility = LOG_LOCAL5 + recon_cache_path = /var/cache/swift4 + + [pipeline:main] + pipeline = recon account-server + + [app:account-server] + use = egg:swift#account + replication_server = True + + [filter:recon] + use = egg:swift#recon + + [account-replicator] + rsync_module = {replication_ip}::account{replication_port} + + +--------------------------------- +For a Multiple Server replication +--------------------------------- + +#. Move configuration file. + + * Configuration file for object-server from /etc/swift/object-server.conf to /etc/swift/object-server/1.conf + + * Configuration file for container-server from /etc/swift/container-server.conf to /etc/swift/container-server/1.conf + + * Configuration file for account-server from /etc/swift/account-server.conf to /etc/swift/account-server/1.conf + +#. Add changes in configuration files in directories: + + * /etc/swift/object-server(files: 1.conf) + * /etc/swift/container-server(files: 1.conf) + * /etc/swift/account-server(files: 1.conf) + + delete all configuration options in section [<*>-replicator] + +#. Add configuration files for object-server, in /etc/swift/object-server/2.conf:: + + [DEFAULT] + bind_ip = $STORAGE_LOCAL_NET_IP + workers = 2 + + [pipeline:main] + pipeline = object-server + + [app:object-server] + use = egg:swift#object + replication_server = True + + [object-replicator] + +#. Add configuration files for container-server, in /etc/swift/container-server/2.conf:: + + [DEFAULT] + bind_ip = $STORAGE_LOCAL_NET_IP + workers = 2 + + [pipeline:main] + pipeline = container-server + + [app:container-server] + use = egg:swift#container + replication_server = True + + [container-replicator] + +#. 
Add configuration files for account-server, in /etc/swift/account-server/2.conf:: + + [DEFAULT] + bind_ip = $STORAGE_LOCAL_NET_IP + workers = 2 + + [pipeline:main] + pipeline = account-server + + [app:account-server] + use = egg:swift#account + replication_server = True + + [account-replicator] + diff --git a/doc/source/ring.rst b/doc/source/ring.rst index d8f5a611f4..196a0b88a3 100644 --- a/doc/source/ring.rst +++ b/doc/source/ring.rst @@ -4,6 +4,16 @@ Partitioned Consistent Hash Ring ******************************** +.. _ring-io: + +Ring IO +======= + +.. automodule:: swift.common.ring.io + :members: + :undoc-members: + :show-inheritance: + .. _ring: Ring @@ -23,3 +33,13 @@ Ring Builder :members: :undoc-members: :show-inheritance: + +.. _composite_builder: + +Composite Ring Builder +====================== + +.. automodule:: swift.common.ring.composite_builder + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/ring_background.rst b/doc/source/ring_background.rst new file mode 100644 index 0000000000..cc6ade8b85 --- /dev/null +++ b/doc/source/ring_background.rst @@ -0,0 +1,966 @@ +================================== +Building a Consistent Hashing Ring +================================== + +------------------------------------ +Authored by Greg Holt, February 2011 +------------------------------------ + +This is a compilation of five posts I made earlier discussing how to build +a consistent hashing ring. The posts seemed to be accessed quite frequently, +so I've gathered them all here on one page for easier reading. + +.. note:: + This is an historical document; as such, all code examples are Python 2. + If this makes you squirm, think of it as pseudo-code. Regardless of + implementation language, the state of the art in consistent-hashing and + distributed systems more generally has advanced. We hope that this + introduction from first principles will still prove informative, + particularly with regard to how data is distributed within a Swift + cluster. + +Part 1 +====== +"Consistent Hashing" is a term used to describe a process where data is +distributed using a hashing algorithm to determine its location. Using +only the hash of the id of the data you can determine exactly where that +data should be. This mapping of hashes to locations is usually termed a +"ring". + +Probably the simplest hash is just a modulus of the id. For instance, if +all ids are numbers and you have two machines you wish to distribute data +to, you could just put all odd numbered ids on one machine and even numbered +ids on the other. Assuming you have a balanced number of odd and even +numbered ids, and a balanced data size per id, your data would be balanced +between the two machines. + +Since data ids are often textual names and not numbers, like paths for +files or URLs, it makes sense to use a "real" hashing algorithm to convert +the names to numbers first. Using MD5 for instance, the hash of the name +'mom.png' is '4559a12e3e8da7c2186250c2f292e3af' and the hash of 'dad.png' +is '096edcc4107e9e18d6a03a43b3853bea'. Now, using the modulus, we can +place 'mom.jpg' on the odd machine and 'dad.png' on the even one. Another +benefit of using a hashing algorithm like MD5 is that the resulting hashes +have a known even distribution, meaning your ids will be evenly distributed +without worrying about keeping the id values themselves evenly distributed. + +Here is a simple example of this in action: + +.. 
code-block:: python + + from hashlib import md5 + from struct import unpack_from + + NODE_COUNT = 100 + DATA_ID_COUNT = 10000000 + + node_counts = [0] * NODE_COUNT + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + # This just pulls part of the hash out as an integer + hsh = unpack_from('>I', md5(data_id).digest())[0] + node_id = hsh % NODE_COUNT + node_counts[node_id] += 1 + desired_count = DATA_ID_COUNT / NODE_COUNT + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + +:: + + 100000: Desired data ids per node + 100695: Most data ids on one node, 0.69% over + 99073: Least data ids on one node, 0.93% under + +So that's not bad at all; less than a percent over/under for distribution +per node. In the next part of this series we'll examine where modulus +distribution causes problems and how to improve our ring to overcome them. + +Part 2 +====== +In Part 1 of this series, we did a simple test of using the modulus of a +hash to locate data. We saw very good distribution, but that's only part +of the story. Distributed systems not only need to distribute load, but +they often also need to grow as more and more data is placed in it. + +So let's imagine we have a 100 node system up and running using our +previous algorithm, but it's starting to get full so we want to add +another node. When we add that 101st node to our algorithm we notice +that many ids now map to different nodes than they previously did. +We're going to have to shuffle a ton of data around our system to get +it all into place again. + +Let's examine what's happened on a much smaller scale: just 2 nodes +again, node 0 gets even ids and node 1 gets odd ids. So data id 100 +would map to node 0, data id 101 to node 1, data id 102 to node 0, etc. +This is simply node = id % 2. Now we add a third node (node 2) for more +space, so we want node = id % 3. So now data id 100 maps to node id 1, +data id 101 to node 2, and data id 102 to node 0. So we have to move +data for 2 of our 3 ids so they can be found again. + +Let's examine this at a larger scale: + +.. code-block:: python + + from hashlib import md5 + from struct import unpack_from + + NODE_COUNT = 100 + NEW_NODE_COUNT = 101 + DATA_ID_COUNT = 10000000 + + moved_ids = 0 + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + hsh = unpack_from('>I', md5(str(data_id)).digest())[0] + node_id = hsh % NODE_COUNT + new_node_id = hsh % NEW_NODE_COUNT + if node_id != new_node_id: + moved_ids += 1 + percent_moved = 100.0 * moved_ids / DATA_ID_COUNT + print '%d ids moved, %.02f%%' % (moved_ids, percent_moved) + +:: + + 9900989 ids moved, 99.01% + +Wow, that's severe. We'd have to shuffle around 99% of our data just +to increase our capacity 1%! We need a new algorithm that combats this +behavior. + +This is where the "ring" really comes in. We can assign ranges of hashes +directly to nodes and then use an algorithm that minimizes the changes +to those ranges. Back to our small scale, let's say our ids range from 0 +to 999. We have two nodes and we'll assign data ids 0–499 to node 0 and +500–999 to node 1. 
Later, when we add node 2, we can take half the data +ids from node 0 and half from node 1, minimizing the amount of data that +needs to move. + +Let's examine this at a larger scale: + +.. code-block:: python + + from bisect import bisect_left + from hashlib import md5 + from struct import unpack_from + + NODE_COUNT = 100 + NEW_NODE_COUNT = 101 + DATA_ID_COUNT = 10000000 + + node_range_starts = [] + for node_id in range(NODE_COUNT): + node_range_starts.append(DATA_ID_COUNT / + NODE_COUNT * node_id) + new_node_range_starts = [] + for new_node_id in range(NEW_NODE_COUNT): + new_node_range_starts.append(DATA_ID_COUNT / + NEW_NODE_COUNT * new_node_id) + moved_ids = 0 + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + hsh = unpack_from('>I', md5(str(data_id)).digest())[0] + node_id = bisect_left(node_range_starts, + hsh % DATA_ID_COUNT) % NODE_COUNT + new_node_id = bisect_left(new_node_range_starts, + hsh % DATA_ID_COUNT) % NEW_NODE_COUNT + if node_id != new_node_id: + moved_ids += 1 + percent_moved = 100.0 * moved_ids / DATA_ID_COUNT + print '%d ids moved, %.02f%%' % (moved_ids, percent_moved) + +:: + + 4901707 ids moved, 49.02% + +Okay, that is better. But still, moving 50% of our data to add 1% capacity +is not very good. If we examine what happened more closely we'll see what +is an "accordion effect". We shrunk node 0's range a bit to give to the +new node, but that shifted all the other node's ranges by the same amount. + +We can minimize the change to a node's assigned range by assigning several +smaller ranges instead of the single broad range we were before. This can +be done by creating "virtual nodes" for each node. So 100 nodes might have +1000 virtual nodes. Let's examine how that might work. + +.. code-block:: python + + from bisect import bisect_left + from hashlib import md5 + from struct import unpack_from + + NODE_COUNT = 100 + DATA_ID_COUNT = 10000000 + VNODE_COUNT = 1000 + + vnode_range_starts = [] + vnode2node = [] + for vnode_id in range(VNODE_COUNT): + vnode_range_starts.append(DATA_ID_COUNT / + VNODE_COUNT * vnode_id) + vnode2node.append(vnode_id % NODE_COUNT) + new_vnode2node = list(vnode2node) + new_node_id = NODE_COUNT + NEW_NODE_COUNT = NODE_COUNT + 1 + vnodes_to_reassign = VNODE_COUNT / NEW_NODE_COUNT + while vnodes_to_reassign > 0: + for node_to_take_from in range(NODE_COUNT): + for vnode_id, node_id in enumerate(new_vnode2node): + if node_id == node_to_take_from: + new_vnode2node[vnode_id] = new_node_id + vnodes_to_reassign -= 1 + break + if vnodes_to_reassign <= 0: + break + moved_ids = 0 + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + hsh = unpack_from('>I', md5(str(data_id)).digest())[0] + vnode_id = bisect_left(vnode_range_starts, + hsh % DATA_ID_COUNT) % VNODE_COUNT + node_id = vnode2node[vnode_id] + new_node_id = new_vnode2node[vnode_id] + if node_id != new_node_id: + moved_ids += 1 + percent_moved = 100.0 * moved_ids / DATA_ID_COUNT + print '%d ids moved, %.02f%%' % (moved_ids, percent_moved) + +:: + + 90423 ids moved, 0.90% + +There we go, we added 1% capacity and only moved 0.9% of existing data. +The vnode_range_starts list seems a bit out of place though. Its values +are calculated and never change for the lifetime of the cluster, so let's +optimize that out. + +.. 
code-block:: python + + from bisect import bisect_left + from hashlib import md5 + from struct import unpack_from + + NODE_COUNT = 100 + DATA_ID_COUNT = 10000000 + VNODE_COUNT = 1000 + + vnode2node = [] + for vnode_id in range(VNODE_COUNT): + vnode2node.append(vnode_id % NODE_COUNT) + new_vnode2node = list(vnode2node) + new_node_id = NODE_COUNT + vnodes_to_reassign = VNODE_COUNT / (NODE_COUNT + 1) + while vnodes_to_reassign > 0: + for node_to_take_from in range(NODE_COUNT): + for vnode_id, node_id in enumerate(vnode2node): + if node_id == node_to_take_from: + vnode2node[vnode_id] = new_node_id + vnodes_to_reassign -= 1 + break + if vnodes_to_reassign <= 0: + break + moved_ids = 0 + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + hsh = unpack_from('>I', md5(str(data_id)).digest())[0] + vnode_id = hsh % VNODE_COUNT + node_id = vnode2node[vnode_id] + new_node_id = new_vnode2node[vnode_id] + if node_id != new_node_id: + moved_ids += 1 + percent_moved = 100.0 * moved_ids / DATA_ID_COUNT + print '%d ids moved, %.02f%%' % (moved_ids, percent_moved) + +:: + + 89841 ids moved, 0.90% + +There we go. In the next part of this series, will further examine the +algorithm's limitations and how to improve on it. + +Part 3 +====== +In Part 2 of this series, we reached an algorithm that performed well +even when adding new nodes to the cluster. We used 1000 virtual nodes +that could be independently assigned to nodes, allowing us to minimize +the amount of data moved when a node was added. + +The number of virtual nodes puts a cap on how many real nodes you can +have. For example, if you have 1000 virtual nodes and you try to add a +1001st real node, you can't assign a virtual node to it without leaving +another real node with no assignment, leaving you with just 1000 active +real nodes still. + +Unfortunately, the number of virtual nodes created at the beginning can +never change for the life of the cluster without a lot of careful work. +For example, you could double the virtual node count by splitting each +existing virtual node in half and assigning both halves to the same real +node. However, if the real node uses the virtual node's id to optimally +store the data (for example, all data might be stored in /[virtual node +id]/[data id]) it would have to move data around locally to reflect the +change. And it would have to resolve data using both the new and old +locations while the moves were taking place, making atomic operations +difficult or impossible. + +Let's continue with this assumption that changing the virtual node +count is more work than it's worth, but keep in mind that some applications +might be fine with this. + +The easiest way to deal with this limitation is to make the limit high +enough that it won't matter. For instance, if we decide our cluster will +never exceed 60,000 real nodes, we can just make 60,000 virtual nodes. + +Also, we should include in our calculations the relative size of our +nodes. For instance, a year from now we might have real nodes that can +handle twice the capacity of our current nodes. So we'd want to assign +twice the virtual nodes to those future nodes, so maybe we should raise +our virtual node estimate to 120,000. + +A good rule to follow might be to calculate 100 virtual nodes to each +real node at maximum capacity. This would allow you to alter the load +on any given node by 1%, even at max capacity, which is pretty fine +tuning. So now we're at 6,000,000 virtual nodes for a max capacity cluster +of 60,000 real nodes. 
+ +6 million virtual nodes seems like a lot, and it might seem like we'd +use up way too much memory. But the only structure this affects is the +virtual node to real node mapping. The base amount of memory required +would be 6 million times 2 bytes (to store a real node id from 0 to +65,535). 12 megabytes of memory just isn't that much to use these days. + +Even with all the overhead of flexible data types, things aren't that +bad. I changed the code from the previous part in this series to have +60,000 real and 6,000,000 virtual nodes, changed the list to an array('H'), +and python topped out at 27m of resident memory – and that includes two +rings. + +To change terminology a bit, we're going to start calling these virtual +nodes "partitions". This will make it a bit easier to discern between the +two types of nodes we've been talking about so far. Also, it makes sense +to talk about partitions as they are really just unchanging sections +of the hash space. + +We're also going to always keep the partition count a power of two. This +makes it easy to just use bit manipulation on the hash to determine the +partition rather than modulus. It isn't much faster, but it is a little. +So, here's our updated ring code, using 8,388,608 (2 ** 23) partitions +and 65,536 nodes. We've upped the sample data id set and checked the +distribution to make sure we haven't broken anything. + +.. code-block:: python + + from array import array + from hashlib import md5 + from struct import unpack_from + + PARTITION_POWER = 23 + PARTITION_SHIFT = 32 - PARTITION_POWER + NODE_COUNT = 65536 + DATA_ID_COUNT = 100000000 + + part2node = array('H') + for part in range(2 ** PARTITION_POWER): + part2node.append(part % NODE_COUNT) + node_counts = [0] * NODE_COUNT + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + part = unpack_from('>I', + md5(str(data_id)).digest())[0] >> PARTITION_SHIFT + node_id = part2node[part] + node_counts[node_id] += 1 + desired_count = DATA_ID_COUNT / NODE_COUNT + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + +:: + + 1525: Desired data ids per node + 1683: Most data ids on one node, 10.36% over + 1360: Least data ids on one node, 10.82% under + +Hmm. +–10% seems a bit high, but I reran with 65,536 partitions and +256 nodes and got +–0.4% so it's just that our sample size (100m) is +too small for our number of partitions (8m). It'll take way too long +to run experiments with an even larger sample size, so let's reduce +back down to these lesser numbers. (To be certain, I reran at the full +version with a 10 billion data id sample set and got +–1%, but it took +6.5 hours to run.) + +In the next part of this series, we'll talk about how to increase the +durability of our data in the cluster. + +Part 4 +====== +In Part 3 of this series, we just further discussed partitions (virtual +nodes) and cleaned up our code a bit based on that. Now, let's talk +about how to increase the durability and availability of our data in the +cluster. + +For many distributed data stores, durability is quite important. Either +RAID arrays or individually distinct copies of data are required. 
While +RAID will increase the durability, it does nothing to increase the +availability – if the RAID machine crashes, the data may be safe but +inaccessible until repairs are done. If we keep distinct copies of the +data on different machines and a machine crashes, the other copies will +still be available while we repair the broken machine. + +An easy way to gain this multiple copy durability/availability is to +just use multiple rings and groups of nodes. For instance, to achieve +the industry standard of three copies, you'd split the nodes into three +groups and each group would have its own ring and each would receive a +copy of each data item. This can work well enough, but has the drawback +that expanding capacity requires adding three nodes at a time and that +losing one node essentially lowers capacity by three times that node's +capacity. + +Instead, let's use a different, but common, approach of meeting our +requirements with a single ring. This can be done by walking the ring +from the starting point and looking for additional distinct nodes. +Here's code that supports a variable number of replicas (set to 3 for +testing): + +.. code-block:: python + + from array import array + from hashlib import md5 + from struct import unpack_from + + REPLICAS = 3 + PARTITION_POWER = 16 + PARTITION_SHIFT = 32 - PARTITION_POWER + PARTITION_MAX = 2 ** PARTITION_POWER - 1 + NODE_COUNT = 256 + DATA_ID_COUNT = 10000000 + + part2node = array('H') + for part in range(2 ** PARTITION_POWER): + part2node.append(part % NODE_COUNT) + node_counts = [0] * NODE_COUNT + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + part = unpack_from('>I', + md5(str(data_id)).digest())[0] >> PARTITION_SHIFT + node_ids = [part2node[part]] + node_counts[node_ids[0]] += 1 + for replica in range(1, REPLICAS): + while part2node[part] in node_ids: + part += 1 + if part > PARTITION_MAX: + part = 0 + node_ids.append(part2node[part]) + node_counts[node_ids[-1]] += 1 + desired_count = DATA_ID_COUNT / NODE_COUNT * REPLICAS + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + +:: + + 117186: Desired data ids per node + 118133: Most data ids on one node, 0.81% over + 116093: Least data ids on one node, 0.93% under + +That's pretty good; less than 1% over/under. While this works well, +there are a couple of problems. + +First, because of how we've initially assigned the partitions to nodes, +all the partitions for a given node have their extra copies on the same +other two nodes. The problem here is that when a machine fails, the load +on these other nodes will jump by that amount. It'd be better if we +initially shuffled the partition assignment to distribute the failover +load better. + +The other problem is a bit harder to explain, but deals with physical +separation of machines. Imagine you can only put 16 machines in a rack +in your datacenter. The 256 nodes we've been using would fill 16 racks. +With our current code, if a rack goes out (power problem, network issue, +etc.) there is a good chance some data will have all three copies in that +rack, becoming inaccessible. 
We can fix this shortcoming by adding the +concept of zones to our nodes, and then ensuring that replicas are stored +in distinct zones. + +.. code-block:: python + + from array import array + from hashlib import md5 + from random import shuffle + from struct import unpack_from + + REPLICAS = 3 + PARTITION_POWER = 16 + PARTITION_SHIFT = 32 - PARTITION_POWER + PARTITION_MAX = 2 ** PARTITION_POWER - 1 + NODE_COUNT = 256 + ZONE_COUNT = 16 + DATA_ID_COUNT = 10000000 + + node2zone = [] + while len(node2zone) < NODE_COUNT: + zone = 0 + while zone < ZONE_COUNT and len(node2zone) < NODE_COUNT: + node2zone.append(zone) + zone += 1 + part2node = array('H') + for part in range(2 ** PARTITION_POWER): + part2node.append(part % NODE_COUNT) + shuffle(part2node) + node_counts = [0] * NODE_COUNT + zone_counts = [0] * ZONE_COUNT + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + part = unpack_from('>I', + md5(str(data_id)).digest())[0] >> PARTITION_SHIFT + node_ids = [part2node[part]] + zones = [node2zone[node_ids[0]]] + node_counts[node_ids[0]] += 1 + zone_counts[zones[0]] += 1 + for replica in range(1, REPLICAS): + while part2node[part] in node_ids and \ + node2zone[part2node[part]] in zones: + part += 1 + if part > PARTITION_MAX: + part = 0 + node_ids.append(part2node[part]) + zones.append(node2zone[node_ids[-1]]) + node_counts[node_ids[-1]] += 1 + zone_counts[zones[-1]] += 1 + desired_count = DATA_ID_COUNT / NODE_COUNT * REPLICAS + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + desired_count = DATA_ID_COUNT / ZONE_COUNT * REPLICAS + print '%d: Desired data ids per zone' % desired_count + max_count = max(zone_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids in one zone, %.02f%% over' % \ + (max_count, over) + min_count = min(zone_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids in one zone, %.02f%% under' % \ + (min_count, under) + +:: + + 117186: Desired data ids per node + 118782: Most data ids on one node, 1.36% over + 115632: Least data ids on one node, 1.33% under + 1875000: Desired data ids per zone + 1878533: Most data ids in one zone, 0.19% over + 1869070: Least data ids in one zone, 0.32% under + +So the shuffle and zone distinctions affected our distribution some, +but still definitely good enough. This test took about 64 seconds to +run on my machine. + +There's a completely alternate, and quite common, way of accomplishing +these same requirements. This alternate method doesn't use partitions +at all, but instead just assigns anchors to the nodes within the hash +space. Finding the first node for a given hash just involves walking +this anchor ring for the next node, and finding additional nodes works +similarly as before. To attain the equivalent of our virtual nodes, +each real node is assigned multiple anchors. + +.. 
code-block:: python + + from bisect import bisect_left + from hashlib import md5 + from struct import unpack_from + + REPLICAS = 3 + NODE_COUNT = 256 + ZONE_COUNT = 16 + DATA_ID_COUNT = 10000000 + VNODE_COUNT = 100 + + node2zone = [] + while len(node2zone) < NODE_COUNT: + zone = 0 + while zone < ZONE_COUNT and len(node2zone) < NODE_COUNT: + node2zone.append(zone) + zone += 1 + hash2index = [] + index2node = [] + for node in range(NODE_COUNT): + for vnode in range(VNODE_COUNT): + hsh = unpack_from('>I', md5(str(node)).digest())[0] + index = bisect_left(hash2index, hsh) + if index > len(hash2index): + index = 0 + hash2index.insert(index, hsh) + index2node.insert(index, node) + node_counts = [0] * NODE_COUNT + zone_counts = [0] * ZONE_COUNT + for data_id in range(DATA_ID_COUNT): + data_id = str(data_id) + hsh = unpack_from('>I', md5(str(data_id)).digest())[0] + index = bisect_left(hash2index, hsh) + if index >= len(hash2index): + index = 0 + node_ids = [index2node[index]] + zones = [node2zone[node_ids[0]]] + node_counts[node_ids[0]] += 1 + zone_counts[zones[0]] += 1 + for replica in range(1, REPLICAS): + while index2node[index] in node_ids and \ + node2zone[index2node[index]] in zones: + index += 1 + if index >= len(hash2index): + index = 0 + node_ids.append(index2node[index]) + zones.append(node2zone[node_ids[-1]]) + node_counts[node_ids[-1]] += 1 + zone_counts[zones[-1]] += 1 + desired_count = DATA_ID_COUNT / NODE_COUNT * REPLICAS + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + desired_count = DATA_ID_COUNT / ZONE_COUNT * REPLICAS + print '%d: Desired data ids per zone' % desired_count + max_count = max(zone_counts) + over = 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids in one zone, %.02f%% over' % \ + (max_count, over) + min_count = min(zone_counts) + under = 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids in one zone, %.02f%% under' % \ + (min_count, under) + +:: + + 117186: Desired data ids per node + 351282: Most data ids on one node, 199.76% over + 15965: Least data ids on one node, 86.38% under + 1875000: Desired data ids per zone + 2248496: Most data ids in one zone, 19.92% over + 1378013: Least data ids in one zone, 26.51% under + +This test took over 15 minutes to run! Unfortunately, this method also +gives much less control over the distribution. To get better distribution, +you have to add more virtual nodes, which eats up more memory and takes +even more time to build the ring and perform distinct node lookups. The +most common operation, data id lookup, can be improved (by predetermining +each virtual node's failover nodes, for instance) but it starts off so +far behind our first approach that we'll just stick with that. + +In the next part of this series, we'll start to wrap all this up into +a useful Python module. + +Part 5 +====== +In Part 4 of this series, we ended up with a multiple copy, distinctly +zoned ring. Or at least the start of it. In this final part we'll package +the code up into a useable Python module and then add one last feature. +First, let's separate the ring itself from the building of the data for +the ring and its testing. + +.. 
code-block:: python + + from array import array + from hashlib import md5 + from random import shuffle + from struct import unpack_from + from time import time + + class Ring(object): + + def __init__(self, nodes, part2node, replicas): + self.nodes = nodes + self.part2node = part2node + self.replicas = replicas + partition_power = 1 + while 2 ** partition_power < len(part2node): + partition_power += 1 + if len(part2node) != 2 ** partition_power: + raise Exception("part2node's length is not an " + "exact power of 2") + self.partition_shift = 32 - partition_power + + def get_nodes(self, data_id): + data_id = str(data_id) + part = unpack_from('>I', + md5(data_id).digest())[0] >> self.partition_shift + node_ids = [self.part2node[part]] + zones = [self.nodes[node_ids[0]]] + for replica in range(1, self.replicas): + while self.part2node[part] in node_ids and \ + self.nodes[self.part2node[part]] in zones: + part += 1 + if part >= len(self.part2node): + part = 0 + node_ids.append(self.part2node[part]) + zones.append(self.nodes[node_ids[-1]]) + return [self.nodes[n] for n in node_ids] + + def build_ring(nodes, partition_power, replicas): + begin = time() + part2node = array('H') + for part in range(2 ** partition_power): + part2node.append(part % len(nodes)) + shuffle(part2node) + ring = Ring(nodes, part2node, replicas) + print '%.02fs to build ring' % (time() - begin) + return ring + + def test_ring(ring): + begin = time() + DATA_ID_COUNT = 10000000 + node_counts = {} + zone_counts = {} + for data_id in range(DATA_ID_COUNT): + for node in ring.get_nodes(data_id): + node_counts[node['id']] = \ + node_counts.get(node['id'], 0) + 1 + zone_counts[node['zone']] = \ + zone_counts.get(node['zone'], 0) + 1 + print '%ds to test ring' % (time() - begin) + desired_count = \ + DATA_ID_COUNT / len(ring.nodes) * REPLICAS + print '%d: Desired data ids per node' % desired_count + max_count = max(node_counts.values()) + over = \ + 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids on one node, %.02f%% over' % \ + (max_count, over) + min_count = min(node_counts.values()) + under = \ + 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids on one node, %.02f%% under' % \ + (min_count, under) + zone_count = \ + len(set(n['zone'] for n in ring.nodes.values())) + desired_count = \ + DATA_ID_COUNT / zone_count * ring.replicas + print '%d: Desired data ids per zone' % desired_count + max_count = max(zone_counts.values()) + over = \ + 100.0 * (max_count - desired_count) / desired_count + print '%d: Most data ids in one zone, %.02f%% over' % \ + (max_count, over) + min_count = min(zone_counts.values()) + under = \ + 100.0 * (desired_count - min_count) / desired_count + print '%d: Least data ids in one zone, %.02f%% under' % \ + (min_count, under) + + if __name__ == '__main__': + PARTITION_POWER = 16 + REPLICAS = 3 + NODE_COUNT = 256 + ZONE_COUNT = 16 + nodes = {} + while len(nodes) < NODE_COUNT: + zone = 0 + while zone < ZONE_COUNT and len(nodes) < NODE_COUNT: + node_id = len(nodes) + nodes[node_id] = {'id': node_id, 'zone': zone} + zone += 1 + ring = build_ring(nodes, PARTITION_POWER, REPLICAS) + test_ring(ring) + +:: + + 0.06s to build ring + 82s to test ring + 117186: Desired data ids per node + 118773: Most data ids on one node, 1.35% over + 115801: Least data ids on one node, 1.18% under + 1875000: Desired data ids per zone + 1878339: Most data ids in one zone, 0.18% over + 1869914: Least data ids in one zone, 0.27% under + +It takes a bit longer to test our ring, 
but that's mostly because of +the switch to dictionaries from arrays for various items. Having node +dictionaries is nice because you can attach any node information you +want directly there (ip addresses, tcp ports, drive paths, etc.). But +we're still on track for further testing; our distribution is still good. + +Now, let's add our one last feature to our ring: the concept of weights. +Weights are useful because the nodes you add later in a ring's life are +likely to have more capacity than those you have at the outset. For this +test, we'll make half our nodes have twice the weight. We'll have to +change build_ring to give more partitions to the nodes with more weight +and we'll change test_ring to take into account these weights. Since +we've changed so much I'll just post the entire module again: + +.. code-block:: python + + from array import array + from hashlib import md5 + from random import shuffle + from struct import unpack_from + from time import time + + class Ring(object): + + def __init__(self, nodes, part2node, replicas): + self.nodes = nodes + self.part2node = part2node + self.replicas = replicas + partition_power = 1 + while 2 ** partition_power < len(part2node): + partition_power += 1 + if len(part2node) != 2 ** partition_power: + raise Exception("part2node's length is not an " + "exact power of 2") + self.partition_shift = 32 - partition_power + + def get_nodes(self, data_id): + data_id = str(data_id) + part = unpack_from('>I', + md5(data_id).digest())[0] >> self.partition_shift + node_ids = [self.part2node[part]] + zones = [self.nodes[node_ids[0]]] + for replica in range(1, self.replicas): + while self.part2node[part] in node_ids and \ + self.nodes[self.part2node[part]] in zones: + part += 1 + if part >= len(self.part2node): + part = 0 + node_ids.append(self.part2node[part]) + zones.append(self.nodes[node_ids[-1]]) + return [self.nodes[n] for n in node_ids] + + def build_ring(nodes, partition_power, replicas): + begin = time() + parts = 2 ** partition_power + total_weight = \ + float(sum(n['weight'] for n in nodes.values())) + for node in nodes.values(): + node['desired_parts'] = \ + parts / total_weight * node['weight'] + part2node = array('H') + for part in range(2 ** partition_power): + for node in nodes.values(): + if node['desired_parts'] >= 1: + node['desired_parts'] -= 1 + part2node.append(node['id']) + break + else: + for node in nodes.values(): + if node['desired_parts'] >= 0: + node['desired_parts'] -= 1 + part2node.append(node['id']) + break + shuffle(part2node) + ring = Ring(nodes, part2node, replicas) + print '%.02fs to build ring' % (time() - begin) + return ring + + def test_ring(ring): + begin = time() + DATA_ID_COUNT = 10000000 + node_counts = {} + zone_counts = {} + for data_id in range(DATA_ID_COUNT): + for node in ring.get_nodes(data_id): + node_counts[node['id']] = \ + node_counts.get(node['id'], 0) + 1 + zone_counts[node['zone']] = \ + zone_counts.get(node['zone'], 0) + 1 + print '%ds to test ring' % (time() - begin) + total_weight = float(sum(n['weight'] for n in + ring.nodes.values())) + max_over = 0 + max_under = 0 + for node in ring.nodes.values(): + desired = DATA_ID_COUNT * REPLICAS * \ + node['weight'] / total_weight + diff = node_counts[node['id']] - desired + if diff > 0: + over = 100.0 * diff / desired + if over > max_over: + max_over = over + else: + under = 100.0 * (-diff) / desired + if under > max_under: + max_under = under + print '%.02f%% max node over' % max_over + print '%.02f%% max node under' % max_under + max_over = 0 + 
max_under = 0 + for zone in set(n['zone'] for n in + ring.nodes.values()): + zone_weight = sum(n['weight'] for n in + ring.nodes.values() if n['zone'] == zone) + desired = DATA_ID_COUNT * REPLICAS * \ + zone_weight / total_weight + diff = zone_counts[zone] - desired + if diff > 0: + over = 100.0 * diff / desired + if over > max_over: + max_over = over + else: + under = 100.0 * (-diff) / desired + if under > max_under: + max_under = under + print '%.02f%% max zone over' % max_over + print '%.02f%% max zone under' % max_under + + if __name__ == '__main__': + PARTITION_POWER = 16 + REPLICAS = 3 + NODE_COUNT = 256 + ZONE_COUNT = 16 + nodes = {} + while len(nodes) < NODE_COUNT: + zone = 0 + while zone < ZONE_COUNT and len(nodes) < NODE_COUNT: + node_id = len(nodes) + nodes[node_id] = {'id': node_id, 'zone': zone, + 'weight': 1.0 + (node_id % 2)} + zone += 1 + ring = build_ring(nodes, PARTITION_POWER, REPLICAS) + test_ring(ring) + +:: + + 0.88s to build ring + 86s to test ring + 1.66% max over + 1.46% max under + 0.28% max zone over + 0.23% max zone under + +So things are still good, even though we have differently weighted nodes. +I ran another test with this code using random weights from 1 to 100 and +got over/under values for nodes of 7.35%/18.12% and zones of 0.24%/0.22%, +still pretty good considering the crazy weight ranges. + +Summary +======= +Hopefully this series has been a good introduction to building a ring. +This code is essentially how the OpenStack Swift ring works, except that +Swift's ring has lots of additional optimizations, such as storing each +replica assignment separately, and lots of extra features for building, +validating, and otherwise working with rings. diff --git a/doc/source/ring_partpower.rst b/doc/source/ring_partpower.rst new file mode 100644 index 0000000000..2e22bec5ed --- /dev/null +++ b/doc/source/ring_partpower.rst @@ -0,0 +1,199 @@ +.. _modify_part_power: + +============================== +Modifying Ring Partition Power +============================== + +The ring partition power determines the on-disk location of data files and is +selected when creating a new ring. In normal operation, it is a fixed value. +This is because a different partition power results in a different on-disk +location for all data files. + +However, increasing the partition power by 1 can be done by choosing locations +that are on the same disk. As a result, we can create hard-links for both the +new and old locations, avoiding data movement without impacting availability. + +To enable a partition power change without interrupting user access, object +servers need to be aware of it in advance. Therefore a partition power change +needs to be done in multiple steps. + +.. note:: + + Do not increase the partition power on account and container rings. + Increasing the partition power is *only* supported for object rings. + Trying to increase the part_power for account and container rings *will* + result in unavailability, maybe even data loss. + + +------- +Caveats +------- + +Before increasing the partition power, consider the possible drawbacks. +There are a few caveats when increasing the partition power: + +* Almost all diskfiles in the cluster need to be relinked then cleaned up, + and all partition directories need to be rehashed. This imposes significant + I/O load on object servers, which may impact client requests. Consider using + cgroups, ``ionice``, or even just the built-in ``--files-per-second`` + rate-limiting to reduce client impact. 
+* Object replicators and reconstructors will skip affected policies during the + partition power increase. Replicators are not aware of hard-links, and would + simply copy the content; this would result in heavy data movement and the + worst case would be that all data is stored twice. +* Due to the fact that each object will now be hard linked from two locations, + many more inodes will be used temporarily - expect around twice the amount. + You need to check the free inode count *before* increasing the partition + power. Even after the increase is complete and extra hardlinks are cleaned + up, expect increased inode usage since there will be twice as many partition + and suffix directories. +* Also, object auditors might read each object twice before cleanup removes the + second hard link. +* Due to the new inodes more memory is needed to cache them, and your + object servers should have plenty of available memory to avoid running out of + inode cache. Setting ``vfs_cache_pressure`` to 1 might help with that. +* All nodes in the cluster *must* run at least Swift version 2.13.0 or later. + +Due to these caveats you should only increase the partition power if really +needed, i.e. if the number of partitions per disk is extremely low and the data +is distributed unevenly across disks. + +----------------------------------- +1. Prepare partition power increase +----------------------------------- + +The swift-ring-builder is used to prepare the ring for an upcoming partition +power increase. It will store a new variable ``next_part_power`` with the current +partition power + 1. Object servers recognize this, and hard links to the new +location will be created (or deleted) on every PUT or DELETE. This will make +it possible to access newly written objects using the future partition power:: + + swift-ring-builder prepare_increase_partition_power + swift-ring-builder write_ring + +Now you need to copy the updated .ring.gz to all nodes. Already existing data +needs to be relinked too; therefore an operator has to run a relinker command +on all object servers in this phase:: + + swift-object-relinker relink + +.. note:: + + Start relinking after *all* the servers re-read the modified ring files, + which normally happens within 15 seconds after writing a modified ring. + Also, make sure the modified rings are pushed to all nodes running object + services (replicators, reconstructors and reconcilers)- they have to skip + the policy during relinking. + +.. note:: + + The relinking command must run as the same user as the daemon processes + (usually swift). It will create files and directories that must be + manipulable by the daemon processes (server, auditor, replicator, ...). + If necessary, the ``--user`` option may be used to drop privileges. + +Relinking might take some time; while there is no data copied or actually +moved, the tool still needs to walk the whole file system and create new hard +links as required. + +--------------------------- +2. Increase partition power +--------------------------- + +Now that all existing data can be found using the new location, it's time to +actually increase the partition power itself:: + + swift-ring-builder increase_partition_power + swift-ring-builder write_ring + +Now you need to copy the updated .ring.gz again to all nodes. Object servers +are now using the new, increased partition power and no longer create +additional hard links. + + +.. 
note:: + + The object servers will create additional hard links for each modified or + new object, and this requires more inodes. + +.. note:: + + If you decide you don't want to increase the partition power, you should + instead cancel the increase. It is not possible to revert this operation + once started. To abort the partition power increase, execute the following + commands, copy the updated .ring.gz files to all nodes and continue with + `3. Cleanup`_ afterwards:: + + swift-ring-builder cancel_increase_partition_power + swift-ring-builder write_ring + + +---------- +3. Cleanup +---------- + +Existing hard links in the old locations need to be removed, and a cleanup tool +is provided to do this. Run the following command on each storage node:: + + swift-object-relinker cleanup + +.. note:: + + The cleanup must be finished within your object servers ``reclaim_age`` + period (which is by default 1 week). Otherwise objects that have been + overwritten between step #1 and step #2 and deleted afterwards can't be + cleaned up anymore. You may want to increase your ``reclaim_age`` before + or during relinking. + +Afterwards it is required to update the rings one last +time to inform servers that all steps to increase the partition power are done, +and replicators should resume their job:: + + swift-ring-builder finish_increase_partition_power + swift-ring-builder write_ring + +Now you need to copy the updated .ring.gz again to all nodes. + +---------- +Background +---------- + +An existing object that is currently located on partition X will be placed +either on partition 2*X or 2*X+1 after the partition power is increased. The +reason for this is the Ring.get_part() method, that does a bitwise shift to the +right. + +To avoid actual data movement to different disks or even nodes, the allocation +of partitions to nodes needs to be changed. The allocation is pairwise due to +the above mentioned new partition scheme. Therefore devices are allocated like +this, with the partition being the index and the value being the device id:: + + old new + part dev part dev + ---- --- ---- --- + 0 0 0 0 + 1 0 + 1 3 2 3 + 3 3 + 2 7 4 7 + 5 7 + 3 5 6 5 + 7 5 + 4 2 8 2 + 9 2 + 5 1 10 1 + 11 1 + +There is a helper method to compute the new path, and the following example +shows the mapping between old and new location:: + + >>> from swift.common.utils import replace_partition_in_path + >>> old='objects/16003/a38/fa0fcec07328d068e24ccbf2a62f2a38/1467658208.57179.data' + >>> replace_partition_in_path('', '/sda/' + old, 14) + 'objects/16003/a38/fa0fcec07328d068e24ccbf2a62f2a38/1467658208.57179.data' + >>> replace_partition_in_path('', '/sda/' + old, 15) + 'objects/32007/a38/fa0fcec07328d068e24ccbf2a62f2a38/1467658208.57179.data' + +Using the original partition power (14) it returned the same path; however +after an increase to 15 it returns the new path, and the new partition is 2*X+1 +in this case. diff --git a/doc/source/s3_compat.rst b/doc/source/s3_compat.rst new file mode 100644 index 0000000000..04010a7c17 --- /dev/null +++ b/doc/source/s3_compat.rst @@ -0,0 +1,145 @@ +S3/Swift REST API Comparison Matrix +=================================== + +General compatibility statement +------------------------------- + +S3 is a product from Amazon, and as such, it includes "features" that +are outside the scope of Swift itself. For example, Swift doesn't +have anything to do with billing, whereas S3 buckets can be tied to +Amazon's billing system. Similarly, log delivery is a service outside +of Swift. 
It's entirely possible for a Swift deployment to provide that +functionality, but it is not part of Swift itself. Likewise, a Swift +deployment can provide similar geographic availability as S3, but this +is tied to the deployer's willingness to build the infrastructure and +support systems to do so. + +Amazon S3 operations +--------------------- + ++------------------------------------------------+------------------+--------------+ +| S3 REST API method | Category | Swift S3 API | ++================================================+==================+==============+ +| `GET Object`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `HEAD Object`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `PUT Object`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `PUT Object Copy`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `DELETE Object`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Initiate Multipart Upload`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Upload Part`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Upload Part Copy`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Complete Multipart Upload`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Abort Multipart Upload`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `List Parts`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `GET Object ACL`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `PUT Object ACL`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `PUT Bucket`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `GET Bucket List Objects`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `HEAD Bucket`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `DELETE Bucket`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `List Multipart Uploads`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `GET Bucket acl`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `PUT Bucket acl`_ | Core-API | Yes | ++------------------------------------------------+------------------+--------------+ +| `Versioning`_ | Versioning | Yes | ++------------------------------------------------+------------------+--------------+ +| `Bucket notification`_ | Notifications | No | ++------------------------------------------------+------------------+--------------+ +| Bucket Lifecycle [1]_ [2]_ [3]_ [4]_ [5]_ [6]_ | Bucket Lifecycle | No | ++------------------------------------------------+------------------+--------------+ +| `Bucket 
policy`_ | Advanced ACLs | No | ++------------------------------------------------+------------------+--------------+ +| Public website [7]_ [8]_ [9]_ [10]_ | Public Website | No | ++------------------------------------------------+------------------+--------------+ +| Billing [11]_ [12]_ | Billing | No | ++------------------------------------------------+------------------+--------------+ +| `GET Bucket location`_ | Advanced Feature | Yes | ++------------------------------------------------+------------------+--------------+ +| `Delete Multiple Objects`_ | Advanced Feature | Yes | ++------------------------------------------------+------------------+--------------+ +| `Object tagging`_ | Advanced Feature | No | ++------------------------------------------------+------------------+--------------+ +| `GET Object torrent`_ | Advanced Feature | No | ++------------------------------------------------+------------------+--------------+ +| `Bucket inventory`_ | Advanced Feature | No | ++------------------------------------------------+------------------+--------------+ +| `GET Bucket service`_ | Advanced Feature | No | ++------------------------------------------------+------------------+--------------+ +| `Bucket accelerate`_ | CDN Integration | No | ++------------------------------------------------+------------------+--------------+ + +---- + +.. _GET Object: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectGET.html +.. _HEAD Object: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectHEAD.html +.. _PUT Object: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectPUT.html +.. _PUT Object Copy: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectCOPY.html +.. _DELETE Object: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectDELETE.html +.. _Initiate Multipart Upload: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadInitiate.html +.. _Upload Part: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadUploadPart.html +.. _Upload Part Copy: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadUploadPartCopy.html +.. _Complete Multipart Upload: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadComplete.html +.. _Abort Multipart Upload: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadAbort.html +.. _List Parts: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadListParts.html +.. _GET Object ACL: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectGETacl.html +.. _PUT Object ACL: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectPUTacl.html +.. _Delete Multiple Objects: http://docs.amazonwebservices.com/AmazonS3/latest/API/multiobjectdeleteapi.html +.. _GET Object torrent: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTObjectGETtorrent.html +.. _Object tagging: http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectGETtagging.html + +.. _PUT Bucket: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketPUT.html +.. _GET Bucket List Objects: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGET.html +.. _HEAD Bucket: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketHEAD.html +.. _DELETE Bucket: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketDELETE.html +.. _List Multipart Uploads: http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadListMPUpload.html +.. _GET Bucket acl: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGETacl.html +.. 
_PUT Bucket acl: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketPUTacl.html +.. _Bucket notification: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGETnotification.html +.. _Bucket policy: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGETpolicy.html +.. _GET Bucket location: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGETlocation.html +.. _Bucket accelerate: http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETaccelerate.html +.. _Bucket inventory: http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETInventoryConfig.html +.. _GET Bucket service: http://docs.aws.amazon.com/AmazonS3/latest/API/RESTServiceGET.html + +.. Versioning +.. _Versioning: http://docs.amazonwebservices.com/AmazonS3/latest/API/RESTBucketGETversioningStatus.html + + +.. Lifecycle +.. [1] `POST restore `_ +.. [2] `Bucket lifecycle `_ +.. [3] `Bucket logging `_ +.. [4] `Bucket analytics `_ +.. [5] `Bucket metrics `_ +.. [6] `Bucket replication `_ + + +.. Public website +.. [7] `OPTIONS object `_ +.. [8] `Object POST from HTML form `_ +.. [9] `Bucket public website `_ +.. [10] `Bucket CORS `_ + + +.. Billing +.. [11] `Request payment `_ +.. [12] `Bucket tagging `_ diff --git a/doc/source/test-cors.html b/doc/source/test-cors.html new file mode 100644 index 0000000000..b8a28a195f --- /dev/null +++ b/doc/source/test-cors.html @@ -0,0 +1,60 @@ + + + + + Test CORS + + + + Token


+ Method
+ URL (Container or Object)
diff --git a/docker/dockerhub_description.md b/docker/dockerhub_description.md
new file mode 100644
index 0000000000..d5a3739f58
--- /dev/null
+++ b/docker/dockerhub_description.md
@@ -0,0 +1,50 @@
+# SAIO (Swift All in One)
+
+SAIO is a containerized instance of OpenStack Swift object storage. It runs
+the main Swift services and is designed to give application developers an
+endpoint to test against, using either the Swift API or the AWS S3 API. It can
+also be used when integrating with a CI/CD system. These images are not
+configured to provide data durability and are not intended for production use.
+
+
+# Quickstart
+
+```
+docker pull openstackswift/saio
+docker run -d -p 8080:8080 openstackswift/saio
+```
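+
+Once the container is up (it builds its rings at startup, so give it a few
+seconds), you can verify the proxy is answering. Below is a minimal sketch
+using only the Python standard library; it assumes the default SAIO pipeline,
+which exposes the `healthcheck` and `/info` endpoints:
+
+```python
+from urllib.request import urlopen
+
+# The healthcheck middleware returns a bare "OK" when the proxy is alive.
+print(urlopen('http://127.0.0.1:8080/healthcheck').read())
+
+# /info lists the cluster's advertised capabilities as JSON.
+print(urlopen('http://127.0.0.1:8080/info').read()[:200])
+```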
+
+### Test against Swift API:
+
+Example using the swift client to target the endpoint:
+```
+swift -A http://127.0.0.1:8080/auth/v1.0 -U test:tester -K testing stat
+```
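+
+The same check can be scripted with `python-swiftclient` (a sketch; assumes
+`pip install python-swiftclient` and the default SAIO credentials shown above):
+
+```python
+from swiftclient.client import Connection
+
+conn = Connection(authurl='http://127.0.0.1:8080/auth/v1.0',
+                  user='test:tester', key='testing')
+
+# head_account() returns the account headers, e.g. container/object counts.
+print(conn.head_account())
+```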
+
+### Test against S3 API:
+
+Example using s3cmd to test the S3 API:
+
+1. Create a config file (e.g. `s3cfg_saio`, matching the `-c` option below):
+```
+[default]
+access_key = test:tester
+secret_key = testing
+host_base = localhost:8080
+host_bucket = localhost:8080
+use_https = False
+```
+
+2. Test with s3cmd:
+```
+s3cmd -c s3cfg_saio mb s3://bucket
+```
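+
+If you would rather script against the S3 API, the same endpoint can be
+exercised with `boto3` (a sketch; assumes `pip install boto3` and the
+credentials from the config above):
+
+```python
+import boto3
+
+s3 = boto3.client('s3',
+                  endpoint_url='http://localhost:8080',
+                  aws_access_key_id='test:tester',
+                  aws_secret_access_key='testing')
+
+s3.create_bucket(Bucket='bucket')    # equivalent of: s3cmd mb s3://bucket
+print(s3.list_buckets()['Buckets'])  # confirm the bucket exists
+```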
+
+# Quick Reference
+
+- **Image tags**: `latest` is automatically built and published by Zuul and
+   follows the master branch. Releases are also tagged in case you want to
+   test against a specific release.
+- **Source Code**: github.com/openstack/swift
+- **Maintained by**: OpenStack Swift community
+- **Feedback/Questions**: #openstack-swift on OFTC
diff --git a/docker/install_scripts/00_swift_needs.sh b/docker/install_scripts/00_swift_needs.sh
new file mode 100755
index 0000000000..b83621d7b0
--- /dev/null
+++ b/docker/install_scripts/00_swift_needs.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+set -e
+
+# adduser -D -H syslog && \
+for user in "swift"; do
+  if ! id -u $user > /dev/null 2>&1 ; then
+    adduser -D $user
+    printf "created user $user\n"
+  fi
+done
+printf "\n"
+# mkdir /srv/node && \
+# mkdir /var/spool/rsyslog && \
+# chown -R swift:swift /srv/node/ && \
+for dirname in "/srv/node" "$HOME/bin" "/opt" "/var/cache/swift" "/var/log/socklog/swift" "/var/log/swift/" "/var/run/swift"; do
+  if [ ! -d $dirname ]; then
+    mkdir -p $dirname
+    printf "created $dirname\n"
+  fi
+done
+# mkdir -p $HOME/bin && \
+# mkdir -p /opt
diff --git a/docker/install_scripts/10_apk_install_prereqs.sh b/docker/install_scripts/10_apk_install_prereqs.sh
new file mode 100755
index 0000000000..f021b4c113
--- /dev/null
+++ b/docker/install_scripts/10_apk_install_prereqs.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+set -e
+
+echo "@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories
+apk add --update \
+  linux-headers \
+  liberasurecode@testing \
+  liberasurecode-dev@testing \
+  gnupg \
+  git \
+  curl \
+  rsync \
+  memcached \
+  openssl \
+  openssl-dev \
+  sqlite \
+  sqlite-libs \
+  sqlite-dev \
+  xfsprogs \
+  zlib-dev \
+  g++ \
+  libffi \
+  libffi-dev \
+  libxslt \
+  libxslt-dev \
+  libxml2 \
+  libxml2-dev
diff --git a/docker/install_scripts/21_apk_install_py3.sh b/docker/install_scripts/21_apk_install_py3.sh
new file mode 100755
index 0000000000..0bc647d513
--- /dev/null
+++ b/docker/install_scripts/21_apk_install_py3.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+set -e
+
+apk add --update \
+  python3 \
+  python3-dev \
+  py3-pip \
+  py3-cffi \
+  py3-cryptography
+
+if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi
+
diff --git a/docker/install_scripts/50_swift_install.sh b/docker/install_scripts/50_swift_install.sh
new file mode 100755
index 0000000000..953d75d541
--- /dev/null
+++ b/docker/install_scripts/50_swift_install.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+set -e
+
+pip install -U pip
+cd /opt/swift
+pip install -r requirements.txt
+pip install -e .
+
+cp doc/saio/bin/* $HOME/bin
+chmod +x $HOME/bin/*
+# The SAIO helper scripts assume bash and sudo; rewrite them to run under the
+# container's plain sh as root.
+sed -i "s/bash/sh/g" $HOME/bin/*
+sed -i "s/sudo //g" $HOME/bin/*
+mkdir /root/tmp
+echo "export PATH=${PATH}:$HOME/bin" >> $HOME/.shrc
+echo "export PYTHON_EGG_CACHE=/root/tmp" >> $HOME/.shrc
+echo "export ENV=$HOME/.shrc" >> $HOME/.profile
+chmod +x $HOME/.shrc
+chmod +x $HOME/.profile
diff --git a/docker/install_scripts/60_pip_uninstall_dev.sh b/docker/install_scripts/60_pip_uninstall_dev.sh
new file mode 100755
index 0000000000..655aba5ccc
--- /dev/null
+++ b/docker/install_scripts/60_pip_uninstall_dev.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+set -e
+
+echo "- - - - - - - - uninstalling simplejson"
+pip uninstall --yes simplejson
+echo "- - - - - - - - uninstalling pyopenssl"
+pip uninstall --yes pyopenssl
+echo "- - - - - - - - deleting python3-dev residue (config-3.6m-x86_64-linux-gnu)"
+rm -rf /opt/usr/local/lib/python3.6/config-3.6m-x86_64-linux-gnu/
diff --git a/docker/install_scripts/99_apk_uninstall_dev.sh b/docker/install_scripts/99_apk_uninstall_dev.sh
new file mode 100755
index 0000000000..c4692ff3b4
--- /dev/null
+++ b/docker/install_scripts/99_apk_uninstall_dev.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+set -e
+
+cd /
+rm -rf /build
+
+apk del gnupg
+apk del git
+apk del openssl-dev
+apk del sqlite-dev
+apk del zlib-dev
+apk del g++
+apk del libffi-dev
+apk del libxslt-dev
+apk del libxml2-dev
+apk del python3-dev
+rm -rf /var/cache/apk/*
diff --git a/docker/install_scripts/python_test_dirs b/docker/install_scripts/python_test_dirs
new file mode 100644
index 0000000000..382d99284e
--- /dev/null
+++ b/docker/install_scripts/python_test_dirs
@@ -0,0 +1,7 @@
+/opt/python/usr/local/lib/python3.6/ctypes/test
+/opt/python/usr/local/lib/python3.6/distutils/tests
+/opt/python/usr/local/lib/python3.6/idlelib/idle_test
+/opt/python/usr/local/lib/python3.6/lib2to3/tests
+/opt/python/usr/local/lib/python3.6/sqlite3/test
+/opt/python/usr/local/lib/python3.6/test
+/opt/python/usr/local/lib/python3.6/tkinter/test
diff --git a/docker/rootfs/etc/cont-init.d/01_swift_logs b/docker/rootfs/etc/cont-init.d/01_swift_logs
new file mode 100644
index 0000000000..fcc3e4c6a8
--- /dev/null
+++ b/docker/rootfs/etc/cont-init.d/01_swift_logs
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+s6-setuidgid swift ln -s /var/log/socklog/swift/swift_all/current /var/log/swift/all.log
+s6-setuidgid swift ln -s /var/log/socklog/swift/proxy_server/current /var/log/swift/proxy_access.log
diff --git a/docker/rootfs/etc/cont-init.d/02_build_remakerings b/docker/rootfs/etc/cont-init.d/02_build_remakerings
new file mode 100644
index 0000000000..e49d4a911a
--- /dev/null
+++ b/docker/rootfs/etc/cont-init.d/02_build_remakerings
@@ -0,0 +1,3 @@
+#!/usr/bin/with-contenv sh
+
+exec s6-setuidgid swift /etc/swift_build/prepare_rings
diff --git a/docker/rootfs/etc/fix-attrs.d/logging b/docker/rootfs/etc/fix-attrs.d/logging
new file mode 100644
index 0000000000..a8de639776
--- /dev/null
+++ b/docker/rootfs/etc/fix-attrs.d/logging
@@ -0,0 +1,2 @@
+/var/log/swift true swift 0755 0755
+/var/spool/rsyslog true syslog 0700 0700
diff --git a/docker/rootfs/etc/fix-attrs.d/srv_node b/docker/rootfs/etc/fix-attrs.d/srv_node
new file mode 100644
index 0000000000..adee7ec5e8
--- /dev/null
+++ b/docker/rootfs/etc/fix-attrs.d/srv_node
@@ -0,0 +1 @@
+/srv/node true swift 0700 0700
diff --git a/docker/rootfs/etc/fix-attrs.d/swift b/docker/rootfs/etc/fix-attrs.d/swift
new file mode 100644
index 0000000000..bf368314aa
--- /dev/null
+++ b/docker/rootfs/etc/fix-attrs.d/swift
@@ -0,0 +1,4 @@
+/etc/swift true swift 0700 0700
+/etc/swift/mime.types true swift 0700 0700
+/var/run/swift true swift 0755 0755
+/var/cache/swift true swift 0755 0755
diff --git a/docker/rootfs/etc/fix-attrs.d/tmp b/docker/rootfs/etc/fix-attrs.d/tmp
new file mode 100644
index 0000000000..8a6d27a5c0
--- /dev/null
+++ b/docker/rootfs/etc/fix-attrs.d/tmp
@@ -0,0 +1 @@
+/tmp true root 0700 0700
diff --git a/docker/rootfs/etc/profile b/docker/rootfs/etc/profile
new file mode 100644
index 0000000000..9ad1060e20
--- /dev/null
+++ b/docker/rootfs/etc/profile
@@ -0,0 +1,11 @@
+export CHARSET=UTF-8
+export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/python/usr/local/bin
+export PAGER=less
+export PS1='\h:\w\$ '
+umask 022
+
+for script in /etc/profile.d/*.sh ; do
+        if [ -r $script ] ; then
+                . $script
+        fi
+done
diff --git a/docker/rootfs/etc/rsyncd.conf b/docker/rootfs/etc/rsyncd.conf
new file mode 100644
index 0000000000..2d4a1fb35d
--- /dev/null
+++ b/docker/rootfs/etc/rsyncd.conf
@@ -0,0 +1,23 @@
+uid = swift
+gid = swift
+log file = /var/log/rsyncd.log
+pid file = /var/run/rsyncd.pid
+address = 127.0.0.1
+
+[account]
+max connections = 2
+path = /srv/node/
+read only = false
+lock file = /var/lock/account.lock
+
+[container]
+max connections = 2
+path = /srv/node/
+read only = false
+lock file = /var/lock/container.lock
+
+[object]
+max connections = 2
+path = /srv/node/
+read only = false
+lock file = /var/lock/object.lock
diff --git a/docker/rootfs/etc/rsyslog.conf b/docker/rootfs/etc/rsyslog.conf
new file mode 100644
index 0000000000..fa42783dab
--- /dev/null
+++ b/docker/rootfs/etc/rsyslog.conf
@@ -0,0 +1,64 @@
+#  /etc/rsyslog.conf	Configuration file for rsyslog.
+#
+#			For more information see
+#			/usr/share/doc/rsyslog-doc/html/rsyslog_conf.html
+#
+#  Default logging rules can be found in /etc/rsyslog.d/50-default.conf
+
+
+#################
+#### MODULES ####
+#################
+
+$ModLoad imuxsock # provides support for local system logging
+#$ModLoad imklog   # provides kernel logging support
+#$ModLoad immark  # provides --MARK-- message capability
+
+# provides UDP syslog reception
+#$ModLoad imudp
+#$UDPServerRun 514
+
+# provides TCP syslog reception
+#$ModLoad imtcp
+#$InputTCPServerRun 514
+
+# Enable non-kernel facility klog messages
+$KLogPermitNonKernelFacility on
+
+###########################
+#### GLOBAL DIRECTIVES ####
+###########################
+
+#
+# Use traditional timestamp format.
+# To enable high precision timestamps, comment out the following line.
+#
+$ActionFileDefaultTemplate RSYSLOG_TraditionalFileFormat
+
+# Filter duplicated messages
+$RepeatedMsgReduction on
+
+# Disable rate-limiting of log entries
+$SystemLogRateLimitInterval 0
+$SystemLogRateLimitBurst 0
+
+#
+# Set the default permissions for all log files.
+#
+$FileOwner syslog
+$FileGroup adm
+$FileCreateMode 0640
+$DirCreateMode 0755
+$Umask 0022
+$PrivDropToUser syslog
+$PrivDropToGroup syslog
+
+#
+# Where to place spool and state files
+#
+$WorkDirectory /var/spool/rsyslog
+
+#
+# Include all config files in /etc/rsyslog.d/
+#
+$IncludeConfig /etc/rsyslog.d/*.conf
diff --git a/docker/rootfs/etc/rsyslog.d/00-swift.conf b/docker/rootfs/etc/rsyslog.d/00-swift.conf
new file mode 100644
index 0000000000..00adb57258
--- /dev/null
+++ b/docker/rootfs/etc/rsyslog.d/00-swift.conf
@@ -0,0 +1,16 @@
+# NOTE: we used to enable UDP logging here, but we switched
+# back to just unix domain socket.
+
+#$imjournalRatelimitInterval 60
+#$imjournalRatelimitBurst 600000
+
+# *.*                         @127.0.0.1:514
+
+# Log all Swift proxy-server access log lines (local2) to
+# /var/log/swift/proxy_access.log
+local2.* /var/log/swift/proxy_access.log;RSYSLOG_FileFormat
+
+# Log all Swift lines to /var/log/swift/all.log
+# AND PREVENT FURTHER LOGGING OF THEM (eg. to /var/log/syslog)
+local0.*;local2.* /var/log/swift/all.log;RSYSLOG_TraditionalFileFormat
+& ~
diff --git a/docker/rootfs/etc/rsyslog.d/50-default.conf b/docker/rootfs/etc/rsyslog.d/50-default.conf
new file mode 100644
index 0000000000..d8283d485d
--- /dev/null
+++ b/docker/rootfs/etc/rsyslog.d/50-default.conf
@@ -0,0 +1,68 @@
+#  Default rules for rsyslog.
+#
+#			For more information see rsyslog.conf(5) and /etc/rsyslog.conf
+
+#
+# First some standard log files.  Log by facility.
+#
+auth,authpriv.*			/var/log/auth.log
+*.*;auth,authpriv.none		-/var/log/syslog
+#cron.*				/var/log/cron.log
+#daemon.*			-/var/log/daemon.log
+#kern.*				-/var/log/kern.log
+#lpr.*				-/var/log/lpr.log
+#mail.*				-/var/log/mail.log
+#user.*				-/var/log/user.log
+
+#
+# Logging for the mail system.  Split it up so that
+# it is easy to write scripts to parse these files.
+#
+#mail.info			-/var/log/mail.info
+#mail.warn			-/var/log/mail.warn
+mail.err			/var/log/mail.err
+
+#
+# Logging for INN news system.
+#
+news.crit			/var/log/news/news.crit
+news.err			/var/log/news/news.err
+news.notice			-/var/log/news/news.notice
+
+#
+# Some "catch-all" log files.
+#
+#*.=debug;\
+#	auth,authpriv.none;\
+#	news.none;mail.none	-/var/log/debug
+#*.=info;*.=notice;*.=warn;\
+#	auth,authpriv.none;\
+#	cron,daemon.none;\
+#	mail,news.none		-/var/log/messages
+
+#
+# Emergencies are sent to everybody logged in.
+#
+*.emerg                                :omusrmsg:*
+
+#
+# I like to have messages displayed on the console, but only on a virtual
+# console I usually leave idle.
+#
+#daemon,mail.*;\
+#	news.=crit;news.=err;news.=notice;\
+#	*.=debug;*.=info;\
+#	*.=notice;*.=warn	/dev/tty8
+
+# The named pipe /dev/xconsole is for the `xconsole' utility.  To use it,
+# you must invoke `xconsole' with the `-file' option:
+#
+#    $ xconsole -file /dev/xconsole [...]
+#
+# NOTE: adjust the list below, or you'll go crazy if you have a reasonably
+#      busy site..
+#
+daemon.*;mail.*;\
+	news.err;\
+	*.=debug;*.=info;\
+	*.=notice;*.=warn	|/dev/xconsole
diff --git a/docker/rootfs/etc/services.d/memcached/run b/docker/rootfs/etc/services.d/memcached/run
new file mode 100644
index 0000000000..a6d5f994d6
--- /dev/null
+++ b/docker/rootfs/etc/services.d/memcached/run
@@ -0,0 +1,3 @@
+#!/usr/bin/execlineb -P
+
+memcached -u root -l 127.0.0.1
diff --git a/docker/rootfs/etc/services.d/swift-account/run b/docker/rootfs/etc/services.d/swift-account/run
new file mode 100644
index 0000000000..f310983146
--- /dev/null
+++ b/docker/rootfs/etc/services.d/swift-account/run
@@ -0,0 +1,5 @@
+#!/bin/sh
+source /etc/profile
+
+# swift-account-server /etc/swift/account-server.conf
+exec s6-setuidgid swift swift-init account restart --no-daemon
diff --git a/docker/rootfs/etc/services.d/swift-container/run b/docker/rootfs/etc/services.d/swift-container/run
new file mode 100644
index 0000000000..0ed34dad05
--- /dev/null
+++ b/docker/rootfs/etc/services.d/swift-container/run
@@ -0,0 +1,5 @@
+#!/bin/sh
+source /etc/profile
+
+# swift-container-server /etc/swift/container-server.conf
+exec s6-setuidgid swift swift-init container restart --no-daemon
diff --git a/docker/rootfs/etc/services.d/swift-object/run b/docker/rootfs/etc/services.d/swift-object/run
new file mode 100644
index 0000000000..80f8c8d618
--- /dev/null
+++ b/docker/rootfs/etc/services.d/swift-object/run
@@ -0,0 +1,4 @@
+#!/bin/sh
+source /etc/profile
+
+exec s6-setuidgid swift swift-init object restart --no-daemon
diff --git a/docker/rootfs/etc/services.d/swift-proxy/run b/docker/rootfs/etc/services.d/swift-proxy/run
new file mode 100644
index 0000000000..be73b5398d
--- /dev/null
+++ b/docker/rootfs/etc/services.d/swift-proxy/run
@@ -0,0 +1,5 @@
+#!/bin/sh
+source /etc/profile
+
+# swift-proxy-server /etc/swift/proxy-server.conf
+exec s6-setuidgid swift swift-init proxy restart --no-daemon
diff --git a/docker/rootfs/etc/socklog.rules/swift_account_server b/docker/rootfs/etc/socklog.rules/swift_account_server
new file mode 100644
index 0000000000..7a8ddb8ea0
--- /dev/null
+++ b/docker/rootfs/etc/socklog.rules/swift_account_server
@@ -0,0 +1,3 @@
+-
++\local5.*
+/var/log/socklog/swift/account_server
diff --git a/docker/rootfs/etc/socklog.rules/swift_all b/docker/rootfs/etc/socklog.rules/swift_all
new file mode 100644
index 0000000000..6ce9c899dc
--- /dev/null
+++ b/docker/rootfs/etc/socklog.rules/swift_all
@@ -0,0 +1,7 @@
+-
++\local5.*
++\local4.*
++\local3.*
++\local2.*
++\local0.*
+/var/log/socklog/swift/swift_all
diff --git a/docker/rootfs/etc/socklog.rules/swift_container_server b/docker/rootfs/etc/socklog.rules/swift_container_server
new file mode 100644
index 0000000000..428119e964
--- /dev/null
+++ b/docker/rootfs/etc/socklog.rules/swift_container_server
@@ -0,0 +1,3 @@
+-
++\local4.*
+/var/log/socklog/swift/container_server
diff --git a/docker/rootfs/etc/socklog.rules/swift_object_server b/docker/rootfs/etc/socklog.rules/swift_object_server
new file mode 100644
index 0000000000..e2e92fe9d2
--- /dev/null
+++ b/docker/rootfs/etc/socklog.rules/swift_object_server
@@ -0,0 +1,3 @@
+-
++\local3.*
+/var/log/socklog/swift/object_server
diff --git a/docker/rootfs/etc/socklog.rules/swift_proxy_server b/docker/rootfs/etc/socklog.rules/swift_proxy_server
new file mode 100644
index 0000000000..b743082168
--- /dev/null
+++ b/docker/rootfs/etc/socklog.rules/swift_proxy_server
@@ -0,0 +1,3 @@
+-
++\local2.*
+/var/log/socklog/swift/proxy_server
diff --git a/docker/rootfs/etc/swift/account-server.conf b/docker/rootfs/etc/swift/account-server.conf
new file mode 100644
index 0000000000..fd61d550ca
--- /dev/null
+++ b/docker/rootfs/etc/swift/account-server.conf
@@ -0,0 +1,25 @@
+[DEFAULT]
+devices = /srv/node/
+bind_ip = 127.0.0.1
+bind_port = 6202
+workers = 2
+mount_check = false
+log_facility = LOG_LOCAL5
+
+[pipeline:main]
+pipeline = healthcheck recon account-server
+
+[app:account-server]
+use = egg:swift#account
+
+[filter:recon]
+use = egg:swift#recon
+
+[filter:healthcheck]
+use = egg:swift#healthcheck
+
+[account-replicator]
+
+[account-auditor]
+
+[account-reaper]
diff --git a/docker/rootfs/etc/swift/container-server.conf b/docker/rootfs/etc/swift/container-server.conf
new file mode 100644
index 0000000000..aec046aafa
--- /dev/null
+++ b/docker/rootfs/etc/swift/container-server.conf
@@ -0,0 +1,27 @@
+[DEFAULT]
+devices = /srv/node/
+bind_ip = 127.0.0.1
+bind_port = 6201
+workers = 2
+mount_check = false
+log_facility = LOG_LOCAL4
+
+[pipeline:main]
+pipeline = healthcheck recon container-server
+
+[app:container-server]
+use = egg:swift#container
+
+[filter:recon]
+use = egg:swift#recon
+
+[filter:healthcheck]
+use = egg:swift#healthcheck
+
+[container-replicator]
+
+[container-updater]
+
+[container-auditor]
+
+[container-sync]
diff --git a/docker/rootfs/etc/swift/object-server.conf b/docker/rootfs/etc/swift/object-server.conf
new file mode 100644
index 0000000000..67bbf2cb37
--- /dev/null
+++ b/docker/rootfs/etc/swift/object-server.conf
@@ -0,0 +1,26 @@
+[DEFAULT]
+devices = /srv/node/
+bind_ip = 127.0.0.1
+bind_port = 6200
+workers = 2
+mount_check = false
+log_facility = LOG_LOCAL3
+
+[pipeline:main]
+pipeline = healthcheck recon object-server
+
+[app:object-server]
+use = egg:swift#object
+
+[filter:recon]
+use = egg:swift#recon
+
+[filter:healthcheck]
+use = egg:swift#healthcheck
+
+
+[object-replicator]
+
+[object-updater]
+
+[object-auditor]
diff --git a/docker/rootfs/etc/swift/proxy-server.conf b/docker/rootfs/etc/swift/proxy-server.conf
new file mode 100644
index 0000000000..8189cb7f2d
--- /dev/null
+++ b/docker/rootfs/etc/swift/proxy-server.conf
@@ -0,0 +1,100 @@
+[DEFAULT]
+bind_ip = 0.0.0.0
+bind_port = 8080
+log_address = /dev/log
+log_facility = LOG_LOCAL2
+log_headers = false
+log_level = DEBUG
+log_name = proxy-server
+user = swift
+
+[pipeline:main]
+pipeline = catch_errors gatekeeper healthcheck proxy-logging cache etag-quoter listing_formats bulk tempurl ratelimit s3api tempauth staticweb copy container-quotas account-quotas slo dlo versioned_writes symlink proxy-logging proxy-server
+
+[filter:catch_errors]
+use = egg:swift#catch_errors
+
+[filter:healthcheck]
+use = egg:swift#healthcheck
+
+[filter:proxy-logging]
+use = egg:swift#proxy_logging
+
+[filter:bulk]
+use = egg:swift#bulk
+
+[filter:ratelimit]
+use = egg:swift#ratelimit
+
+[filter:crossdomain]
+use = egg:swift#crossdomain
+
+[filter:dlo]
+use = egg:swift#dlo
+
+[filter:slo]
+use = egg:swift#slo
+
+[filter:tempurl]
+use = egg:swift#tempurl
+
+[filter:tempauth]
+use = egg:swift#tempauth
+user_admin_admin = admin .admin .reseller_admin
+user_test_tester = testing .admin
+user_test_tester2 = testing2 .admin
+user_test_tester3 = testing3
+user_test2_tester2 = testing2 .admin
+
+[filter:staticweb]
+use = egg:swift#staticweb
+
+[filter:account-quotas]
+use = egg:swift#account_quotas
+
+[filter:container-quotas]
+use = egg:swift#container_quotas
+
+[filter:cache]
+use = egg:swift#memcache
+
+[filter:etag-quoter]
+use = egg:swift#etag_quoter
+enable_by_default = false
+
+[filter:gatekeeper]
+use = egg:swift#gatekeeper
+
+[filter:versioned_writes]
+use = egg:swift#versioned_writes
+allow_versioned_writes = true
+allow_object_versioning = true
+
+[filter:copy]
+use = egg:swift#copy
+
+[filter:listing_formats]
+use = egg:swift#listing_formats
+
+[filter:symlink]
+use = egg:swift#symlink
+
+# To enable, add the s3api middleware to the pipeline before tempauth
+[filter:s3api]
+use = egg:swift#s3api
+cors_preflight_allow_origin = *
+
+# Example to create root secret: `openssl rand -base64 32`
+[filter:keymaster]
+use = egg:swift#keymaster
+encryption_root_secret = changeme/changeme/changeme/changeme/change/=
+
+# To enable use of encryption add both middlewares to pipeline, example:
+#  keymaster encryption proxy-logging proxy-server
+[filter:encryption]
+use = egg:swift#encryption
+
+[app:proxy-server]
+use = egg:swift#proxy
+allow_account_management = true
+account_autocreate = true
diff --git a/docker/rootfs/etc/swift/swift.conf b/docker/rootfs/etc/swift/swift.conf
new file mode 100644
index 0000000000..129a5d98a2
--- /dev/null
+++ b/docker/rootfs/etc/swift/swift.conf
@@ -0,0 +1,17 @@
+[swift-hash]
+# random unique strings that can never change (DO NOT LOSE)
+swift_hash_path_prefix = bd08f643f5663c4ec607
+swift_hash_path_suffix = f423bf7ab663888fe832
+
+[storage-policy:0]
+name = 1replica
+default = true
+policy_type = replication
+
+# [storage-policy:1]
+# name = EC42
+# policy_type = erasure_coding
+# ec_type = liberasurecode_rs_vand
+# ec_num_data_fragments = 4
+# ec_num_parity_fragments = 2
+# ec_object_segment_size = 1048576
diff --git a/docker/rootfs/etc/swift_build/build_devices b/docker/rootfs/etc/swift_build/build_devices
new file mode 100755
index 0000000000..7588377dd1
--- /dev/null
+++ b/docker/rootfs/etc/swift_build/build_devices
@@ -0,0 +1,62 @@
+#!/usr/bin/with-contenv sh
+
+cd /etc/swift
+DEV_SIZE="1GB"
+# POLICIES="object container account"
+MY_STORAGE_TYPE=${STORAGE_TYPE:-"internal_dirs"}
+MY_DEVICE_COUNT=${DEVICE_COUNT:-6}
+
+echo "[[ checking --privileged ]]"
+ip link add dummy0 type dummy >/dev/null
+if [[ $? -eq 0 ]]; then
+  PRIVILEGED=true
+  # clean the dummy0 link
+  ip link delete dummy0 >/dev/null
+else
+  PRIVILEGED=false
+fi
+
+echo "storage type is $MY_STORAGE_TYPE. container is privileged? $PRIVILEGED"
+
+echo "[[ checking what to use as storage devices ]]"
+DEVICE_LIST=""
+if [[ $MY_STORAGE_TYPE == "external_devices" ]]; then
+  DEVICE_LIST=$(ls /dev/ | grep -i "swift-d")
+  MY_DEVICE_COUNT=$(echo $DEVICE_LIST | wc -w)
+  echo "  using external devices. devices found: $DEVICE_LIST"
+elif [[ $MY_DEVICE_COUNT -le 0 ]]; then
+  echo "Device count must be greater than 0"
+  exit 1
+else
+  for i in $(seq 0 $(( MY_DEVICE_COUNT-1 ))); do
+    DEVICE_LIST="$DEVICE_LIST swift-d$i"
+  done
+  # echo "  using internal devices. devices to create: $DEVICE_LIST"
+fi
+
+if [[ $MY_STORAGE_TYPE == "internal_devices" ]]; then
+  for device in $DEVICE_LIST; do
+    truncate -s $DEV_SIZE /dev/$device;
+    echo "    created storage device /dev/$device of $DEV_SIZE";
+  done
+fi
+
+export PATH=$PATH:/opt/python/usr/local/bin/
+
+echo "[[ creating directories ]]"
+for dir in $DEVICE_LIST; do
+  mkdir -p /srv/node/$dir;
+  echo "  created /srv/node/$dir";
+done
+
+if [[ $MY_STORAGE_TYPE == "internal_devices" ]] || [[ $MY_STORAGE_TYPE == "external_devices" ]]; then
+  echo "[[ formatting and mounting storage devices ]]"
+  for device in $DEVICE_LIST; do
+    # truncate -s $DEV_SIZE /dev/swift-d$i;
+    # echo "created storage device /dev/swift-d$i of $DEV_SIZE";
+    mkfs.xfs -f -L "$device" -i size=512 /dev/$device;
+    echo "  created XFS file system on device /dev/$device";
+    mount -t xfs -o noatime /dev/$device /srv/node/$device;
+    echo "  mounted /dev/$device as /srv/node/$device";
+  done
+fi
diff --git a/docker/rootfs/etc/swift_build/build_remakerings b/docker/rootfs/etc/swift_build/build_remakerings
new file mode 100755
index 0000000000..08830d6491
--- /dev/null
+++ b/docker/rootfs/etc/swift_build/build_remakerings
@@ -0,0 +1,36 @@
+#!/usr/bin/with-contenv sh
+
+POLICIES="object container account"
+
+for p in $POLICIES; do
+  echo "swift-ring-builder $p.builder create 10 1 1" > /etc/swift/remakerings.$p;
+  echo "started /etc/swift/remakerings.$p with 'swift-ring-builder create'"
+done
+
+
+for drive in `ls /srv/node/ | grep 'swift-d'`; do
+  echo "swift-ring-builder object.builder add r1z1-127.0.0.1:6200/$drive 1" >> /etc/swift/remakerings.object
+  echo "pushed command to add r1z1-127.0.0.1:6200/$drive to /etc/swift/remakerings.object"
+  echo "swift-ring-builder container.builder add r1z1-127.0.0.1:6201/$drive 1" >> /etc/swift/remakerings.container
+  echo "pushed command to add r1z1-127.0.0.1:6201/$drive to /etc/swift/remakerings.container"
+  echo "swift-ring-builder account.builder add r1z1-127.0.0.1:6202/$drive 1" >> /etc/swift/remakerings.account
+  echo "pushed command to add r1z1-127.0.0.1:6202/$drive to /etc/swift/remakerings.account"
+done
+
+for p in $POLICIES; do
+  echo "swift-ring-builder $p.builder rebalance" >> /etc/swift/remakerings.$p;
+  echo "pushed command to rebalance ring into /etc/swift/remakerings.$p"
+done
+
+echo "rm -f *.builder *.ring.gz backups/*.builder backups/*.ring.gz" > /etc/swift/remakerings
+echo "created umbrella /etc/swift/remakerings, starting by deleting all ring files"
+
+for p in $POLICIES; do
+  cat /etc/swift/remakerings.$p >> /etc/swift/remakerings;
+  echo "pushed /etc/swift/remakerings.$p to /etc/swift/remakerings"
+  rm -f /etc/swift/remakerings.$p;
+  echo "deleted /etc/swift/remakerings.$p"
+done
+
+chmod +x /etc/swift/remakerings
+echo "made remakerings executable (+x)"
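+
+# For reference (illustrative, assuming two devices swift-d0 and swift-d1),
+# the generated /etc/swift/remakerings ends up looking roughly like:
+#   rm -f *.builder *.ring.gz backups/*.builder backups/*.ring.gz
+#   swift-ring-builder object.builder create 10 1 1
+#   swift-ring-builder object.builder add r1z1-127.0.0.1:6200/swift-d0 1
+#   swift-ring-builder object.builder add r1z1-127.0.0.1:6200/swift-d1 1
+#   swift-ring-builder object.builder rebalance
+#   ...followed by the same create/add/rebalance block for container (port
+#   6201) and account (port 6202).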
diff --git a/docker/rootfs/etc/swift_build/prepare_rings b/docker/rootfs/etc/swift_build/prepare_rings
new file mode 100755
index 0000000000..7b951ff223
--- /dev/null
+++ b/docker/rootfs/etc/swift_build/prepare_rings
@@ -0,0 +1,6 @@
+#!/usr/bin/with-contenv sh
+
+/etc/swift_build/build_devices
+/etc/swift_build/build_remakerings
+cd /etc/swift
+/etc/swift/remakerings
diff --git a/docker/s6-gpg-pub-key b/docker/s6-gpg-pub-key
new file mode 100644
index 0000000000..c3a865fa2c
--- /dev/null
+++ b/docker/s6-gpg-pub-key
@@ -0,0 +1,69 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: SKS 1.1.6
+Comment: Hostname: pgp.mit.edu
+
+mQINBFe3YfMBEAC6pERKLjXDcWWrMU9l68ujJkbCjtnKYRKsIjsmvoETHJkCZaHXX0JoVFth
+7OEhEh8wQG6PTWb6HPFWJxKJaLTOS6d5xc7i8iMWFjUkssh7jEJY0unON8OleggjL4bPz2Ra
+Ox5hKJru1A8BjDdT4XyYWk+PFjaJGmll7FyqyVIng2bGRYgRah+CjKPjzk1RX5cfz48lO1wg
+Fs4rzd/SrpcbqMW1nv57ZCNK1nPrDpXytrMA2ZaMxWa5I13NXTQ9hJw0yhCV46f+4vXBvz4l
+0HrVqlZE16iaiW9rniHHM1FFqH9aOMU6PWWNzrO4cyMiNBEgLT5jNAFFteKufUKaOlGRT768
+kyRfvC/uYND3BdZ8EcC+e8Fe+g7Xj/L85853XeCApDIT+FG4Poiby71SWu/PDk9qm/BJ18kh
+6f8EJvWJWMBQJCQHYs5LWEU0BUSnFucbJhd6wF+47wDC9hByvwSOc+5Q4BIj4WHoOCYjaeX5
+ET2Kto7+E4UZjC+38q0G7oH4sOfe7FFHW/R9y/9AUj/AGhNx+lyruKOXKuTZByZlHZKWV4LT
+mkey3NIRahYKWWZIBN8ndAkP62QHuMGfWOKDC6VwgFVQGkHGYZ3NuEUNsN35P77XY7G7K8dV
+wlidTS57JZarNpILNJJsYkfMd6zrRZf9a+cZWMxyvgXKgaCx4QARAQABtDVKdXN0IENvbnRh
+aW5lcnMgQm90IDxqdXN0LmNvbnRhaW5lcnMucm9ib3RAZ21haWwuY29tPokCNAQTAQoAHgUC
+V7dh8wIbAwMLCQcDFQoIAh4BAheAAxYCAQIZAQAKCRBhAbJ4Oy/RYQrJD/49WWEJXgcZClEt
+BQUTo9KZKehAh9K5+455/lFtUh8YEhiF+7HAVlOL3KlGbg/ZUXkrXbGMW4Cm91nz99Fr+rZp
+LPcogZ0Lox5IVPn6zjmxRrWuaEvH/SlnhjUiBj9/rMgWwzTSV0PLP6bOhMJ0NIteAgW+jzSy
+4Sf4N+3XE1HAeL3sUtYex0FXzRTQAjMAnCa6AJS1dCJRc0tuI13XkiZnVnqELF2CCSnaPj6o
+hn/90/sKhr7PSGQznagiAjG49nzqOE/9CRVOy8JqNS+1Y8A1PmCVofvgy3uaPKL/yLMRXk2j
++5Fed9aVGXG3JE5lJjWUAyeL3jTEdE336tc+kHVUXrTSza/akvFHTJQfaw+MVuRIPT2JvZLl
+ePOxHgM+U9eOJ7rwXYoLS/e5KrGvhi+LCMO3r4UfIGL3cgtGkM7rwvfY3uMCq7hfoA6d4SGw
+h99J6h3M7O9+UxB4VH8yjQJl6ghY0ruEgp1PpKSo9Ogdz/loZpEExnOzp4zrdFalKcy9ehUh
+Ody/S79NlKsWOE1DtbM6IQHDxZplT9IJhTxuqrDgsIaYgwUxipqvA/kEU5k5QIIoJU8u5o6i
+ZLuC6mlqOhjmLst6/ndXuVAG4GwDKrwxri3zmctxHRwDzTJXsZsKYOqrheO6HRu+6VVVNAI5
+Q/nI/vN79vbZGAb8Z5PgZrkBDQRXt2HzAQgAsrKhLIusc/9dUOPi9f3FN30obwZLZRp8qTND
+glqSyAaL5WiiGJII1erM66s1dIv1qqUbTNd6nAKfb2w5zbgAOTAKsGNEzljFKAApdZm/sAyk
+Wx9PTqVQov6PAjzgoWC9yH8UcxhvxPtpw+rqnz1oUVK9paszoZWuPz5jAE/ZhdrEXy/51ckS
+jJ/p8T55SFK3p6UzSGDqQRfDwHDgDJMIzPABpnPk+ETf/YYWbJwOx81YrlRKBau8XdyBkRlK
+ZeZ+SrvDMugn45lWSdjXJZ2BH1U7akuWd7lYP3xI/Vfs2rF3e+7+72W75s/3pOVckdbgn13B
+REgdptgOBX9ILCtpwQARAQABiQNEBBgBCgAPBQJXt2HzBQkPCZwAAhsiASkJEGEBsng7L9Fh
+wF0gBBkBCgAGBQJXt2HzAAoJECU2yhbfT82iCzoH/iAw5+zBpXdE3Ju/KrCpZ4JwzSkAw4n7
+uj4UzTtzYb5KfkXAkIQFq5MTHJ6jpHe6g6aJf5Z4NV2cbw/4d9W5rAzXkuKnksoo7JbRDt+T
+adCBCuoz8HvkVT4lgV6TTWx3kMESGaqz/y0d8P+FRCKhmbv4ayTAZZJM2cdDcqtum8sYPs9R
+d6L13x8hZGTSKavLwus64/GA2tOa334zDDI1+7AoJRRLApqdYZmX/LrQykNoNR7RSzLIn5+S
+GdCS6JU8c0oQnJgf+7zililWqagkYRqaHhcBy90XiYOPMdHyKmudcfvpYLE78E0iyHhfmsAj
+I+pK3U4MquRA+v8AfL5/PLRKbhAAomTfB2WPI9ea1nN6OfCZZE9bq/PVmeahW0CZoBmCQJLn
+oypbBtMUnOhSFd+QUWekH8+prkvq15s8LdjfhJWlzMRbwourZvffmeHX8dTuMZwwV+7flnf+
+AH9OnwcKNg5/T4aRm3gZGSV7fTFh1Regx3136TIyRcwPqjwqbc9slW6Bg9veE3ayveUKaG0S
+WDjkPad4wqFWTF84vAD+T6p1hMxBrInkj8ocHXkyxdndQAuVd4dCjdm/dlpFs/ntZFhVQUFG
+zjqZaSvqQpKIui1x3WDap1RFy7n81B/e23eO+R8CyJg+upI38FIroR38EGhEFAjgcqKSi+0f
+WDsXR49XjIO5EX7RkFhnMudvMA+sW2PsI7yAfIFrTO8VEnevAwsVNIeTpyYnVVFBTUGeRP5u
+9eNoLO3wHpARvsT4JtmdVWoTX2XzQA9xXa+6cOmiT4XLnwtIU4a8W1dfINqMUVLBhIJD2zvL
+TppISqzmIISugSMiNND0kvkp9moYXz0QodrEHzJDZmzqbTv5IAs+gPER1eNS2BZKJjXJ7Egn
+2JDWIRgm2kzS1BaSyL004F39AfsKCBcsBsbsTIUcmpRUwLjMpdkomkGGA3RHnfk06odrEEQO
+72ZOIsIwd1+X5U8tK9pnEH0/RsZONUMPtGrQ4Pe0ZlNZUHCyN6U633MUO32Wmru5AQ0EV7dh
+8wEIAOAvY6Wrlp6k/Fknu/wIZLWoGIOTR11iYgHHvVWWeoatleewsqHbzCMiCQ5txX5RJJv7
+F5xDURmoqwpKdkjFVqriuCt506MeztBohRqTvDYOczS/eQJuI+pR9/aGmESErP9+B9AmQ+rN
+no391Z+HRI75VIP+AnTZGYVMec5fQbFUwws3Dt9VeXgPIPixfVoXtz5vQPj9EfH3RTQ//9Vz
+zznZkHBPFMroM3VLznwlDb9a2Z4S4WVgztMMrZnlYmym6tN1sm61TPNK+4KFy+FNFbudcHcg
+AXXT7H5/rNhUD8aMMLAQHqNCeg/eXCQO0Sp2TzBs/x90jti9cGmyMfsZDKkAEQEAAYkDRAQY
+AQoADwUCV7dh8wUJDwmcAAIbDAEpCRBhAbJ4Oy/RYcBdIAQZAQoABgUCV7dh8wAKCRDZBk7K
+WLNt46vQB/0QOlN8vMJNVlJJZ2TD+Es63/bjd/oa1djnBXFhqii/vY1WI7c1lUK+JPIu7RpE
+eb3ZwpwnTeHxLe+kJtvEjTdHygM0KtWdq+MHAX+t+5AJA9UyVIQupztH+/87/GvtxYMIQRwg
+WY9ExP1HAi8vyLxOxQNmc1A3boYY5GA16L3AOGxtOIn43qDTz5RwY+s1A1zyUq4zczBA/Fma
+ddqN0N/arjHEkE1cLXEypcYme1xfLE8mpU3/7FSyHdQxW2o/KqoDkqVj12oKAMuBnKcYoKmr
+qsmy8eHpmbfMUrRE7frpGeF4II/NgCfEYOAxysOOq4IRXQClaZpquL4AOXN2EVjz/awQAKU6
+fpScpzZoNAMJYnbTQrs8YEy4VUFvUyZWpSVDj5aAhrZApbb7LfGQyBMFxHARnwDGv9AK6Sl+
+vHp8zvPn9nHE3D9tLGIWtjCRRhPe/RY1wWyw8ZUmBN6jDZ1LSh/Tqr7J24zsLmxGBUJcDfZ/
+awv/sabqPp0AGbs/qQwjxgWj9en6IS2+mWnWL3sQXOmxdFil/0+Tx5WOrEtCkR35yPLnTSeY
+xKP6KKfG7gA8xLxXKxxVMojjAzN0Dxb0+0iQ4RwPygb79OzAsx588Rv2Qo8kf0QyvgUZhufv
+q355qQ248FU4gBEcLc5b2yu1Iz1nToubu74Uwl9t7XzZs+RP/6ZGuItSHxsqLzVFexmNdcXh
+oKfu58NnH1Fi9wMKtAKCH31q235wSh/x0YM391cdIvSjxfItNXtykR7KDbal7YLOa5dKyRyf
+2WiYMCEAQSoRVj6A4ylRsqs9hirvYinNSWPa1ZrketKz+9g+rj0/pmQjKAPiapYkarp5yT8d
+dgQ1XuwGCaPZXhByS9s6SonZwvrthrHFoWfK7JzkepYoBKy/nGUNt+9NDWbCB6sAe2zLAfmA
+tsOhB7ZO8/AlPRQCIvEGRXcEtbYkxtB2vMNGPbIoHDv5QvbHP0Foj79SwRg/2a9wiq6i5Vwv
+wGWOhC4ELGF+imX35GGbJq0a8A2z5WX6
+=VHze
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/etc/account-server.conf-sample b/etc/account-server.conf-sample
index 5cb0d2897e..fc21517a11 100644
--- a/etc/account-server.conf-sample
+++ b/etc/account-server.conf-sample
@@ -1,38 +1,86 @@
 [DEFAULT]
 # bind_ip = 0.0.0.0
-# bind_port = 6002
+bind_port = 6202
+# keep_idle = 600
 # bind_timeout = 30
 # backlog = 4096
-# workers = 1
 # user = swift
 # swift_dir = /etc/swift
 # devices = /srv/node
 # mount_check = true
 # disable_fallocate = false
+#
+# Use an integer to override the number of pre-forked processes that will
+# accept connections.
+# workers = auto
+#
+# Maximum concurrent requests per worker
+# max_clients = 1024
+#
 # You can specify default log routing here if you want:
 # log_name = swift
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
+# Hashing algorithm for log anonymization. Must be one of algorithms supported
+# by Python's hashlib.
+# log_anonymization_method = MD5
+#
+# Salt added during log anonymization
+# log_anonymization_salt =
+#
+# Template used to format logs. All words surrounded by curly brackets
+# will be substituted with the appropriate values
+# log_format = {remote_addr} - - [{time.d}/{time.b}/{time.Y}:{time.H}:{time.M}:{time.S} +0000] "{method} {path}" {status} {content_length} "{referer}" "{txn_id}" "{user_agent}" {trans_time:.4f} "{additional_info}" {pid} {policy_index}
+#
 # comma separated list of functions to call to setup custom log handlers.
 # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 # adapted_logger
 # log_custom_handlers =
+#
 # If set, log_udp_host will override log_address
 # log_udp_host =
 # log_udp_port = 514
+#
 # You can enable StatsD logging here:
-# log_statsd_host = localhost
+# log_statsd_host =
 # log_statsd_port = 8125
-# log_statsd_default_sample_rate = 1
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
 # log_statsd_metric_prefix =
+#
 # If you don't mind the extra disk space usage in overhead, you can turn this
 # on to preallocate disk space with SQLite databases to decrease fragmentation.
 # db_preallocation = off
+#
+# Enable this option to log all sqlite3 queries (requires python >=3.3)
+# db_query_logging = off
+#
 # eventlet_debug = false
+#
+# You can set fallocate_reserve to the number of bytes or percentage of disk
+# space you'd like fallocate to reserve, whether there is space for the given
+# file size or not. Percentage will be used if the value ends with a '%'.
+# fallocate_reserve = 1%
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [pipeline:main]
-pipeline = healthcheck recon account-server
+pipeline = healthcheck recon backend_ratelimit account-server
 
 [app:account-server]
 use = egg:swift#account
@@ -40,9 +88,41 @@ use = egg:swift#account
 # set log_name = account-server
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_requests = True
+# set log_requests = true
 # set log_address = /dev/log
-# auto_create_account_prefix = .
+#
+# You can disable REPLICATE handling (default is to allow it). When deploying
+# a cluster with a separate replication network, you'll want multiple
+# account-server processes running: one for client-driven traffic and another
+# for replication traffic. The server handling client-driven traffic may set
+# this to false. If there is only one account-server process, leave this as
+# true.
+# replication_server = true
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# You can set fallocate_reserve to the number of bytes or percentage
+# of disk space you'd like kept free at all times. If the disk's free
+# space falls below this value, then PUT, POST, and REPLICATE requests
+# will be denied until the disk has more space available. Percentage
+# will be used if the value ends with a '%'.
+# fallocate_reserve = 1%
+#
+# When reloading servers with SIGUSR1, workers running with old config/code
+# are allowed some time to finish serving in-flight requests. Use this to
+# configure the grace period (in seconds), after which the reloaded server
+# will issue SIGKILLs to remaining stale workers.
+# stale_worker_timeout = 86400
 
 [filter:healthcheck]
 use = egg:swift#healthcheck
@@ -54,29 +134,106 @@ use = egg:swift#healthcheck
 use = egg:swift#recon
 # recon_cache_path = /var/cache/swift
 
+[filter:backend_ratelimit]
+use = egg:swift#backend_ratelimit
+# Config options can optionally be loaded from a separate config file. Config
+# options in this section will be used unless the same option is found in the
+# config file, in which case the config file option will be used. See the
+# backend-ratelimit.conf-sample file for details of available config options.
+# backend_ratelimit_conf_path = /etc/swift/backend-ratelimit.conf
+
+# The minimum interval between attempts to reload any config file at
+# backend_ratelimit_conf_path while the server is running. A value of 0 means
+# that the file is loaded at start-up but not subsequently reloaded. Note that
+# config options in this section are never reloaded after start-up.
+# config_reload_interval = 60
+
 [account-replicator]
 # You can override the default log routing for this app here (don't use set!):
 # log_name = account-replicator
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# vm_test_mode = no
+#
+# Maximum number of database rows that will be sync'd in a single HTTP
+# replication request. Databases with less than or equal to this number of
+# differing rows will always be sync'd using an HTTP replication request rather
+# than using rsync.
 # per_diff = 1000
+#
+# Maximum number of HTTP replication requests attempted on each replication
+# pass for any one account database. This caps how long the replicator will
+# spend trying to sync a given database per pass so the other databases don't
+# get starved.
 # max_diffs = 100
+#
+# Number of replication workers to spawn.
 # concurrency = 8
-# interval = 30
-# How long without an error before a node's error count is reset. This will
-# also be how long before a node is reenabled after suppression is triggered.
-# error_suppression_interval = 60
-# How many errors can accumulate before a node is temporarily ignored.
-# error_suppression_limit = 10
+#
+# Time in seconds to wait between replication passes
+# interval = 30.0
+# run_pause is deprecated, use interval instead
+# run_pause = 30.0
+#
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
+#
 # The replicator also performs reclamation
 # reclaim_age = 604800
-# Time in seconds to wait between replication passes
-# run_pause = 30
+#
+# Allow rsync to compress data which is transmitted to destination node
+# during sync. However, this is applicable only when destination node is in
+# a different region than the local one.
+# rsync_compress = no
+#
+# Format of the rsync module where the replicator will send data. See
+# etc/rsyncd.conf-sample for some usage examples.
+# rsync_module = {replication_ip}::account
+#
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# The handoffs_only and handoff_delete options are for special-case emergency
+# situations such as full disks in the cluster. These options SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no
+#
+# handoff_delete is the number of replicas which are ensured in swift.
+# If a number less than the number of replicas is set, the account-replicator
+# may delete local handoffs even though not all replicas are ensured in the
+# cluster. The replicator will remove the local handoff account database after
+# syncing when the number of successful responses is greater than or equal to
+# this number. By default (auto), handoff partitions will be removed only when
+# they have been successfully replicated to all the canonical nodes.
+# handoff_delete = auto
 
 [account-auditor]
 # You can override the default log routing for this app here (don't use set!):
@@ -84,11 +241,24 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# Will audit, at most, 1 account per device per interval
-# interval = 1800
-# log_facility = LOG_LOCAL0
-# log_level = INFO
+#
+# Will audit each account at most once per interval
+# interval = 1800.0
+#
+# accounts_per_second = 200
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [account-reaper]
 # You can override the default log routing for this app here (don't use set!):
@@ -96,11 +266,72 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+#
 # concurrency = 25
-# interval = 3600
+# interval = 3600.0
 # node_timeout = 10
 # conn_timeout = 0.5
+#
 # Normally, the reaper begins deleting account information for deleted accounts
 # immediately; you can set this to delay its work however. The value is in
-# seconds; 2592000 = 30 days for example.
+# seconds; 2592000 = 30 days for example. The sum of this value and the
+# container-updater interval should be less than the account-replicator
+# reclaim_age. This ensures that once the account-reaper has deleted a
+# container there is sufficient time for the container-updater to report to the
+# account before the account DB is removed.
 # delay_reaping = 0
+#
+# If the account fails to be reaped due to a persistent error, the
+# account reaper will log a message such as:
+#     Account <name> has not been reaped since <date>
+# You can search logs for this message if space is not being reclaimed
+# after you delete account(s).
+# Default is 2592000 seconds (30 days). This is in addition to any time
+# requested by delay_reaping.
+# reap_warn_after = 2592000
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+
+# Note: Put it at the beginning of the pipeline to profile all middleware. But
+# it is safer to put this after healthcheck. Not intended for production
+# environments!
+[filter:xprofile]
+use = egg:swift#xprofile
+# This option enables you to switch profilers; the profiler should inherit
+# from the Python standard profiler. Currently supported values include
+# 'cProfile' and 'eventlet.green.profile'.
+# profile_module = eventlet.green.profile
+#
+# This prefix will be used to combine process ID and timestamp to name the
+# profile data file.  Make sure the executing user has permission to write
+# into this path (missing path segments will be created, if necessary).
+# If you enable profiling in more than one type of daemon, you must override
+# it with a unique value like: /var/log/swift/profile/account.profile
+# log_filename_prefix = /tmp/log/swift/profile/default.profile
+#
+# The profile data will be dumped to local disk based on the above naming
+# rule at this interval.
+# dump_interval = 5.0
+#
+# Be careful: this option makes the profiler dump data into timestamped files,
+# which means many files will pile up in the directory.
+# dump_timestamp = false
+#
+# This is the path of the URL to access the mini web UI.
+# path = /__profile__
+#
+# Clear the data when the wsgi server shutdown.
+# flush_at_shutdown = false
+#
+# unwind the iterator of applications
+# unwind = false
diff --git a/etc/backend-ratelimit.conf-sample b/etc/backend-ratelimit.conf-sample
new file mode 100644
index 0000000000..5912b6668e
--- /dev/null
+++ b/etc/backend-ratelimit.conf-sample
@@ -0,0 +1,30 @@
+[backend_ratelimit]
+# The rate of requests to each device is limited by an overall per-device rate
+# limit that is applied to all requests to the device and/or a
+# per-method-per-device rate limit that is applied to requests of that method
+# to the device. If either of these rates would be exceeded the server will
+# return 529 responses and emit a 'backend.ratelimit' statsd metric without
+# logging.
+
+# Set the maximum overall rate of requests per device per second per worker for
+# all request methods. The default value of zero causes no per-device
+# rate-limiting to be applied other than that configured for specific request
+# methods.
+# requests_per_device_per_second = 0.0
+
+# Set maximum rate of requests per device per second per worker for individual
+# request methods. The default value of zero causes no per-method
+# rate-limiting to be applied. Note: the aggregate rate of requests for all
+# methods is still limited by requests_per_device_per_second even if a higher
+# per method rate is configured.
+# delete_requests_per_device_per_second = 0.0
+# get_requests_per_device_per_second = 0.0
+# head_requests_per_device_per_second = 0.0
+# post_requests_per_device_per_second = 0.0
+# put_requests_per_device_per_second = 0.0
+# replicate_requests_per_device_per_second = 0.0
+# update_requests_per_device_per_second = 0.0
+
+# Set the number of seconds of unused rate-limiting allowance that can
+# accumulate and be used to allow a subsequent burst of requests.
+# requests_per_device_rate_buffer = 1.0
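+
+# Illustrative example (the values are placeholders, not recommendations):
+# cap each device at 250 requests/s overall while further limiting REPLICATE
+# requests to 5/s per device per worker:
+# requests_per_device_per_second = 250.0
+# replicate_requests_per_device_per_second = 5.0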
diff --git a/etc/container-reconciler.conf-sample b/etc/container-reconciler.conf-sample
new file mode 100644
index 0000000000..6011a3c21e
--- /dev/null
+++ b/etc/container-reconciler.conf-sample
@@ -0,0 +1,96 @@
+[DEFAULT]
+# swift_dir = /etc/swift
+# user = swift
+# ring_check_interval = 15.0
+# You can specify default log routing here if you want:
+# log_name = swift
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# comma separated list of functions to call to setup custom log handlers.
+# functions get passed: conf, name, log_to_console, log_route, fmt, logger,
+# adapted_logger
+# log_custom_handlers =
+#
+# If set, log_udp_host will override log_address
+# log_udp_host =
+# log_udp_port = 514
+#
+# You can enable StatsD logging here:
+# log_statsd_host =
+# log_statsd_port = 8125
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
+# log_statsd_metric_prefix =
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+
+[container-reconciler]
+# If the source object is not available, the reconciler will re-attempt
+# reconciliation for up to reclaim_age seconds before it gives up and deletes
+# the entry in the queue.
+# reclaim_age = 604800
+# The cycle time of the daemon
+# interval = 30.0
+# Server errors from requests will be retried by default
+# request_tries = 3
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+# Number of objects to process concurrently per process
+# concurrency = 1
+
+# processes is how many parts to divide the work into, one part per process
+# that will be doing the work
+# Setting processes to 0 means a single process will do all the work.
+# processes = 0
+#
+# process is which of the parts a particular process will work on
+# process is "zero based": if you want to use 3 processes, you should run
+# three processes with process set to 0, 1, and 2 respectively
+# process = 0
+
+[pipeline:main]
+# Note that the reconciler's pipeline is intentionally very sparse -- it is
+# only responsible for moving data from one policy to another and should not
+# perform any transformations beyond (potentially) changing erasure coding.
+# It notably MUST NOT include transformative middlewares (such as encryption),
+# redirection middlewares (such as symlink), or composing middlewares (such
+# as slo and dlo).
+pipeline = catch_errors proxy-logging cache proxy-server
+
+[app:proxy-server]
+use = egg:swift#proxy
+# See proxy-server.conf-sample for options
+
+[filter:cache]
+use = egg:swift#memcache
+# See proxy-server.conf-sample for options
+
+[filter:proxy-logging]
+use = egg:swift#proxy_logging
+# See proxy-server.conf-sample for options
+
+[filter:catch_errors]
+use = egg:swift#catch_errors
+# See proxy-server.conf-sample for options
diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample
index 728fa4cd3c..fbf5caad07 100644
--- a/etc/container-server.conf-sample
+++ b/etc/container-server.conf-sample
@@ -1,41 +1,92 @@
 [DEFAULT]
 # bind_ip = 0.0.0.0
-# bind_port = 6001
+bind_port = 6201
+# keep_idle = 600
 # bind_timeout = 30
 # backlog = 4096
-# workers = 1
 # user = swift
 # swift_dir = /etc/swift
 # devices = /srv/node
 # mount_check = true
 # disable_fallocate = false
+#
+# Use an integer to override the number of pre-forked processes that will
+# accept connections.
+# workers = auto
+#
+# Maximum concurrent requests per worker
+# max_clients = 1024
+#
 # This is a comma separated list of hosts allowed in the X-Container-Sync-To
-# field for containers.
+# field for containers. This is the old-style of using container sync. It is
+# strongly recommended to use the new style of a separate
+# container-sync-realms.conf -- see container-sync-realms.conf-sample
 # allowed_sync_hosts = 127.0.0.1
+#
 # You can specify default log routing here if you want:
 # log_name = swift
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
+# Hashing algorithm for log anonymization. Must be one of algorithms supported
+# by Python's hashlib.
+# log_anonymization_method = MD5
+#
+# Salt added during log anonymization
+# log_anonymization_salt =
+#
+# Template used to format logs. All words surrounded by curly brackets
+# will be substituted with the appropriate values
+# log_format = {remote_addr} - - [{time.d}/{time.b}/{time.Y}:{time.H}:{time.M}:{time.S} +0000] "{method} {path}" {status} {content_length} "{referer}" "{txn_id}" "{user_agent}" {trans_time:.4f} "{additional_info}" {pid} {policy_index}
+#
 # comma separated list of functions to call to setup custom log handlers.
 # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 # adapted_logger
 # log_custom_handlers =
+#
 # If set, log_udp_host will override log_address
 # log_udp_host =
 # log_udp_port = 514
+#
 # You can enable StatsD logging here:
-# log_statsd_host = localhost
+# log_statsd_host =
 # log_statsd_port = 8125
-# log_statsd_default_sample_rate = 1
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
 # log_statsd_metric_prefix =
+#
 # If you don't mind the extra disk space usage in overhead, you can turn this
 # on to preallocate disk space with SQLite databases to decrease fragmentation.
 # db_preallocation = off
+#
+# Enable this option to log all sqlite3 queries (requires python >=3.3)
+# db_query_logging = off
+#
 # eventlet_debug = false
+#
+# You can set fallocate_reserve to the number of bytes or percentage of disk
+# space you'd like fallocate to reserve, whether there is space for the given
+# file size or not. Percentage will be used if the value ends with a '%'.
+# fallocate_reserve = 1%
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [pipeline:main]
-pipeline = healthcheck recon container-server
+pipeline = healthcheck recon backend_ratelimit container-server
 
 [app:container-server]
 use = egg:swift#container
@@ -43,12 +94,45 @@ use = egg:swift#container
 # set log_name = container-server
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_requests = True
+# set log_requests = true
 # set log_address = /dev/log
+#
 # node_timeout = 3
 # conn_timeout = 0.5
-# allow_versions = False
-# auto_create_account_prefix = .
+# allow_versions = false
+#
+# You can disable REPLICATE handling (default is to allow it). When deploying
+# a cluster with a separate replication network, you'll want multiple
+# container-server processes running: one for client-driven traffic and another
+# for replication traffic. The server handling client-driven traffic may set
+# this to false. If there is only one container-server process, leave this as
+# true.
+# replication_server = true
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# You can set fallocate_reserve to the number of bytes or percentage
+# of disk space you'd like kept free at all times. If the disk's free
+# space falls below this value, then PUT, POST, and REPLICATE requests
+# will be denied until the disk has more space available. Percentage
+# will be used if the value ends with a '%'.
+# fallocate_reserve = 1%
+#
+# When reloading servers with SIGUSR1, workers running with old config/code
+# are allowed some time to finish serving in-flight requests. Use this to
+# configure the grace period (in seconds), after which the reloaded server
+# will issue SIGKILLs to remaining stale workers.
+# stale_worker_timeout = 86400
 
 [filter:healthcheck]
 use = egg:swift#healthcheck
@@ -60,24 +144,106 @@ use = egg:swift#healthcheck
 use = egg:swift#recon
 #recon_cache_path = /var/cache/swift
 
+[filter:backend_ratelimit]
+use = egg:swift#backend_ratelimit
+# Config options can optionally be loaded from a separate config file. Config
+# options in this section will be used unless the same option is found in the
+# config file, in which case the config file option will be used. See the
+# backend-ratelimit.conf-sample file for details of available config options.
+# backend_ratelimit_conf_path = /etc/swift/backend-ratelimit.conf
+
+# The minimum interval between attempts to reload any config file at
+# backend_ratelimit_conf_path while the server is running. A value of 0 means
+# that the file is loaded at start-up but not subsequently reloaded. Note that
+# config options in this section are never reloaded after start-up.
+# config_reload_interval = 60
+
 [container-replicator]
 # You can override the default log routing for this app here (don't use set!):
 # log_name = container-replicator
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# vm_test_mode = no
+#
+# Maximum number of database rows that will be sync'd in a single HTTP
+# replication request. Databases with less than or equal to this number of
+# differing rows will always be sync'd using an HTTP replication request rather
+# than using rsync.
 # per_diff = 1000
+#
+# Maximum number of HTTP replication requests attempted on each replication
+# pass for any one container. This caps how long the replicator will spend
+# trying to sync a given database per pass so the other databases don't get
+# starved.
 # max_diffs = 100
+#
+# Number of replication workers to spawn.
 # concurrency = 8
-# interval = 30
+#
+# Time in seconds to wait between replication passes
+# interval = 30.0
+# run_pause is deprecated, use interval instead
+# run_pause = 30.0
+#
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
+#
 # The replicator also performs reclamation
 # reclaim_age = 604800
-# Time in seconds to wait between replication passes
-# run_pause = 30
+#
+# Allow rsync to compress data which is transmitted to destination node
+# during sync. However, this is applicable only when destination node is in
+# a different region than the local one.
+# rsync_compress = no
+#
+# Format of the rsync module where the replicator will send data. See
+# etc/rsyncd.conf-sample for some usage examples.
+# rsync_module = {replication_ip}::container
+#
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# The handoffs_only and handoff_delete options are for special-case emergency
+# situations such as full disks in the cluster. These options SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no
+#
+# handoff_delete is the number of replicas which are ensured in swift.
+# If a number less than the number of replicas is set, the container-replicator
+# may delete local handoffs even though not all replicas are ensured in the
+# cluster: the replicator will remove a local handoff container database after
+# syncing when the number of successful responses is greater than or equal to
+# this number. By default (auto), handoff partitions will only be removed
+# once they have successfully replicated to all the canonical nodes.
+# handoff_delete = auto
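+#
+# For example (illustrative, assuming 3 container replicas): setting
+# handoff_delete = 2 would let the replicator remove a local handoff
+# database once 2 of the 3 primary nodes have been successfully synced,
+# rather than waiting for all 3 as the default 'auto' does.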
 
 [container-updater]
 # You can override the default log routing for this app here (don't use set!):
@@ -85,15 +251,35 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# interval = 300
+#
+# interval = 300.0
 # concurrency = 4
 # node_timeout = 3
 # conn_timeout = 0.5
-# slowdown will sleep that amount between containers
+#
+# Send at most this many container updates per second
+# containers_per_second = 50
+#
+# slowdown will sleep that amount between containers. Deprecated; use
+# containers_per_second instead.
 # slowdown = 0.01
+#
 # Seconds to suppress updating an account that has generated an error
 # account_suppression_time = 60
+#
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [container-auditor]
 # You can override the default log routing for this app here (don't use set!):
@@ -101,9 +287,24 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# Will audit, at most, 1 container per device per interval
-# interval = 1800
+#
+# Will audit each container at most once per interval
+# interval = 1800.0
+#
+# containers_per_second = 200
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [container-sync]
 # You can override the default log routing for this app here (don't use set!):
@@ -111,9 +312,246 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+#
 # If you need to use an HTTP Proxy, set it here; defaults to no proxy.
-# sync_proxy = http://127.0.0.1:8888
-# Will sync, at most, each container once per interval
-# interval = 300
+# You can also set this to a comma separated list of HTTP Proxies and they will
+# be randomly used (simple load balancing).
+# sync_proxy = http://10.1.1.1:8888,http://10.1.1.2:8888
+#
+# Will sync each container at most once per interval
+# interval = 300.0
+#
 # Maximum amount of time to spend syncing each container per pass
 # container_time = 60
+#
+# Maximum amount of time in seconds for the connection attempt
+# conn_timeout = 5
+# Server errors from requests will be retried by default
+# request_tries = 3
+#
+# Internal client config file path
+# internal_client_conf_path = /etc/swift/internal-client.conf
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+
+# Note: Put it at the beginning of the pipeline to profile all middleware. But
+# it is safer to put this after healthcheck. Not intended for production
+# environments!
+[filter:xprofile]
+use = egg:swift#xprofile
+# This option enables you to switch profilers; the profiler should inherit from
+# Python's standard profiler. Currently the supported values include 'cProfile',
+# 'eventlet.green.profile', etc.
+# profile_module = eventlet.green.profile
+#
+# This prefix will be used to combine process ID and timestamp to name the
+# profile data file.  Make sure the executing user has permission to write
+# into this path (missing path segments will be created, if necessary).
+# If you enable profiling in more than one type of daemon, you must override
+# it with a unique value like: /var/log/swift/profile/container.profile
+# log_filename_prefix = /tmp/log/swift/profile/default.profile
+#
+# The profile data will be dumped to local disk based on the above naming rule
+# at this interval.
+# dump_interval = 5.0
+#
+# Be careful: this option will enable the profiler to dump data into files with
+# timestamps, which means lots of files will pile up in the directory.
+# dump_timestamp = false
+#
+# This is the path of the URL to access the mini web UI.
+# path = /__profile__
+#
+# Clear the data when the wsgi server shuts down.
+# flush_at_shutdown = false
+#
+# unwind the iterator of applications
+# unwind = false
+
+[container-sharder]
+# You can override the default log routing for this app here (don't use set!):
+# log_name = container-sharder
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# Container sharder specific settings
+#
+# If the auto_shard option is true then the sharder will automatically select
+# containers to shard, scan for shard ranges, and select shards to shrink.
+# The default is false.
+# Warning: auto-sharding is still under development and should not be used in
+# production; do not set this option to true in a production cluster.
+# auto_shard = false
+#
+# When auto-sharding is enabled shard_container_threshold defines the object
+# count at which a container with container-sharding enabled will start to
+# shard. shard_container_threshold also indirectly determines the defaults for
+# rows_per_shard, shrink_threshold and expansion_limit.
+# shard_container_threshold = 1000000
+#
+# rows_per_shard determines the initial nominal size of shard containers. The
+# default is shard_container_threshold // 2
+# rows_per_shard = 500000
+#
+# Minimum size of the final shard range. If this is greater than one then the
+# final shard range may be extended to more than rows_per_shard in order to
+# avoid a further shard range with less than minimum_shard_size rows. The
+# default value is rows_per_shard // 5.
+# minimum_shard_size = 100000
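+#
+# As a worked example of the defaults above: with
+# shard_container_threshold = 1000000, rows_per_shard defaults to 500000
+# (1000000 // 2) and minimum_shard_size defaults to 100000 (500000 // 5).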
+#
+# When auto-sharding is enabled shrink_threshold defines the object count
+# below which a 'donor' shard container will be considered for shrinking into
+# another 'acceptor' shard container. The default is determined by
+# shard_shrink_point. If set, shrink_threshold will take precedence over
+# shard_shrink_point.
+# shrink_threshold =
+#
+# When auto-sharding is enabled shard_shrink_point defines the object count
+# below which a 'donor' shard container will be considered for shrinking into
+# another 'acceptor' shard container. shard_shrink_point is a percentage of
+# shard_container_threshold e.g. the default value of 10 means 10% of the
+# shard_container_threshold.
+# Deprecated: shrink_threshold is recommended and if set will take precedence
+# over shard_shrink_point.
+# shard_shrink_point = 10
+#
+# When auto-sharding is enabled expansion_limit defines the maximum
+# allowed size of an acceptor shard container after having a donor merged into
+# it. The default is determined by shard_shrink_merge_point.
+# If set, expansion_limit will take precedence over shard_shrink_merge_point.
+# expansion_limit =
+#
+# When auto-sharding is enabled shard_shrink_merge_point defines the maximum
+# allowed size of an acceptor shard container after having a donor merged into
+# it. Shard_shrink_merge_point is a percentage of shard_container_threshold.
+# e.g. the default value of 75 means that the projected sum of a donor object
+# count and acceptor count must be less than 75% of shard_container_threshold
+# for the donor to be allowed to merge into the acceptor.
+#
+# For example, if the shard_container_threshold is 1 million,
+# shard_shrink_point is 10, and shard_shrink_merge_point is 75 then a shard will
+# be considered for shrinking if it has less than or equal to 100 thousand
+# objects but will only merge into an acceptor if the combined object count
+# would be less than or equal to 750 thousand objects.
+# Deprecated: expansion_limit is recommended and if set will take precedence
+# over shard_shrink_merge_point.
+# shard_shrink_merge_point = 75
+#
+# When auto-sharding is enabled shard_scanner_batch_size defines the maximum
+# number of shard ranges that will be found each time the sharder daemon visits
+# a sharding container. If necessary the sharder daemon will continue to search
+# for more shard ranges each time it visits the container.
+# shard_scanner_batch_size = 10
+#
+# cleave_batch_size defines the number of shard ranges that will be cleaved
+# each time the sharder daemon visits a sharding container.
+# cleave_batch_size = 2
+#
+# cleave_row_batch_size defines the size of batches of object rows read from a
+# sharding container and merged to a shard container during cleaving.
+# cleave_row_batch_size = 10000
+#
+# max_expanding defines the maximum number of shards that could be expanded in a
+# single cycle of the sharder. Defaults to unlimited (-1).
+# max_expanding = -1
+#
+# max_shrinking defines the maximum number of shards that should be shrunk into
+# each expanding shard. Defaults to 1.
+# NOTE: Using values greater than 1 may result in temporary gaps in object listings
+# until all selected shards have shrunk.
+# max_shrinking = 1
+#
+# Defines the number of successfully replicated shard dbs required when
+# cleaving a previously uncleaved shard range before the sharder will progress
+# to the next shard range. The value should be less than or equal to the
+# container ring replica count. The default of 'auto' causes the container ring
+# quorum value to be used. This option only applies to the container-sharder
+# replication and does not affect the number of shard container replicas that
+# will eventually be replicated by the container-replicator.
+# shard_replication_quorum = auto
+#
+# Defines the number of successfully replicated shard dbs required when
+# cleaving a shard range that has been previously cleaved on another node
+# before the sharder will progress to the next shard range. The value should be
+# less than or equal to the container ring replica count. The default of 'auto'
+# causes the shard_replication_quorum value to be used. This option only
+# applies to the container-sharder replication and does not affect the number
+# of shard container replicas that will eventually be replicated by the
+# container-replicator.
+# existing_shard_replication_quorum = auto
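+#
+# For example, with a typical 3-replica container ring (ring quorum of 2),
+# leaving both options at 'auto' means the sharder waits for 2 successful
+# shard db replications before moving on to the next shard range
+# (illustrative; the actual quorum depends on your ring's replica count).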
+#
+# The sharder uses an internal client to create and make requests to
+# containers. The absolute path to the client config file can be configured.
+# internal_client_conf_path = /etc/swift/internal-client.conf
+#
+# The number of times the internal client will retry requests.
+# request_tries = 3
+#
+# Each time the sharder dumps stats to the recon cache file it includes a list
+# of containers that appear to need sharding but are not yet sharding. By
+# default this list is limited to the top 5 containers, ordered by object
+# count. The limit may be changed by setting recon_candidates_limit to an
+# integer value. A negative value implies no limit.
+# recon_candidates_limit = 5
+#
+# As the sharder visits each container that's currently sharding it dumps to
+# recon their current progress. To be able to mark their progress as completed
+# this in-progress check will need to monitor containers that have just
+# completed sharding. The recon_sharded_timeout parameter controls how long a
+# container that has just finished sharding should be checked by the in-progress
+# check. This is to allow anything monitoring the sharding recon dump to have
+# enough time to collate and see things complete. The time is capped at
+# reclaim_age, so this parameter should be less than or equal to reclaim_age.
+# The default is 12 hours (12 x 60 x 60)
+# recon_sharded_timeout = 43200
+#
+# Maximum amount of time in seconds after sharding has been started on a shard
+# container before it is considered to have timed out. After this amount of
+# time, the sharder will warn that a container DB has not completed sharding.
+# The default is 48 hours (48 x 60 x 60)
+# container_sharding_timeout = 172800
+#
+# Some sharder states lead to repeated messages of 'Reclaimable db stuck
+# waiting for shrinking' on every sharder cycle. To reduce noise in logs,
+# this message will be suppressed for some time after its last emission.
+# Default is 24 hours.
+# periodic_warnings_interval = 86400
+#
+# Large databases tend to take a while to work with, but we want to make sure
+# we write down our progress. Use a larger-than-normal broker timeout to make
+# us less likely to bomb out on a LockTimeout.
+# broker_timeout = 60
+#
+# Time in seconds to wait between emitting stats to logs
+# stats_interval = 3600.0
+#
+# Time in seconds to wait between sharder cycles
+# interval = 30.0
+#
+# Process at most this many databases per second
+# databases_per_second = 50
+#
+# The container-sharder accepts the following configuration options as defined
+# in the container-replicator section:
+#
+# per_diff = 1000
+# max_diffs = 100
+# concurrency = 8
+# node_timeout = 10
+# conn_timeout = 0.5
+# reclaim_age = 604800
+# rsync_compress = no
+# rsync_module = {replication_ip}::container
+# recon_cache_path = /var/cache/swift
+#
diff --git a/etc/container-sync-realms.conf-sample b/etc/container-sync-realms.conf-sample
new file mode 100644
index 0000000000..01e4723ed5
--- /dev/null
+++ b/etc/container-sync-realms.conf-sample
@@ -0,0 +1,47 @@
+# [DEFAULT]
+# The number of seconds between checking the modified time of this config file
+# for changes and therefore reloading it.
+# mtime_check_interval = 300.0
+
+
+# [realm1]
+# key = realm1key
+# key2 = realm1key2
+# cluster_clustername1 = https://host1/v1/
+# cluster_clustername2 = https://host2/v1/
+#
+# [realm2]
+# key = realm2key
+# key2 = realm2key2
+# cluster_clustername3 = https://host3/v1/
+# cluster_clustername4 = https://host4/v1/
+
+
+# Each section name is the name of a sync realm. A sync realm is a set of
+# clusters that have agreed to allow container syncing with each other. Realm
+# names will be considered case insensitive.
+#
+# The key is the overall cluster-to-cluster key used in combination with the
+# external users' key that they set on their containers' X-Container-Sync-Key
+# metadata header values. These keys will be used to sign each request the
+# container sync daemon makes and used to validate each incoming container sync
+# request.
+#
+# The key2 is optional and is an additional key incoming requests will be
+# checked against. This is so you can rotate keys if you wish; you move the
+# existing key to key2 and make a new key value.
+#
+# Any values in the realm section whose names begin with cluster_ will indicate
+# the name and endpoint of a cluster and will be used by external users in
+# their containers' X-Container-Sync-To metadata header values with the format
+# "realm_name/cluster_name/container_name". Realm and cluster names are
+# considered case insensitive.
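+#
+# For example, using the illustrative [realm1] section above, an external
+# user could sync a container to clustername2 by setting on their source
+# container (values are hypothetical):
+#   X-Container-Sync-To: //realm1/clustername2/AUTH_account/container
+#   X-Container-Sync-Key: usersuppliedkey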
+#
+# The endpoint is what the container sync daemon will use when sending out
+# requests to that cluster. Keep in mind this endpoint must be reachable by all
+# container servers, since that is where the container sync daemon runs. Note
+# that the endpoint ends with /v1/ and that the container sync daemon will then
+# add the account/container/obj name after that.
+#
+# Distribute this container-sync-realms.conf file to all your proxy servers
+# and container servers.
diff --git a/etc/dispersion.conf-sample b/etc/dispersion.conf-sample
index d42cf35434..eae777a085 100644
--- a/etc/dispersion.conf-sample
+++ b/etc/dispersion.conf-sample
@@ -1,16 +1,41 @@
 [dispersion]
+# Please create a new account solely for using the dispersion tools; this
+# helps keep your own data clean.
 auth_url = http://localhost:8080/auth/v1.0
 auth_user = test:tester
 auth_key = testing
-# auth_url = http://saio:5000/v2.0/
-# auth_user = test:tester
-# auth_key = testing
+# auth_version = 1.0
+#
+# NOTE: If you want to use keystone (auth version 2.0), then its configuration
+# would look something like:
+# auth_url = http://localhost:5000/v2.0/
+# auth_user = tenant:user
+# auth_key = password
 # auth_version = 2.0
 #
+# NOTE: If you want to use keystone (auth version 3.0), then its configuration
+# would look something like:
+# auth_url = http://localhost:5000/v3/
+# auth_user = user
+# auth_key = password
+# auth_version = 3.0
+# project_name = project
+# project_domain_name = project_domain
+# user_domain_name = user_domain
+#
+# endpoint_type = publicURL
+#
+# NOTE: If you have only 1 region with a swift endpoint, no need to specify it
+# region_name =
+#
+# keystone_api_insecure = no
+#
 # swift_dir = /etc/swift
-# dispersion_coverage = 1
+# dispersion_coverage = 1.0
 # retries = 5
 # concurrency = 25
+# container_populate = yes
+# object_populate = yes
 # container_report = yes
 # object_report = yes
 # dump_json = no
diff --git a/etc/drive-audit.conf-sample b/etc/drive-audit.conf-sample
index 98ef4eb1be..69de6defbf 100644
--- a/etc/drive-audit.conf-sample
+++ b/etc/drive-audit.conf-sample
@@ -1,7 +1,42 @@
 [drive-audit]
+# Set owner of the drive-audit recon cache to this user:
+# user = swift
+#
 # device_dir = /srv/node
+#
+# You can specify default log routing here if you want:
+# log_name = drive-audit
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
 # minutes = 60
 # error_limit = 1
+# recon_cache_path = /var/cache/swift
+# unmount_failed_device = True
+#
+# By default, drive-audit logs only to syslog. Setting this option True
+# makes drive-audit log to console in addition to syslog.
+# log_to_console = False
+#
+# Location of the log file with globbing
+# pattern to check against device errors.
+# log_file_pattern = /var/log/kern.*[!.][!g][!z]
+#
+# On Python 3, the encoding to use when reading the log file. Defaults
+# to the result of locale.getpreferredencoding(), like Python's open().
+# log_file_encoding = auto
+#
+# Regular expression patterns to be used to locate
+# device blocks with errors in the log file. Currently
+# the default ones are as follows:
+#   \berror\b.*\b(sd[a-z]{1,2}\d?)\b
+#   \b(sd[a-z]{1,2}\d?)\b.*\berror\b
+# One can override the default ones by providing
+# new expressions using the format below:
+# Format: regex_pattern_X = regex_expression
+# Example:
+#   regex_pattern_1 = \berror\b.*\b(dm-[0-9]{1,2}\d?)\b
diff --git a/etc/internal-client.conf-sample b/etc/internal-client.conf-sample
new file mode 100644
index 0000000000..cbeb401c03
--- /dev/null
+++ b/etc/internal-client.conf-sample
@@ -0,0 +1,51 @@
+[DEFAULT]
+# swift_dir = /etc/swift
+# user = swift
+# You can specify default log routing here if you want:
+# Note: the 'set' syntax is necessary to override the log_name that some
+# daemons specify when instantiating an internal client.
+# set log_name = swift
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# comma separated list of functions to call to setup custom log handlers.
+# functions get passed: conf, name, log_to_console, log_route, fmt, logger,
+# adapted_logger
+# log_custom_handlers =
+#
+# If set, log_udp_host will override log_address
+# log_udp_host =
+# log_udp_port = 514
+#
+# You can enable StatsD logging here:
+# log_statsd_host =
+# log_statsd_port = 8125
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
+# log_statsd_metric_prefix =
+
+[pipeline:main]
+# Note: gatekeeper middleware is not allowed in the internal client pipeline
+pipeline = catch_errors proxy-logging cache symlink proxy-server
+
+[app:proxy-server]
+use = egg:swift#proxy
+account_autocreate = true
+# See proxy-server.conf-sample for options
+
+[filter:symlink]
+use = egg:swift#symlink
+# See proxy-server.conf-sample for options
+
+[filter:cache]
+use = egg:swift#memcache
+# See proxy-server.conf-sample for options
+
+[filter:proxy-logging]
+use = egg:swift#proxy_logging
+# See proxy-server.conf-sample for options
+
+[filter:catch_errors]
+use = egg:swift#catch_errors
+# See proxy-server.conf-sample for options
diff --git a/etc/keymaster.conf-sample b/etc/keymaster.conf-sample
new file mode 100644
index 0000000000..7881676e1a
--- /dev/null
+++ b/etc/keymaster.conf-sample
@@ -0,0 +1,131 @@
+[keymaster]
+# Over time, the format of crypto metadata on disk may change slightly to resolve
+# ambiguities. In general, you want to be writing the newest version, but to
+# ensure that all writes can still be read during rolling upgrades, there's the
+# option to write older formats as well.
+# Before upgrading from Swift 2.20.0 or earlier, ensure this is set to 1
+# Before upgrading from Swift 2.25.0 or earlier, ensure this is set to at most 2
+# After upgrading all proxy servers, set this to 3 (currently the highest version)
+# meta_version_to_write = 3
+
+# Sets the root secret from which encryption keys are derived. This must be set
+# before first use to a value that is a base64 encoding of at least 32 bytes.
+# The security of all encrypted data critically depends on this key, therefore
+# it should be set to a high-entropy value. For example, a suitable value may
+# be obtained by base-64 encoding a 32 byte (or longer) value generated by a
+# cryptographically secure random number generator. Changing the root secret is
+# likely to result in data loss. If this option is set, the root secret MUST
+# NOT be set in proxy-server.conf.
+# encryption_root_secret = changeme
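+#
+# For example (one possible way to generate a suitable value; any
+# cryptographically secure source of 32 or more random bytes, base64
+# encoded, will do):
+#   openssl rand -base64 32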
+
+[kms_keymaster]
+# The kms_keymaster section is used for configuring a keymaster that retrieves
+# the encryption root secret from an external key management system (kms),
+# using the Castellan abstraction layer. Castellan can support various kms
+# backends that use Keystone for authentication. Currently, the only
+# implemented backend is for Barbican.
+
+# Over time, the format of crypto metadata on disk may change slightly to resolve
+# ambiguities. In general, you want to be writing the newest version, but to
+# ensure that all writes can still be read during rolling upgrades, there's the
+# option to write older formats as well.
+# Before upgrading from Swift 2.20.0 or earlier, ensure this is set to 1
+# Before upgrading from Swift 2.25.0 or earlier, ensure this is set to at most 2
+# After upgrading all proxy servers, set this to 3 (currently the highest version)
+# meta_version_to_write = 3
+
+# The api_class tells Castellan which key manager to use to access the external
+# key management system. The default value that accesses Barbican is
+# castellan.key_manager.barbican_key_manager.BarbicanKeyManager.
+# api_class = castellan.key_manager.barbican_key_manager.BarbicanKeyManager
+
+# The configuration options below apply to a Barbican KMS being accessed using
+# Castellan. If another KMS type is used (by specifying another value for
+# api_class), then other configuration options may be required.
+
+# The key_id is the identifier of the root secret stored in the KMS. For
+# details of how to store an existing root secret in Barbican, or how to
+# generate a new root secret in Barbican, see the 'overview_encryption'
+# documentation.
+# The key_id is the final part of the secret href returned in the
+# output of an 'openstack secret order get' command after an order to store or
+# create a key has been successfully completed. See the 'overview_encryption'
+# documentation for more information on this command.
+# key_id = changeme
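+#
+# For illustration only: if the secret href returned by
+# 'openstack secret order get' ended in
+# /v1/secrets/a3395887-1e02-4d8a-9fbc-3b4c5e4d8a21 (a made-up id), then
+# key_id would be set to a3395887-1e02-4d8a-9fbc-3b4c5e4d8a21.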
+
+# The Keystone username of the user used to access the key from the KMS. The
+# username shall be set to match an existing user.
+# username = changeme
+
+# The password to go with the Keystone username above.
+# password = changeme
+
+# The Keystone project name. For security reasons, it is recommended to set
+# the project_name to a project separate from the service project used by
+# other OpenStack services. That way, if another service is compromised, it will
+# not have access to the Swift root encryption secret. It is recommended that
+# the swift user is the only one that has a role in this project.
+# project_name = changeme
+# Instead of the project name, the project id may also be used.
+# project_id = changeme
+
+# The Keystone URL to authenticate to. The value of auth_endpoint may be
+# set according to the value of www_authenticate_uri in [filter:authtoken] in
+# proxy-server.conf.
+# auth_endpoint = http://keystonehost/identity
+
+# The project and user domain names may optionally be specified. If they are
+# not specified, the default values of 'Default' (for *_domain_name) and
+# 'default' (for *_domain_id) are used (note the capitalization).
+# project_domain_name = Default
+# user_domain_name = Default
+# Instead of the project domain name and user domain name, the project domain
+# id and user domain id may also be specified.
+# project_domain_id = default
+# user_domain_id = default
+
+# The following configuration options may also be used in addition to/instead
+# of the above options. Refer to the Keystone documentation for more details
+# on the usage of the options: https://docs.openstack.org/keystone/
+# user_id = changeme
+# trust_id = changeme
+# reauthenticate = changeme
+# domain_id = changeme
+# domain_name = changeme
+
+# If running on a multi-region cluster, Castellan may select the wrong
+# endpoint for Barbican. To avoid this, configure the region name for the
+# correct barbican endpoint, or specify the barbican endpoint explicitly.
+# If there is only a single Barbican service in your deployment, it is
+# fine to leave these unconfigured.
+# barbican_region_name =
+# barbican_endpoint =
+
+[kmip_keymaster]
+# The kmip_keymaster section is used to configure a keymaster that fetches an
+# encryption root secret from a KMIP service.
+
+# Over time, the format of crypto metadata on disk may change slightly to resolve
+# ambiguities. In general, you want to be writing the newest version, but to
+# ensure that all writes can still be read during rolling upgrades, there's the
+# option to write older formats as well.
+# Before upgrading from Swift 2.20.0 or earlier, ensure this is set to 1
+# Before upgrading from Swift 2.25.0 or earlier, ensure this is set to at most 2
+# After upgrading all proxy servers, set this to 3 (currently the highest version)
+# meta_version_to_write = 3
+
+# The value of the ``key_id`` option should be the unique identifier for a
+# secret that will be retrieved from the KMIP service. The secret should be an
+# AES-256 symmetric key.
+# key_id = 
+
+# The remaining options are used to configure a PyKMIP client and are shown
+# below for information. The authoritative definition of these options can be
+# found at: https://pykmip.readthedocs.io/en/latest/client.html.
+# host = 
+# port = 
+# certfile = /path/to/client/cert.pem
+# keyfile = /path/to/client/key.pem
+# ca_certs = /path/to/server/cert.pem
+# username = 
+# password = 
diff --git a/etc/magic b/etc/magic
new file mode 100644
index 0000000000..0bd2a65064
--- /dev/null
+++ b/etc/magic
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------------
+# Openstack swift
+# Note: add this snippet to either /etc/magic or ~/.magic
+#-------------------------------------------------------------------------------
+# gzip compressed
+0 beshort 0x1f8b
+# compress method: deflate, flags: FNAME
+>&0 beshort 0x0808
+# skip ahead another 6 (MTIME, XFL, OS); read FNAME
+>>&6 search/0x40 \0
+# Skip ahead five; should cover
+#   00    -- uncompressed block
+#   06 00 -- ... of length 6
+#   f9 ff -- (one's complement of length)
+>>>&5 string/4 R1NG     Swift ring,
+>>>>&0 clear x
+>>>>&0 beshort 1        version 1
+>>>>&0 beshort 2        version 2
+>>>>&0 default x
+>>>>>&0 beshort x       unknown version (0x%04x)
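+#
+# Example usage (illustrative): after adding this snippet to ~/.magic,
+# something like
+#   file /etc/swift/object.ring.gz
+# should include "Swift ring, version 1" (or version 2) in its description.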
diff --git a/etc/memcache.conf-sample b/etc/memcache.conf-sample
index 5ad48ab100..f85e49edc6 100644
--- a/etc/memcache.conf-sample
+++ b/etc/memcache.conf-sample
@@ -2,14 +2,53 @@
 # You can use this single conf file instead of having memcache_servers set in
 # several other conf files under [filter:cache] for example. You can specify
 # multiple servers separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211
+# (IPv6 addresses must follow rfc3986 section-3.2.2, i.e. [::1]:11211)
 # memcache_servers = 127.0.0.1:11211
 #
-# Sets how memcache values are serialized and deserialized:
-# 0 = older, insecure pickle serialization
-# 1 = json serialization but pickles can still be read (still insecure)
-# 2 = json serialization only (secure and the default)
-# To avoid an instant full cache flush, existing installations should
-# upgrade with 0, then set to 1 and reload, then after some time (24 hours)
-# set to 2 and reload.
-# In the future, the ability to use pickle serialization will be removed.
-# memcache_serialization_support = 2
+# Sets the maximum number of connections to each memcached server per worker
+# memcache_max_connections = 2
+#
+# Timeout for connection
+# connect_timeout = 0.3
+# Timeout for pooled connection
+# pool_timeout = 1.0
+# number of servers to retry on failures getting a pooled connection
+# tries = 3
+# Timeout for read and writes
+# io_timeout = 2.0
+#
+# How long without an error before a server's error count is reset. This will
+# also be how long before a server is reenabled after suppression is triggered.
+# Set to 0 to disable error-limiting.
+# error_suppression_interval = 60.0
+#
+# How many errors can accumulate before a server is temporarily ignored.
+# error_suppression_limit = 10
+#
+# (Optional) Global toggle for TLS usage when communicating with
+# the caching servers.
+# tls_enabled = false
+#
+# (Optional) Path to a file of concatenated CA certificates in PEM
+# format necessary to establish the caching server's authenticity.
+# If tls_enabled is False, this option is ignored.
+# tls_cafile =
+#
+# (Optional) Path to a single file in PEM format containing the
+# client's certificate as well as any number of CA certificates
+# needed to establish the certificate's authenticity. This file
+# is only required when client side authentication is necessary.
+# If tls_enabled is False, this option is ignored.
+# tls_certfile =
+#
+# (Optional) Path to a single file containing the client's private
+# key. Otherwise the private key will be taken from the file
+# specified in tls_certfile. If tls_enabled is False, this option
+# is ignored.
+# tls_keyfile =
+#
+# If an item size ever gets above item_size_warning_threshold then a warning will be
+# logged. This can be used to alert when memcache item sizes are getting to their limit.
+# It's an absolute size in bytes. Setting the value to 0 will warn on every memcache set.
+# A value of -1 disables the warning.
+# item_size_warning_threshold = -1
diff --git a/etc/object-expirer.conf-sample b/etc/object-expirer.conf-sample
index cef0f0f19d..109c0b5093 100644
--- a/etc/object-expirer.conf-sample
+++ b/etc/object-expirer.conf-sample
@@ -6,26 +6,118 @@
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
 # comma separated list of functions to call to setup custom log handlers.
 # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 # adapted_logger
 # log_custom_handlers =
+#
 # If set, log_udp_host will override log_address
 # log_udp_host =
 # log_udp_port = 514
+#
 # You can enable StatsD logging here:
-# log_statsd_host = localhost
+# log_statsd_host =
 # log_statsd_port = 8125
-# log_statsd_default_sample_rate = 1
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
 # log_statsd_metric_prefix =
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are realtime, best-effort and idle. I/O niceness
+# priority is a number which goes from 0 to 7. The higher the value, the lower
+# the I/O priority of the process. Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [object-expirer]
-# interval = 300
-# auto_create_account_prefix = .
-# report_interval = 300
+# interval = 300.0
+# report_interval = 300.0
+#
+# request_tries is the number of times the expirer's internal client will
+# attempt any given request in the event of failure. The default is 3.
+# request_tries = 3
+
+# concurrency is the level of concurrency to use to do the work; this value
+# must be set to at least 1
+# concurrency = 1
+#
+# deletes can be ratelimited to prevent the expirer from overwhelming the cluster
+# tasks_per_second = 50.0
+#
+# processes is how many parts to divide the work into, one part per process
+# that will be doing the work
+# Setting processes to 0 means that a single process will do all the work
+# processes can also be specified on the command line and will override the
+# config value
+# processes = 0
+#
+# process is which of the parts a particular process will work on
+# process can also be specified on the command line and will override the config
+# value
+# process is "zero based", if you want to use 3 processes, you should run
+# processes with process set to 0, 1, and 2
+# process = 0
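+#
+# For example, to divide the work among three expirer daemons (illustrative
+# values), run each daemon with:
+#   processes = 3
+#   process = 0    # and 1 and 2, respectively, on the other two daemons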
+#
+# The expirer will re-attempt expiring if the source object is not available
+# up to reclaim_age seconds before it gives up and deletes the entry in the
+# queue.
+# reclaim_age = 604800
+#
+# The expirer can delay the reaping of expired objects on disk (and in
+# container listings) with an account level or container level delay_reaping
+# time.
+# After the delay_reaping time has passed objects will be reaped as normal.
+# You may configure this delay_reaping value in seconds with dynamic config
+# option names prefixed with delay_reaping_<ACCT> for account level delays
+# and delay_reaping_<ACCT>/<CNTR> for container level delays.
+# Special characters in <ACCT> or <CNTR> should be quoted.
+# The delay_reaping value should be a float value greater than or equal to
+# zero.
+# A container level delay_reaping does not require an account level
+# delay_reaping but overrides the account level delay_reaping for the same
+# account if it exists.
+# For example:
+# delay_reaping_AUTH_test = 300.0
+# delay_reaping_AUTH_test2 = 86400.0
+# delay_reaping_AUTH_test/test = 400.0
+# delay_reaping_AUTH_test/test2 = 600.0
+# delay_reaping_AUTH_test/special%0Achars%3Dshould%20be%20quoted
+# N.B. By default no delay_reaping value is configured for any accounts or
+# containers.
+
+# Number of task objects to cache before processing.  With many nodes it may
+# take some time to fill a larger cache_size but may also have a better chance
+# to distribute DELETEs to multiple target containers.
+# round_robin_task_cache_size = 100000
+
+# recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are realtime, best-effort and idle. I/O niceness
+# priority is a number which goes from 0 to 7. The higher the value, the lower
+# the I/O priority of the process. Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+
+#
+# The following sections define the configuration of the expirer's internal
+# client pipeline
+#
 
 [pipeline:main]
-pipeline = catch_errors cache proxy-server
+pipeline = catch_errors proxy-logging cache proxy-server
 
 [app:proxy-server]
 use = egg:swift#proxy
@@ -38,3 +130,7 @@ use = egg:swift#memcache
 [filter:catch_errors]
 use = egg:swift#catch_errors
 # See proxy-server.conf-sample for options
+
+[filter:proxy-logging]
+use = egg:swift#proxy_logging
+# See proxy-server.conf-sample for options
diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample
index 2e8b54f4eb..dd6bc21866 100644
--- a/etc/object-server.conf-sample
+++ b/etc/object-server.conf-sample
@@ -1,36 +1,121 @@
 [DEFAULT]
 # bind_ip = 0.0.0.0
-# bind_port = 6000
+bind_port = 6200
+# keep_idle = 600
 # bind_timeout = 30
 # backlog = 4096
-# workers = 1
 # user = swift
 # swift_dir = /etc/swift
 # devices = /srv/node
 # mount_check = true
 # disable_fallocate = false
-# expiring_objects_container_divisor = 86400
+#
+# Use an integer to override the number of pre-forked processes that will
+# accept connections.  NOTE: if servers_per_port is set, this setting is
+# ignored.
+# workers = auto
+#
+# Make object-server run this many worker processes per unique port of "local"
+# ring devices across all storage policies. The default value of 0 disables this
+# feature.
+# servers_per_port = 0
+#
+# If running in a container, servers_per_port may not be able to use the
+# bind_ip to lookup the ports in the ring.  You may instead override the port
+# lookup in the ring using the ring_ip.  Any devices/ports associated with the
+# ring_ip will be used when listening on the configured bind_ip address.
+# ring_ip = 
+#
+# Maximum concurrent requests per worker
+# max_clients = 1024
+#
 # You can specify default log routing here if you want:
 # log_name = swift
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
+# Hashing algorithm for log anonymization. Must be one of algorithms supported
+# by Python's hashlib.
+# log_anonymization_method = MD5
+#
+# Salt added during log anonymization
+# log_anonymization_salt =
+#
+# Template used to format logs. All words surrounded by curly brackets
+# will be substituted with the appropriate values
+# log_format = {remote_addr} - - [{time.d}/{time.b}/{time.Y}:{time.H}:{time.M}:{time.S} +0000] "{method} {path}" {status} {content_length} "{referer}" "{txn_id}" "{user_agent}" {trans_time:.4f} "{additional_info}" {pid} {policy_index}
+#
 # comma separated list of functions to call to setup custom log handlers.
 # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 # adapted_logger
 # log_custom_handlers =
+#
 # If set, log_udp_host will override log_address
 # log_udp_host =
 # log_udp_port = 514
+#
 # You can enable StatsD logging here:
-# log_statsd_host = localhost
+# log_statsd_host =
 # log_statsd_port = 8125
-# log_statsd_default_sample_rate = 1
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
 # log_statsd_metric_prefix =
+#
 # eventlet_debug = false
+#
+# You can set fallocate_reserve to the number of bytes or percentage of disk
+# space you'd like fallocate to reserve, whether there is space for the given
+# file size or not. Percentage will be used if the value ends with a '%'.
+# fallocate_reserve = 1%
+#
+# Time to wait while attempting to connect to another backend node.
+# conn_timeout = 0.5
+# Time to wait while sending each chunk of data to another backend node.
+# node_timeout = 3
+# Time to wait while sending a container update on object update.
+# container_update_timeout = 1.0
+# Time to wait while receiving each chunk of data from a client or another
+# backend node.
+# client_timeout = 60.0
+#
+# network_chunk_size = 65536
+# disk_chunk_size = 65536
+#
+# Reclamation of tombstone files is performed primarily by the replicator and
+# the reconstructor but the object-server and object-auditor also reference
+# this value - it should be the same for all object services in the cluster,
+# and not greater than the container services reclaim_age
+# reclaim_age = 604800
+#
+# Non-durable data files may also get reclaimed if they are older than
+# reclaim_age, but not if the time they were written to disk (i.e. mtime) is
+# less than commit_window seconds ago. The commit_window also prevents the
+# reconstructor removing recently written non-durable data files from a handoff
+# node after reverting them to a primary. This gives the object-server a window
+# in which to finish a concurrent PUT on a handoff and mark the data durable. A
+# commit_window greater than zero is strongly recommended to avoid unintended
+# removal of data files that were about to become durable; commit_window should
+# be much less than reclaim_age.
+# commit_window = 60.0
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [pipeline:main]
-pipeline = healthcheck recon object-server
+pipeline = healthcheck recon backend_ratelimit object-server
 
 [app:object-server]
 use = egg:swift#object
@@ -38,26 +123,132 @@ use = egg:swift#object
 # set log_name = object-server
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_requests = True
+# set log_requests = true
 # set log_address = /dev/log
-# node_timeout = 3
-# conn_timeout = 0.5
-# network_chunk_size = 65536
-# disk_chunk_size = 65536
+#
 # max_upload_time = 86400
+#
+# slow is the minimum total number of seconds an object PUT/DELETE request
+# will take. If the request finishes faster, the object server sleeps for this
+# amount of time minus the transaction time that has already passed.  This is
+# only useful for simulating slow devices on storage nodes during testing and
+# development.
 # slow = 0
+#
 # Objects smaller than this are not evicted from the buffercache once read
-# keep_cache_size = 5424880
+# keep_cache_size = 5242880
+#
 # If true, objects for authenticated GET requests may be kept in buffer cache
 # if small enough
-# keep_cache_private = False
+# keep_cache_private = false
+#
+# If true, an SLO object's manifest file may be kept in buffer cache for GET
+# requests if it is smaller than 'keep_cache_size'. This config only matters
+# when 'keep_cache_private' is false.
+# keep_cache_slo_manifest = false
+#
+# cooperative_period defines how frequently object server GET/PUT requests
+# will perform cooperative yielding while iterating over disk chunks. For
+# example, a value of '5' will insert one sleep() after every 5 disk_chunk_size
+# chunk reads/writes. A value of '0' (the default) will turn off cooperative
+# yielding.
+# cooperative_period = 0
+#
+# By default, the object-server will always validate the MD5 of object data
+# while streaming a complete object response. Occasionally this is identified
+# as a CPU bottleneck, consuming as much as 40% of the CPU time of the
+# object-server. Since range-request-heavy clients don't get these integrity
+# checks, it seems reasonable to give operators a chance to tune it down and
+# instead rely on the object-auditor to detect and quarantine corrupted objects.
+# etag_validate_pct = 100
+#
 # on PUTs, sync data every n MB
 # mb_per_sync = 512
+#
 # Comma separated list of headers that can be set in metadata on an object.
 # This list is in addition to X-Object-Meta-* headers and cannot include
 # Content-Type, etag, Content-Length, or deleted
-# allowed_headers = Content-Disposition, Content-Encoding, X-Delete-At, X-Object-Manifest
-# auto_create_account_prefix = .
+# allowed_headers = Content-Disposition, Content-Encoding, X-Delete-At, X-Object-Manifest, X-Static-Large-Object, Cache-Control, Content-Language, Expires, X-Robots-Tag
+
+# The number of threads in eventlet's thread pool. Most IO will occur
+# in the object server's main thread, but certain "heavy" IO
+# operations will occur in separate IO threads, managed by eventlet.
+#
+# The default value is auto, whose actual value is dependent on the
+# servers_per_port value:
+#
+#  - When servers_per_port is zero, the default value of
+#    eventlet_tpool_num_threads is empty, which uses eventlet's default
+#    (currently 20 threads).
+#
+#  - When servers_per_port is nonzero, the default value of
+#    eventlet_tpool_num_threads is 1.
+#
+# But you may override this value to any integer value.
+#
+# Note that this value is threads per object-server process, so to
+# compute the total number of IO threads on a node, you must multiply
+# this by the number of object-server processes on the node.
+#
+# eventlet_tpool_num_threads = auto
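+#
+# As a worked example: a node running 4 object-server processes with
+# eventlet_tpool_num_threads = 2 would have up to 4 * 2 = 8 of these IO
+# threads in total (illustrative numbers).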
+
+# You can disable REPLICATE and SSYNC handling (default is to allow it). When
+# deploying a cluster with a separate replication network, you'll want multiple
+# object-server processes running: one for client-driven traffic and another
+# for replication traffic. The server handling client-driven traffic may set
+# this to false. If there is only one object-server process, leave this as
+# true.
+# replication_server = true
+#
+# Set to restrict the number of concurrent incoming SSYNC requests
+# Set to 0 for unlimited
+# Note that SSYNC requests are only used by the object reconstructor or the
+# object replicator when configured to use ssync.
+# replication_concurrency = 4
+#
+# Set to restrict the number of concurrent incoming SSYNC requests per
+# device; set to 0 for unlimited requests per device. This can help control
+# I/O to each device. This does not override replication_concurrency described
+# above, so you may need to adjust both parameters depending on your hardware
+# or network capacity.
+# replication_concurrency_per_device = 1
+#
+# Number of seconds to wait for an existing replication device lock before
+# giving up.
+# replication_lock_timeout = 15
+#
+# These next two settings control when the SSYNC subrequest handler will
+# abort an incoming SSYNC attempt. An abort will occur if there are at
+# least threshold number of failures and the value of failures / successes
+# exceeds the ratio. The defaults of 100 and 1.0 mean that at least 100
+# failures have to occur and there have to be more failures than successes for
+# an abort to occur.
+# replication_failure_threshold = 100
+# replication_failure_ratio = 1.0
+#
+# Use splice() for zero-copy object GETs. This requires Linux kernel
+# version 3.0 or greater. If you set "splice = yes" but the kernel
+# does not support it, error messages will appear in the object server
+# logs at startup, but your object servers should continue to function.
+#
+# splice = no
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# When reloading servers with SIGUSR1, workers running with old config/code
+# are allowed some time to finish serving in-flight requests. Use this to
+# configure the grace period (in seconds), after which the reloaded server
+# will issue SIGKILLs to remaining stale workers.
+# stale_worker_timeout = 86400
 
 [filter:healthcheck]
 use = egg:swift#healthcheck
@@ -70,43 +261,315 @@ use = egg:swift#recon
 #recon_cache_path = /var/cache/swift
 #recon_lock_path = /var/lock
 
+[filter:backend_ratelimit]
+use = egg:swift#backend_ratelimit
+# Config options can optionally be loaded from a separate config file. Config
+# options in this section will be used unless the same option is found in the
+# config file, in which case the config file option will be used. See the
+# backend-ratelimit.conf-sample file for details of available config options.
+# backend_ratelimit_conf_path = /etc/swift/backend-ratelimit.conf
+
+# The minimum interval between attempts to reload any config file at
+# backend_ratelimit_conf_path while the server is running. A value of 0 means
+# that the file is loaded at start-up but not subsequently reloaded. Note that
+# config options in this section are never reloaded after start-up.
+# config_reload_interval = 60
+
 [object-replicator]
 # You can override the default log routing for this app here (don't use set!):
 # log_name = object-replicator
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# vm_test_mode = no
+#
 # daemonize = on
-# run_pause = 30
+#
+# Time in seconds to wait between replication passes
+# interval = 30.0
+# run_pause is deprecated, use interval instead
+# run_pause = 30.0
+#
+# Number of concurrent replication jobs to run. This is per-process,
+# so replicator_workers=W and concurrency=C will result in W*C
+# replication jobs running at once.
 # concurrency = 1
-# stats_interval = 300
+#
+# Number of worker processes to use. No matter how big this number is,
+# at most one worker per disk will be used. 0 means no forking; all work
+# is done in the main process.
+# replicator_workers = 0
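+#
+# For example (illustrative values): replicator_workers = 2 with
+# concurrency = 4 allows up to 2 * 4 = 8 replication jobs at once.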
+#
+# stats_interval = 300.0
+#
+# default is rsync, alternative is ssync
+# sync_method = rsync
+#
 # max duration of a partition rsync
 # rsync_timeout = 900
-# passed to rsync for io op timeout
+#
+# bandwidth limit for rsync in kB/s. 0 means unlimited. rsync 3.2.2 and later
+# accept suffixed values like 10M or 1.5G; see the --bwlimit option for rsync(1)
+# rsync_bwlimit = 0
+#
+# passed to rsync for both io op timeout and connection timeout
 # rsync_io_timeout = 30
-# max duration of an http request
+#
+# Allow rsync to compress data which is transmitted to destination node
+# during sync. However, this is applicable only when destination node is in
+# a different region than the local one.
+# NOTE: Objects that are already compressed (for example: .tar.gz, .mp3) might
+# slow down the syncing process.
+# rsync_compress = no
+#
+# Format of the rsync module where the replicator will send data. See
+# etc/rsyncd.conf-sample for some usage examples.
+# rsync_module = {replication_ip}::object
+#
+# node_timeout = 
+# max duration of an http request; this is for REPLICATE finalization calls and
+# so should be longer than node_timeout
 # http_timeout = 60
+#
 # attempts to kill all workers if nothing replicates for lockup_timeout seconds
 # lockup_timeout = 1800
-# The replicator also performs reclamation
-# reclaim_age = 604800
-# ring_check_interval = 15
+#
+# ring_check_interval = 15.0
 # recon_cache_path = /var/cache/swift
+#
+# By default, per-file rsync transfers are logged at debug if successful and
+# error on failure. During large rebalances (which both increase the number
+# of diskfiles transferred and increases the likelihood of failures), this
+# can overwhelm log aggregation while providing little useful insights.
+# Change this to false to disable per-file logging.
+# log_rsync_transfers = true
+#
+# limits how long rsync error log lines are
+# 0 means to log the entire line
+# rsync_error_log_line_length = 0
+#
+# handoffs_first and handoff_delete are options for special cases
+# such as full disks in the cluster. These two options SHOULD NOT BE
+# CHANGED, except in such extreme situations (e.g. disks filled up
+# or about to fill up. Anyway, DO NOT let your drives fill up).
+# handoffs_first is the flag to replicate handoffs prior to canonical
+# partitions. It allows forcing the syncing and deleting of handoffs quickly.
+# If set to a True value (e.g. "True" or "1"), partitions
+# that are not supposed to be on the node will be replicated first.
+# handoffs_first = False
+#
+# handoff_delete is the number of replicas which are ensured in swift.
+# If a number less than the number of replicas is set, the object-replicator
+# may delete local handoffs even though not all replicas are ensured in the
+# cluster: the object-replicator will remove local handoff partition directories
+# after syncing a partition when the number of successful responses is greater
+# than or equal to this number. By default (auto), handoff partitions will only
+# be removed once they have successfully replicated to all the canonical nodes.
+# handoff_delete = auto
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Work only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
-[object-updater]
+[object-reconstructor]
 # You can override the default log routing for this app here (don't use set!):
-# log_name = object-updater
+# Unless otherwise noted, each setting below has the same meaning as described
+# in the [object-replicator] section, however these settings apply to the EC
+# reconstructor
+#
+# log_name = object-reconstructor
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
-# interval = 300
+#
+# daemonize = on
+#
+# Time in seconds to wait between reconstruction passes
+# interval = 30.0
+# run_pause is deprecated, use interval instead
+# run_pause = 30.0
+#
+# Maximum number of worker processes to spawn.  Each worker will handle a
+# subset of devices.  Devices will be assigned evenly among the workers so that
+# workers cycle at similar intervals (which can lead to fewer workers than
+# requested).  You cannot have more workers than devices.  If you have no
+# devices only a single worker is spawned.
+# reconstructor_workers = 0
+#
 # concurrency = 1
+# stats_interval = 300.0
 # node_timeout = 10
-# conn_timeout = 0.5
-# slowdown will sleep that amount between objects
+# http_timeout = 60
+# lockup_timeout = 1800
+# ring_check_interval = 15.0
+# recon_cache_path = /var/cache/swift
+#
+# The handoffs_only mode option is for special case emergency situations during
+# rebalance such as disk full in the cluster.  This option SHOULD NOT BE
+# CHANGED, except for extreme situations.  When handoffs_only mode is enabled
+# the reconstructor will *only* revert fragments from handoff nodes to primary
+# nodes and will not sync primary nodes with neighboring primary nodes.  This
+# will force the reconstructor to sync and delete handoffs' fragments more
+# quickly and minimize the time of the rebalance by limiting the number of
+# rebuilds.  The handoffs_only option is only for temporary use and should be
+# disabled as soon as the emergency situation has been resolved.  When
+# handoffs_only is not set, the deprecated handoffs_first option will be
+# honored as a synonym, but may be ignored in a future release.
+# handoffs_only = False
+#
+# The default strategy for unmounted drives will stage rebuilt data on a
+# handoff node until updated rings are deployed.  Because fragments are rebuilt
+# on offset handoffs based on fragment index and the proxy limits how deep it
+# will search for EC frags, we restrict how many nodes we'll try.  Setting to 0
+# will disable rebuilds to handoffs and only rebuild fragments for unmounted
+# devices to mounted primaries after a ring change.
+# Setting to -1 means "no limit".
+# rebuild_handoff_node_count = 2
+#
+# By default the reconstructor attempts to revert all objects from handoff
+# partitions in a single batch using a single SSYNC request. In exceptional
+# circumstances max_objects_per_revert can be used to temporarily limit the
+# number of objects reverted by each reconstructor revert type job. If more
+# than max_objects_per_revert are available in a sender's handoff partition,
+# the remaining objects will remain in the handoff partition and will not be
+# reverted until the next time the reconstructor visits that handoff partition
+# i.e. with this option set, a single cycle of the reconstructor may not
+# completely revert all handoff partitions. The option has no effect on
+# reconstructor sync type jobs between primary partitions. A value of 0 (the
+# default) means there is no limit.
+# max_objects_per_revert = 0
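+#
+# For example (a hypothetical temporary setting):
+# max_objects_per_revert = 1000
+# would cap each revert type job at 1000 objects per cycle; any remaining
+# objects in that handoff partition wait for a later reconstructor pass.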
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# When upgrading from liberasurecode<=1.5.0, you may want to continue writing
+# legacy CRCs until all nodes are upgraded and capable of reading fragments
+# with zlib CRCs. liberasurecode>=1.6.2 checks for the environment variable
+# LIBERASURECODE_WRITE_LEGACY_CRC; if set (value doesn't matter), it will use
+# its legacy CRC. Set this option to true or false to ensure the environment
+# variable is or is not set. Leave the option blank or absent to not touch
+# the environment (default). For more information, see
+# https://bugs.launchpad.net/liberasurecode/+bug/1886088
+# write_legacy_ec_crc =
+#
+# When attempting to reconstruct a missing fragment on another node from a
+# fragment on the local node, the reconstructor may fail to fetch sufficient
+# fragments to reconstruct the missing fragment. This may be because most or
+# all of the remote fragments have been deleted, and the local fragment is
+# stale, in which case the reconstructor will never succeed in reconstructing
+# the apparently missing fragment and will log errors. If the object's
+# tombstones have been reclaimed then the stale fragment will never be deleted
+# (see https://bugs.launchpad.net/swift/+bug/1655608). If an operator suspects
+# that stale fragments have been re-introduced to the cluster and is seeing
+# error logs similar to those in the bug report, then the quarantine_threshold
+# option may be set to a value greater than zero. This enables the
+# reconstructor to quarantine the stale fragments when it fails to fetch more
+# than the quarantine_threshold number of fragments (including the stale
+# fragment) during an attempt to reconstruct. For example, setting the
+# quarantine_threshold to 1 would cause a fragment to be quarantined if no
+# other fragments can be fetched. The value may be reset to zero after the
+# reconstructor has run on all affected nodes and the error logs are no longer
+# seen.
+# Note: the quarantine_threshold applies equally to all policies, but for each
+# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never
+# quarantined when sufficient fragments exist to reconstruct the object.
+# quarantine_threshold = 0
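+#
+# For example (hypothetical policy): with an EC policy configured with
+# ec_num_data_fragments = 4, a setting of
+# quarantine_threshold = 5
+# is effectively capped at 3 (ec_ndata - 1), so a fragment is never
+# quarantined while enough fragments remain to reconstruct the object.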
+#
+# Fragments are not quarantined until they are older than
+# quarantine_age, which defaults to the value of reclaim_age.
+# quarantine_age =
+#
+# Sets the maximum number of nodes to which requests will be made before
+# quarantining a fragment. You can use '* replicas' at the end to have it use
+# the number given times the number of replicas for the ring being used for the
+# requests. The minimum number of nodes to which requests are made is the
+# number of replicas for the policy minus 1 (the node on which the fragment is
+# to be rebuilt). The minimum is only exceeded if request_node_count is
+# greater, and only for the purposes of quarantining.
+# request_node_count = 2 * replicas
+
+[object-updater]
+# You can override the default log routing for this app here (don't use set!):
+# log_name = object-updater
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# interval = 300.0
+# node_timeout = <whatever's in the DEFAULT section or 10>
+#
+# updater_workers controls how many processes the object updater will
+# spawn, while concurrency controls how many async_pending records
+# each updater process will operate on at any one time. With
+# concurrency=C and updater_workers=W, there will be up to W*C
+# async_pending records being processed at once.
+# concurrency = 8
+# updater_workers = 1
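+#
+# For example (hypothetical numbers):
+# concurrency = 8
+# updater_workers = 2
+# would allow up to 16 (W*C) async_pending records to be in flight at once.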
+#
+# Send at most this many object updates per second
+# objects_per_second = 50
+#
+# Send at most this many object updates per bucket per second. The value must
+# be a float greater than or equal to 0. Set to 0 for unlimited.
+# max_objects_per_container_per_second = 0
+#
+# The per_container ratelimit implementation uses a hashring to constrain
+# memory requirements.  Orders of magnitude more buckets will use (nominally)
+# more memory, but will ratelimit smaller groups of containers. The value must
+# be an integer greater than 0.
+# per_container_ratelimit_buckets = 1000
+#
+# Updates that cannot be sent due to per-container rate-limiting may be
+# deferred and re-tried at the end of the updater cycle. This option constrains
+# the size of the in-memory data structure used to store deferred updates.
+# Must be an integer value greater than or equal to 0.
+# max_deferred_updates = 10000
+#
+# Maximum number of oldest async pending timestamps to track for each
+# account-container pair.
+# async_tracker_max_entries = 100
+# Maximum number of oldest async pending timestamps to dump to recon cache.
+# async_tracker_dump_count = 5
+#
+# slowdown will sleep that amount between objects. Deprecated; use
+# objects_per_second instead.
 # slowdown = 0.01
+#
+# Log stats (at INFO level) every report_interval seconds. This
+# logging is per-process, so with concurrency > 1, the logs will
+# contain one stats log per worker process every report_interval
+# seconds.
+# report_interval = 300.0
+#
 # recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [object-auditor]
 # You can override the default log routing for this app here (don't use set!):
@@ -114,8 +577,230 @@ use = egg:swift#recon
 # log_facility = LOG_LOCAL0
 # log_level = INFO
 # log_address = /dev/log
+#
+# Time in seconds to wait between auditor passes
+# interval = 30.0
+#
+# You can set the disk chunk size that the auditor uses, making it larger if
+# you like for more efficient local auditing of larger objects.
+# disk_chunk_size = 65536
 # files_per_second = 20
+# concurrency = 1
 # bytes_per_second = 10000000
 # log_time = 3600
 # zero_byte_files_per_second = 50
 # recon_cache_path = /var/cache/swift
+
+# Takes a comma separated list of ints. If set, the object auditor will
+# increment a counter for every object whose size is less than or equal to
+# the given break points and report the result after a full scan.
+# object_size_stats =
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
+
+# The auditor will clean up old rsync tempfiles after they are "old
+# enough" to delete.  You can configure the time elapsed in seconds
+# before rsync tempfiles will be unlinked, or the default value of
+# "auto" will try to use object-replicator's rsync_timeout + 900 and fall
+# back to 86400 (1 day).
+# rsync_tempfile_timeout = auto
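+#
+# For example, if the [object-replicator] section sets rsync_timeout = 900,
+# then
+# rsync_tempfile_timeout = auto
+# resolves to 1800 seconds (900 + 900); if that value cannot be determined,
+# the auditor falls back to 86400 (1 day).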
+
+# A comma-separated list of watcher entry points. This lets operators
+# programmatically see audited objects.
+#
+# The entry point group name is "swift.object_audit_watcher". If your
+# setup.py has something like this:
+#
+# entry_points={'swift.object_audit_watcher': [
+#     'some_watcher = some_module:Watcher']}
+#
+# then you would enable it with "watchers = some_package#some_watcher".
+# For example, the built-in reference implementation is enabled as
+# "watchers = swift#dark_data".
+#
+# watchers =
+
+# Watcher-specific parameters can be added in a section with a name
+# [object-auditor:watcher:some_package#some_watcher]. The following
+# example uses the built-in reference watcher.
+#
+# [object-auditor:watcher:swift#dark_data]
+#
+# Action type can be 'log' (default), 'delete', or 'quarantine'.
+# action=log
+#
+# The watcher ignores objects younger than a certain minimum age.
+# This prevents spurious actions upon fresh objects while container
+# listings eventually settle.
+# grace_age=604800
+
+[object-expirer]
+# If this is true, this expirer will execute tasks from the legacy expirer task
+# queue; at least one object server should run with dequeue_from_legacy = true
+# dequeue_from_legacy = false
+#
+# Note: Be careful not to enable ``dequeue_from_legacy`` on too many expirers
+# as all legacy tasks are stored in a single hidden account and the same hidden
+# containers. On a large cluster one may inadvertently make the
+# account/container servers for these hidden resources too busy.
+#
+# Note: the processes and process options can only be used in conjunction with
+# nodes using `dequeue_from_legacy = true`.  These options are ignored on nodes
+# with `dequeue_from_legacy = false`.
+#
+# processes is how many parts to divide the legacy work into, one part per
+# process that will be doing the work
+# Setting processes to 0 means that a single legacy process will do all the work
+# processes can also be specified on the command line and will override the
+# config value
+# processes = 0
+#
+# process is which of the parts a particular legacy process will work on
+# process can also be specified on the command line and will override the config
+# value
+# process is "zero based", if you want to use 3 processes, you should run
+# processes with process set to 0, 1, and 2
+# process = 0
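+#
+# For example (hypothetical deployment): to split the legacy queue across
+# three expirer daemons, each daemon would set
+# processes = 3
+# and one of
+# process = 0
+# process = 1
+# process = 2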
+#
+# internal_client_conf_path = /etc/swift/internal-client.conf
+#
+# You can override the default log routing for this app here (don't use set!):
+# log_name = object-expirer
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# interval = 300.0
+#
+# report_interval = 300.0
+#
+# request_tries is the number of times the expirer's internal client will
+# attempt any given request in the event of failure. The default is 3.
+# request_tries = 3
+#
+# concurrency is the level of concurrency to use to do the work; this value
+# must be set to at least 1
+# concurrency = 1
+#
+# deletes can be ratelimited to prevent the expirer from overwhelming the cluster
+# tasks_per_second = 50.0
+#
+# The expirer will re-attempt expiring if the source object is not available
+# up to reclaim_age seconds before it gives up and deletes the entry in the
+# queue.
+# reclaim_age = 604800
+
+# Number of task objects to cache before processing.  With many nodes it may
+# take some time to fill a larger cache_size but may also have a better chance
+# to distribute DELETEs to multiple target containers.
+# round_robin_task_cache_size = 100000
+
+# recon_cache_path = /var/cache/swift
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are realtime, best-effort and idle. I/O niceness
+# priority is a number which goes from 0 to 7. The higher the value, the lower
+# the I/O priority of the process. Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# The expirer can delay the reaping of expired objects on disk (and in
+# container listings) with an account level or container level delay_reaping
+# time.
+# After the delay_reaping time has passed objects will be reaped as normal.
+# You may configure this delay_reaping value in seconds with dynamic config
+# option names prefixed with delay_reaping_<ACCT> for account level delays
+# and delay_reaping_<ACCT>/<CNTR> for container level delays.
+# Special characters in <ACCT> or <CNTR> should be quoted.
+# The delay_reaping value should be a float value greater than or equal to
+# zero.
+# A container level delay_reaping does not require an account level
+# delay_reaping but overrides the account level delay_reaping for the same
+# account if it exists.
+# For example:
+# delay_reaping_AUTH_test = 300.0
+# delay_reaping_AUTH_test2 = 86400.0
+# delay_reaping_AUTH_test/test = 400.0
+# delay_reaping_AUTH_test/test2 = 600.0
+# delay_reaping_AUTH_test/special%0Achars%3Dshould%20be%20quoted
+# N.B. By default no delay_reaping value is configured for any accounts or
+# containers.
+
+# Note: Put it at the beginning of the pipeline to profile all middleware. But
+# it is safer to put this after healthcheck. Not intended for production
+# environments!
+[filter:xprofile]
+use = egg:swift#xprofile
+# This option enables you to switch profilers; the profiler should inherit from
+# the python standard profiler. Currently the supported values include
+# 'cProfile', 'eventlet.green.profile', etc.
+# profile_module = eventlet.green.profile
+#
+# This prefix will be used to combine process ID and timestamp to name the
+# profile data file.  Make sure the executing user has permission to write
+# into this path (missing path segments will be created, if necessary).
+# If you enable profiling in more than one type of daemon, you must override
+# it with a unique value like: /var/log/swift/profile/object.profile
+# log_filename_prefix = /tmp/log/swift/profile/default.profile
+#
+# The profile data will be dumped to local disk based on the above naming rule
+# at this interval.
+# dump_interval = 5.0
+#
+# Be careful: this option will make the profiler dump data into timestamped
+# files, which means lots of files will pile up in the directory.
+# dump_timestamp = false
+#
+# This is the path of the URL to access the mini web UI.
+# path = /__profile__
+#
+# Clear the data when the wsgi server shuts down.
+# flush_at_shutdown = false
+#
+# unwind the iterator of applications
+# unwind = false
+
+[object-relinker]
+# You can override the default log routing for this app here (don't use set!):
+# log_name = object-relinker
+# log_facility = LOG_LOCAL0
+# log_level = INFO
+# log_address = /dev/log
+#
+# Start up to this many sub-processes to process disks in parallel. Each disk
+# will be handled by at most one child process. By default, one process is
+# spawned per disk.
+# workers = auto
+#
+# Target this many relinks/cleanups per second for each worker, to reduce the
+# likelihood that the added I/O from a partition-power increase impacts
+# client traffic. Use zero for unlimited.
+# files_per_second = 0.0
+#
+# stats_interval = 300.0
+# recon_cache_path = /var/cache/swift
+#
+# Highly concurrent PUTs can result in timestamp collisions; see
+# https://bugs.launchpad.net/swift/+bug/1971686 and
+# https://bugs.launchpad.net/swift/+bug/2127779 for more information.
+# This can lead to relinker errors which may be benign; use this option to
+# handle such errors.  With this option enabled, during the relink phase
+# we'll quarantine the colliding file in the new target part dir and retry
+# the relink.  During the cleanup phase we ignore the un-matched inode
+# "collision" and allow the cleanup of the old file in the old part dir,
+# the same as for tombstones.
+# clobber_hardlink_collisions = false
diff --git a/etc/proxy-server.conf-sample b/etc/proxy-server.conf-sample
index 7a9254de37..5caa0d630b 100644
--- a/etc/proxy-server.conf-sample
+++ b/etc/proxy-server.conf-sample
@@ -1,39 +1,168 @@
 [DEFAULT]
 # bind_ip = 0.0.0.0
-# bind_port = 80
+bind_port = 8080
+# Connection idle timeout (in seconds)
+# keep_idle = 600
+# Socket bind timeout (in seconds)
 # bind_timeout = 30
 # backlog = 4096
 # swift_dir = /etc/swift
-# workers = 1
 # user = swift
+
+# Enables exposing configuration settings via HTTP GET /info.
+# expose_info = true
+
+# Key to use for admin calls that are HMAC signed.  Default is empty,
+# which will disable admin calls to /info.
+# admin_key = secret_admin_key
+#
+# Allows withholding sections from showing up in the public calls
+# to /info.  You can withhold subsections by separating the dict level with a
+# ".". Default value is 'swift.valid_api_versions, swift.auto_create_account_prefix'
+# which allows all registered features to be listed via HTTP GET /info except
+# swift.valid_api_versions and swift.auto_create_account_prefix information.
+# As an example, the following would cause the sections 'container_quotas' and
+# 'tempurl' to not be listed, and the key max_failed_deletes would be removed from
+# bulk_delete.
+# disallowed_sections = swift.valid_api_versions, container_quotas, tempurl, bulk_delete.max_failed_deletes
+
+# Use an integer to override the number of pre-forked processes that will
+# accept connections.  Should default to the number of effective cpu
+# cores in the system.  It's worth noting that individual workers will
+# use many eventlet co-routines to service multiple concurrent requests.
+# workers = auto
+#
+# Maximum concurrent requests per worker
+# max_clients = 1024
+#
 # Set the following two lines to enable SSL. This is for testing only.
 # cert_file = /etc/swift/proxy.crt
 # key_file = /etc/swift/proxy.key
-# expiring_objects_container_divisor = 86400
+#
+#
 # You can specify default log routing here if you want:
 # log_name = swift
 # log_facility = LOG_LOCAL0
 # log_level = INFO
-# log_headers = False
+# log_headers = false
 # log_address = /dev/log
+# The following caps the length of log lines to the value given; no limit if
+# set to 0, the default.
+# log_max_line_length = 0
+#
+# This optional suffix (default is empty) is appended to the swift transaction
+# id and allows one to easily figure out which cluster an X-Trans-Id belongs to.
+# This is very useful when one is managing more than one swift cluster.
+# trans_id_suffix =
+#
 # comma separated list of functions to call to setup custom log handlers.
 # functions get passed: conf, name, log_to_console, log_route, fmt, logger,
 # adapted_logger
 # log_custom_handlers =
+#
 # If set, log_udp_host will override log_address
 # log_udp_host =
 # log_udp_port = 514
+#
 # You can enable StatsD logging here:
-# log_statsd_host = localhost
+# log_statsd_host =
 # log_statsd_port = 8125
-# log_statsd_default_sample_rate = 1
+# log_statsd_default_sample_rate = 1.0
+# log_statsd_sample_rate_factor = 1.0
 # log_statsd_metric_prefix =
-# Use a comma separated list of full url (http://foo.bar:1234,https://foo.bar)
+#
+# Statsd metrics may include labeling information in a variety of formats.
+# Available options:
+#     disabled, dogstatsd, graphite, influxdb, librato.
+# Defaults to disabled; enable labeled metrics by setting this option to one
+# of the other values.
+# See also: https://github.com/prometheus/statsd_exporter#tagging-extensions.
+# Note that enabling statsd_label_mode will likely increase the number of time
+# series stored, as more labeled metrics may be exposed than may have been
+# previously extracted from the dotted non-labeled legacy metric format.
+# statsd_label_mode = disabled
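+#
+# For example (one hypothetical choice):
+# statsd_label_mode = dogstatsd
+# would emit labels using the dogstatsd tagging extension described in the
+# statsd_exporter link above.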
+#
+# Historically, statsd metrics were emitted with implied labels as part of
+# metric name in a dotted "legacy" format. Once swift is fully instrumented
+# with labeled metrics, and you have statsd_label_mode enabled, you may want to
+# turn off legacy metrics; to do that set this option to False. Defaults to
+# True.
+# statsd_emit_legacy = True
+#
+# Statsd metrics emitted with labels also support user defined labels
+# configured by options. The format for each option is:
+#     statsd_user_label_<name> = <value>
+# where <name> and <value> are restricted to a subset of non-whitespace ASCII
+# characters, including letters (upper and lower), numbers and underscores.
+# <value> may also contain the period character (.). Each option will add a
+# label with name user_<name> and value <value> to labeled metrics.
+# User defined labels may be configured in this [DEFAULT] section, in which
+# case they will be included with every labeled metric, or they may be
+# configured in individual [filter:<filter_name>] sections, in which case they
+# will only be included with labeled metrics emitted by that filter.
+# For example, a proxy-server configuration could use the following to
+# delineate labeled metrics emitted by different instances of proxy-logging
+# middleware in the pipeline:
+#     [filter:subrequest-logging]
+#     use = egg:swift#proxy_logging
+#     statsd_user_label_reqctx = subrequest
+# which adds a label with name 'user_reqctx' and value 'subrequest' to every
+# labeled metrics emitted by this proxy-logging instance. This would achieve
+# similar effect as the following proxy-server configuration for legacy
+# non-labeled metrics:
+#     [filter:subrequest-logging]
+#     use = egg:swift#proxy_logging
+#     access_log_statsd_metric_prefix = subrequest
+# Note that the legacy metrics option 'access_log_statsd_metric_prefix' does
+# not apply to labeled metrics.
+# By default there are no user defined labels.
+#
+# List of origin hosts that are allowed for CORS requests in addition to what
+# the container has set.
+# Use a comma separated list of full URL (http://foo.bar:1234,https://foo.bar)
 # cors_allow_origin =
+
+# If True (default) then CORS requests are only allowed if their Origin header
+# matches an allowed origin. Otherwise, any Origin is allowed.
+# strict_cors_mode = True
+#
+# Comma separated list of headers to expose through Access-Control-Expose-Headers,
+# in addition to the defaults and any headers set in container metadata (see
+# CORS documentation).
+# cors_expose_headers =
+#
+# General timeout when sending to or receiving from clients.
+# client_timeout = 60.0
+#
+# Timeout to use when looking for pipelined requests. Set to zero to disable
+# request pipelining. Defaults to client_timeout. Requires eventlet>=0.33.4;
+# with earlier eventlet, any non-zero value is treated as client_timeout.
+# keepalive_timeout =
+#
+# Note: enabling eventlet_debug might reveal sensitive information, for example
+# signatures for temp urls
 # eventlet_debug = false
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
 
 [pipeline:main]
-pipeline = catch_errors healthcheck cache ratelimit tempauth proxy-logging proxy-server
+# This sample pipeline uses tempauth and is used for SAIO dev work and
+# testing. See below for a pipeline using keystone.
+pipeline = catch_errors gatekeeper healthcheck proxy-logging cache listing_formats container_sync bulk tempurl ratelimit tempauth copy container-quotas account-quotas slo dlo versioned_writes symlink proxy-logging proxy-server
+
+# The following pipeline shows keystone integration. Comment out the one
+# above and uncomment this one. Additional steps for integrating keystone are
+# covered further below in the filter sections for authtoken and keystoneauth.
+#pipeline = catch_errors gatekeeper healthcheck proxy-logging cache container_sync bulk tempurl ratelimit authtoken keystoneauth copy container-quotas account-quotas slo dlo versioned_writes symlink proxy-logging proxy-server
 
 [app:proxy-server]
 use = egg:swift#proxy
@@ -42,50 +171,271 @@ use = egg:swift#proxy
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
 # set log_address = /dev/log
-# log_handoffs = True
+#
+# When deployed behind a proxy, load balancer, or SSL terminator that is
+# configured to speak the human-readable (v1) PROXY protocol (see
+# http://www.haproxy.org/download/1.7/doc/proxy-protocol.txt), you should set
+# this option to true.  The proxy-server will populate the client connection
+# information using the PROXY protocol and reject any connection missing a
+# valid PROXY line with a 400.  Only v1 (human-readable) of the PROXY protocol
+# is supported.
+# require_proxy_protocol = false
+#
+# log_handoffs = true
+# How long (in seconds) to cache account existence information. Account
+# non-existence will be cached one-tenth as long.
 # recheck_account_existence = 60
+# How long (in seconds) to cache container existence information. Container
+# non-existence will be cached one-tenth as long.
 # recheck_container_existence = 60
-# object_chunk_size = 8192
-# client_chunk_size = 8192
+#
+# How long (in seconds) the proxy should cache a set of shard ranges for
+# a container when the set is to be used for directing object updates.
+# Note that stale shard range info should be fine; updates will still
+# eventually make their way to the correct shard. As a result, you can
+# usually set this much higher than the existence checks above.
+# recheck_updating_shard_ranges = 3600
+#
+# How long the proxy should cache a set of shard ranges for a container when
+# the set is to be used for gathering object listings.
+# Note that stale shard range info might result in incomplete object listings
+# so this value should be set less than recheck_updating_shard_ranges.
+# recheck_listing_shard_ranges = 600
+#
+# For particularly active containers, having information age out of cache can
+# be quite painful: suddenly thousands of requests per second all miss and
+# have to go to disk. By (rarely) going direct to disk regardless of whether
+# data is present in memcache, we can periodically refresh the data in memcache
+# without causing a thundering herd. Values around 0.0 - 0.1 (i.e., one in
+# every thousand requests skips cache, or fewer) are recommended.
+# container_existence_skip_cache_pct = 0.0
+# container_updating_shard_ranges_skip_cache_pct = 0.0
+# container_listing_shard_ranges_skip_cache_pct = 0.0
+# account_existence_skip_cache_pct = 0.0
+#
+# Use a cooperative token on the updating-namespace cache to coalesce the
+# requests which fetch updating namespaces from the backend and set them in
+# memcached.
+# Number of cooperative tokens per token session; 0 means to disable the
+# usage of cooperative tokens and talk directly to the backend and memcache.
+# namespace_cache_tokens_per_session = 3
+#
+# The average time spent (in seconds) getting updating namespaces from the
+# container servers. This is used as the basic unit for the cooperative token
+# to figure out retry intervals when a request didn't acquire a token and is
+# waiting for other requests to fill in the cache; a cooperative token
+# session (`token_ttl`) will be 10 times this value.
+# namespace_avg_backend_fetch_time = 0.3
+#
+# object_chunk_size = 65536
+# client_chunk_size = 65536
+#
+# How long (in seconds) the proxy server will wait on responses from the a/c/o
+# servers.
 # node_timeout = 10
-# client_timeout = 60
+#
+# How long (in seconds) the proxy server will wait for an initial response and
+# to read a chunk of data from the object servers while serving GET / HEAD
+# requests.  Timeouts from these requests can be recovered from so setting this
+# to something lower than node_timeout would provide quicker error recovery
+# while allowing for a longer timeout for non-recoverable requests (PUTs).
+# Does not apply to requests with a truthy X-Newest header value.
+# Defaults to node_timeout, should be overridden if node_timeout is set to a
+# high number to prevent client timeouts from firing before the proxy server
+# has a chance to retry.
+# recoverable_node_timeout = node_timeout
+#
+# Connection timeout (in seconds)
 # conn_timeout = 0.5
-# How long without an error before a node's error count is reset. This will
-# also be how long before a node is reenabled after suppression is triggered.
-# error_suppression_interval = 60
+#
+# How long (in seconds) to wait for requests to finish after a quorum has been
+# established.
+# post_quorum_timeout = 0.5
+#
+# How long (in seconds) without an error before a node's error count is reset.
+# This will also be how long before a node is reenabled after suppression is
+# triggered.
+# Set to 0 to disable error-limiting.
+# error_suppression_interval = 60.0
+#
 # How many errors can accumulate before a node is temporarily ignored.
 # error_suppression_limit = 10
+#
 # If set to 'true' any authorized user may create and delete accounts; if
 # 'false' no one, even authorized, can.
 # allow_account_management = false
-# Set object_post_as_copy = false to turn on fast posts where only the metadata
-# changes are stored anew and the original data file is kept in place. This
-# makes for quicker posts; but since the container metadata isn't updated in
-# this mode, features like container sync won't be able to sync posts.
-# object_post_as_copy = true
+#
 # If set to 'true' authorized accounts that do not yet exist within the Swift
 # cluster will be automatically created.
 # account_autocreate = false
+#
 # If set to a positive value, trying to create a container when the account
 # already has at least this maximum containers will result in a 403 Forbidden.
 # Note: This is a soft limit, meaning a user might exceed the cap for
 # recheck_account_existence before the 403s kick in.
 # max_containers_per_account = 0
+#
 # This is a comma separated list of account hashes that ignore the
 # max_containers_per_account cap.
 # max_containers_whitelist =
+#
 # Comma separated list of Host headers to which the proxy will deny requests.
 # deny_host_headers =
-# Prefix used when automatically creating accounts.
-# auto_create_account_prefix = .
-# Depth of the proxy put queue.
-# put_queue_depth = 10
-# Start rate-limiting object segment serving after the Nth segment of a
-# segmented object.
-# rate_limit_after_segment = 10
-# Once segment rate-limiting kicks in for an object, limit segments served
-# to N per second.
-# rate_limit_segments_per_sec = 1
+#
+# During GET and HEAD requests, storage nodes can be chosen at random
+# (shuffle), by using timing measurements (timing), or by using an explicit
+# region/zone match (affinity). Using timing measurements may allow for lower
+# overall latency, while using affinity allows for finer control. In both the
+# timing and affinity cases, equally-sorting nodes are still randomly chosen to
+# spread load.
+# The valid values for sorting_method are "affinity", "shuffle", or "timing".
+# This option may be overridden in a per-policy configuration section.
+# sorting_method = shuffle
+#
+# If the "timing" sorting_method is used, the timings will only be valid for
+# the number of seconds configured by timing_expiry.
+# timing_expiry = 300
+#
+# Normally, you should only be moving one replica's worth of data at a time
+# when rebalancing. If you're rebalancing more aggressively, increase this
+# to avoid erroneously returning a 404 when the primary assignments that
+# *didn't* change get overloaded.
+# rebalance_missing_suppression_count = 1
+#
+# By default on a GET/HEAD swift will connect to a minimum number of storage
+# nodes in a minimum number of threads - for replicated data just a single
+# request to a single node at a time.  When enabled, concurrent_gets allows the
+# proxy to use up to replica count threads when waiting on a response.  In
+# conjunction with the concurrency_timeout option this will allow swift to send
+# out GET/HEAD requests to the storage nodes concurrently and answer as soon as
+# the minimum number of backend responses are available - in replicated
+# contexts this will be the first backend replica to respond.
+# concurrent_gets = off
+#
+# This parameter controls how long (in seconds) to wait before firing off the
+# next concurrent_get thread. A value of 0 would be fully concurrent; any other
+# number will stagger the firing of the threads. This number should be between
+# 0 and node_timeout. The default is whatever you set for the conn_timeout
+# parameter.
+# concurrency_timeout = 0.5
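+#
+# For example (hypothetical settings):
+# concurrent_gets = on
+# concurrency_timeout = 0.0
+# would fire GET/HEAD requests at up to replica-count nodes simultaneously
+# ("fully concurrent") and, for replicated data, answer with the first
+# backend replica to respond.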
+#
+# By default on an EC GET request swift will connect to a minimum number of
+# storage nodes in a minimum number of threads - for erasure coded data, ndata
+# requests to primary nodes are started at the same time.  When greater than
+# zero this option provides additional robustness and may reduce first byte
+# latency by starting additional requests - up to as many as nparity.
+# concurrent_ec_extra_requests = 0
+#
+# Set to the number of nodes to contact for a normal request. You can use
+# '* replicas' at the end to have it use the number given times the number of
+# replicas for the ring being used for the request.
+# request_node_count = 2 * replicas
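+#
+# For example, with a 3-replica ring the default of
+# request_node_count = 2 * replicas
+# allows up to 6 nodes (primaries plus handoffs) to be contacted for a normal
+# request.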
+#
+# Specifies which backend servers to prefer on reads. Format is a comma
+# separated list of affinity descriptors of the form <selection>=<priority>.
+# The <selection> may be r<N> for selecting nodes in region N or r<N>z<M> for
+# selecting nodes in region N, zone M. The <priority> value should be a whole
+# number that represents the priority to be given to the selection; lower
+# numbers are higher priority.
+#
+# Example: first read from region 1 zone 1, then region 1 zone 2, then
+# anything in region 2, then everything else:
+# read_affinity = r1z1=100, r1z2=200, r2=300
+# Default is empty, meaning no preference.
+# This option may be overridden in a per-policy configuration section.
+# read_affinity =
+#
+# Specifies which backend servers to prefer on object writes. Format is a comma
+# separated list of affinity descriptors of the form r<N> for region N or
+# r<N>z<M> for region N, zone M. If this is set, then when handling an object
+# PUT request, some number (see setting write_affinity_node_count) of local
+# backend servers will be tried before any nonlocal ones.
+#
+# Example: try to write to regions 1 and 2 before writing to any other
+# nodes:
+# write_affinity = r1, r2
+# Default is empty, meaning no preference.
+# This option may be overridden in a per-policy configuration section.
+# write_affinity =
+#
+# The number of local (as governed by the write_affinity setting) nodes to
+# attempt to contact first on writes, before any non-local ones. The value
+# should be an integer number, or use '* replicas' at the end to have it use
+# the number given times the number of replicas for the ring being used for the
+# request.
+# This option may be overridden in a per-policy configuration section.
+# write_affinity_node_count = 2 * replicas
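+#
+# For example (hypothetical): with write_affinity = r1 and a 3-replica ring,
+# write_affinity_node_count = 2 * replicas
+# means up to 6 nodes in region 1 are tried before any non-local nodes.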
+#
+# The number of local (as governed by the write_affinity setting) handoff nodes
+# to attempt to contact on deletion, in addition to primary nodes.
+#
+# Example: in a geographically distributed deployment of 2 regions with
+# replicas=3, there may sometimes be 1 primary node and 2 local handoff nodes
+# in one region holding an object after upload but before the object has been
+# replicated to the appropriate locations in other regions. In this case,
+# sending the delete request to these handoff nodes as well helps make the
+# correct decision for the response. The default value 'auto' means Swift will
+# calculate the number automatically; in that case the value is
+# (replicas - len(local_primary_nodes)). This option may be overridden in a
+# per-policy configuration section.
+# write_affinity_handoff_delete_count = auto
+#
+# These are the headers whose values will only be shown to swift_owners. The
+# exact definition of a swift_owner is up to the auth system in use, but
+# usually indicates administrative responsibilities.
+# swift_owner_headers = x-container-read, x-container-write, x-container-sync-key, x-container-sync-to, x-account-meta-temp-url-key, x-account-meta-temp-url-key-2, x-container-meta-temp-url-key, x-container-meta-temp-url-key-2, x-account-access-control
+#
+# You can set scheduling priority of processes. Niceness values range from -20
+# (most favorable to the process) to 19 (least favorable to the process).
+# nice_priority =
+#
+# You can set I/O scheduling class and priority of processes. I/O niceness
+# class values are IOPRIO_CLASS_RT (realtime), IOPRIO_CLASS_BE (best-effort) and
+# IOPRIO_CLASS_IDLE (idle). I/O niceness priority is a number which goes from
+# 0 to 7. The higher the value, the lower the I/O priority of the process.
+# Works only with ionice_class.
+# ionice_class =
+# ionice_priority =
+#
+# When reloading servers with SIGUSR1, workers running with old config/code
+# are allowed some time to finish serving in-flight requests. Use this to
+# configure the grace period (in seconds), after which the reloaded server
+# will issue SIGKILLs to remaining stale workers.
+# stale_worker_timeout = 86400
+#
+# When upgrading from liberasurecode<=1.5.0, you may want to continue writing
+# legacy CRCs until all nodes are upgraded and capable of reading fragments
+# with zlib CRCs. liberasurecode>=1.6.2 checks for the environment variable
+# LIBERASURECODE_WRITE_LEGACY_CRC; if set (value doesn't matter), it will use
+# its legacy CRC. Set this option to true or false to ensure the environment
+# variable is or is not set. Leave the option blank or absent to not touch
+# the environment (default). For more information, see
+# https://bugs.launchpad.net/liberasurecode/+bug/1886088
+# write_legacy_ec_crc =
+#
+# Setting 'allow_open_expired' to 'true' allows the 'x-open-expired' header
+# to be used with HEAD, GET, or POST requests to access expired objects that
+# have not yet been deleted from disk. This can be useful in conjunction with
+# the object-expirer 'delay_reaping' feature.
+# This flag is set to false by default, so it must be changed to access
+# expired objects.
+# allow_open_expired = false
+
+# Some proxy-server configuration options may be overridden on a per-policy
+# basis by including per-policy config section(s). The value of any option
+# specified in a per-policy section will override any value given in the
+# proxy-server section for that policy only. Otherwise the value of these
+# options will be that specified in the proxy-server section.
+# The section name should refer to the policy index, not the policy name.
+# [proxy-server:policy:<N>]
+# sorting_method =
+# read_affinity =
+# write_affinity =
+# write_affinity_node_count =
+# write_affinity_handoff_delete_count =
+# rebalance_missing_suppression_count = 1
+# concurrent_gets = off
+# concurrency_timeout = 0.5
+# concurrent_ec_extra_requests = 0
 
 [filter:tempauth]
 use = egg:swift#tempauth
@@ -93,29 +443,59 @@ use = egg:swift#tempauth
 # set log_name = tempauth
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
+#
 # The reseller prefix will verify a token begins with this prefix before even
 # attempting to validate it. Also, with authorization, only Swift storage
 # accounts with this prefix will be authorized by this middleware. Useful if
 # multiple auth systems are in use for one Swift cluster.
+# The reseller_prefix may contain a comma separated list of items. The first
+# item is used for the token as mentioned above. If second and subsequent
+# items exist, the middleware will handle authorization for an account with
+# that prefix. For example, for prefixes "AUTH, SERVICE", a path of
+# /v1/SERVICE_account is handled the same as /v1/AUTH_account. If an empty
+# (blank) reseller prefix is required, it must be first in the list. Two
+# single quote characters indicate an empty (blank) reseller prefix.
 # reseller_prefix = AUTH
+
+#
+# The require_group parameter names a group that must be presented by
+# either X-Auth-Token or X-Service-Token. Usually this parameter is
+# used only with multiple reseller prefixes (e.g., SERVICE_require_group=blah).
+# By default, no group is needed. Do not use .admin.
+# require_group =
+
 # The auth prefix will cause requests beginning with this prefix to be routed
 # to the auth subsystem, for granting tokens, etc.
 # auth_prefix = /auth/
+# Token lifetime (in seconds)
 # token_life = 86400
-# This is a comma separated list of hosts allowed to send X-Container-Sync-Key
-# requests.
-# allowed_sync_hosts = 127.0.0.1
+#
 # This allows middleware higher in the WSGI pipeline to override auth
 # processing, useful for middleware such as tempurl and formpost. If you know
 # you're not going to use such middleware and you want a bit of extra security,
 # you can set this to false.
 # allow_overrides = true
-# This specifies what scheme to return with storage urls:
+#
+# This specifies what scheme to return with storage URLs:
 # http, https, or default (chooses based on what the server is running as)
 # This can be useful with an SSL load balancer in front of a non-SSL server.
 # storage_url_scheme = default
+#
+# Fernet keys may be used for storage, rather than relying on memcached.
+# Multiple keys may be configured using options named 'fernet_key_<key_id>'
+# where 'key_id' is a unique identifier. The value should be 32 url-safe
+# base64-encoded bytes, such as may be generated using
+# `openssl rand -base64 32 | tr '+/' '-_'`
+# Any of these keys may be used for decryption. Only one key may be used
+# for encryption by a proxy at any given time; configure it with the
+# 'active_fernet_key_id' option. All proxies in the cluster should know
+# about a key before it is activated. If blank (the default),
+# memcached-backed tokens will be issued.
+# fernet_key_myid = <32 url-safe base64-encoded bytes>
+# active_fernet_key_id = myid
+#
 # Lastly, you need to list all the accounts/users you want here. The format is:
+#   user_<account>_<user> = <key> [group] [group] [...] [storage_url]
+# or if you want underscores in <account> or <user>, you can base64 encode them
@@ -123,8 +503,9 @@ use = egg:swift#tempauth
 #   user64_<account_b64>_<user_b64> = <key> [group] [group] [...] [storage_url]
 # There are special groups of:
 #   .reseller_admin = can do anything to any account for this auth
+#   .reseller_reader = can GET/HEAD anything in any account for this auth
 #   .admin = can do anything within the account
-# If neither of these groups are specified, the user can only access containers
+# If none of these groups are specified, the user can only access containers
 # that have been explicitly allowed for them by a .admin or .reseller_admin.
 # The trailing optional storage_url allows you to specify an alternate url to
 # hand back to the user upon authentication. If not specified, this defaults to
@@ -132,35 +513,303 @@ use = egg:swift#tempauth
 # to what the requester would need to use to reach this host.
 # Here are example entries, required for running the tests:
 user_admin_admin = admin .admin .reseller_admin
+user_admin_auditor = admin_ro .reseller_reader
 user_test_tester = testing .admin
-user_test2_tester2 = testing2 .admin
+user_test_tester2 = testing2 .admin
 user_test_tester3 = testing3
+user_test2_tester2 = testing2 .admin
+user_test5_tester5 = testing5 service
 
 # To enable Keystone authentication you need to have the auth token
 # middleware first to be configured. Here is an example below, please
 # refer to the keystone's documentation for details about the
 # different settings.
 #
-# You'll need to have as well the keystoneauth middleware enabled
-# and have it in your main pipeline so instead of having tempauth in
-# there you can change it to: authtoken keystone
+# You'll also need to have the keystoneauth middleware enabled and have it in
+# your main pipeline, as shown in the sample pipeline at the top of this file.
+#
+# The following parameters are known to work with keystonemiddleware v2.3.0
+# (above v2.0.0), but checking the latest information in the documentation
+# page [1] is recommended.
+# 1. https://docs.openstack.org/keystonemiddleware/latest/middlewarearchitecture.html#configuration
 #
 # [filter:authtoken]
-# paste.filter_factory = keystoneclient.middleware.auth_token:filter_factory
-# auth_host = keystonehost
-# auth_port = 35357
-# auth_protocol = http
-# auth_uri = http://keystonehost:5000/
-# admin_tenant_name = service
-# admin_user = swift
-# admin_password = password
-# delay_auth_decision = 1
+# paste.filter_factory = keystonemiddleware.auth_token:filter_factory
+# www_authenticate_uri = http://keystonehost:5000
+# auth_url = http://keystonehost:5000
+# auth_plugin = password
+# The following credentials must match the Keystone credentials for the Swift
+# service and may need to be changed to match your Keystone configuration. The
+# example values shown here assume a user named 'swift' with admin role on a
+# project named 'service', both being in the Keystone domain with id 'default'.
+# Refer to the keystonemiddleware documentation link above [1] for other
+# examples.
+# project_domain_id = default
+# user_domain_id = default
+# project_name = service
+# username = swift
+# password = password
+#
+# delay_auth_decision defaults to False, but leaving it as false will
+# prevent other auth systems, staticweb, tempurl, formpost, and ACLs from
+# working. This value must be explicitly set to True.
+# delay_auth_decision = False
+#
+# cache = swift.cache
+# include_service_catalog = False
 #
 # [filter:keystoneauth]
 # use = egg:swift#keystoneauth
-# Operator roles is the role which user would be allowed to manage a
-# tenant and be able to create container or give ACL to others.
+# The reseller_prefix option lists account namespaces that this middleware is
+# responsible for. The prefix is placed before the Keystone project id.
+# For example, for project 12345678, and prefix AUTH, the account is
+# named AUTH_12345678 (i.e., path is /v1/AUTH_12345678/...).
+# Several prefixes are allowed by specifying a comma-separated list
+# as in: "reseller_prefix = AUTH, SERVICE". The empty string indicates a
+# single blank/empty prefix. If an empty prefix is required in a list of
+# prefixes, a value of '' (two single quote characters) indicates a
+# blank/empty prefix. Except for the blank/empty prefix, an underscore ('_')
+# character is appended to the value unless already present.
+# reseller_prefix = AUTH
+#
+# The user must have at least one role named by operator_roles on a
+# project in order to create, delete and modify containers and objects
+# and to set and read privileged headers such as ACLs.
+# If there are several reseller prefix items, you can prefix the
+# parameter so it applies only to those accounts (for example
+# the parameter SERVICE_operator_roles applies to the /v1/SERVICE_
+# path). If you omit the prefix, the option applies to all reseller
+# prefix items. For the blank/empty prefix, prefix with '' (do not put
+# underscore after the two single quote characters).
 # operator_roles = admin, swiftoperator
+#
+# The reseller admin role has the ability to create and delete accounts
+# reseller_admin_role = ResellerAdmin
+#
+# This allows middleware higher in the WSGI pipeline to override auth
+# processing, useful for middleware such as tempurl and formpost. If you know
+# you're not going to use such middleware and you want a bit of extra security,
+# you can set this to false.
+# allow_overrides = true
+#
+# If the service_roles parameter is present, an X-Service-Token must be
+# present in the request that when validated, grants at least one role listed
+# in the parameter. The X-Service-Token may be scoped to any project.
+# If there are several reseller prefix items, you can prefix the
+# parameter so it applies only to those accounts (for example
+# the parameter SERVICE_service_roles applies to the /v1/SERVICE_
+# path). If you omit the prefix, the option applies to all reseller
+# prefix items. For the blank/empty prefix, prefix with '' (do not put
+# underscore after the two single quote characters).
+# By default, no service_roles are required.
+# service_roles =
+#
+# For backwards compatibility, keystoneauth will match names in cross-tenant
+# access control lists (ACLs) when both the requesting user and the tenant
+# are in the default domain, i.e. the domain to which existing tenants are
+# migrated. The default_domain_id value configured here should be the same as
+# the value used during migration of tenants to keystone domains.
+# default_domain_id = default
+#
+# For a new installation, or an installation in which keystone projects may
+# move between domains, you should disable backwards compatible name matching
+# in ACLs by setting allow_names_in_acls to false:
+# allow_names_in_acls = true
+#
+# In OpenStack terms, these reader roles are scoped for system: they
+# can read anything across projects and domains.
+# They are used for auditing and compliance functions.
+# In Swift terms, these roles are as powerful as the reseller_admin_role,
+# only do not modify the cluster.
+# By default the list of reader roles is empty.
+# system_reader_roles =
+#
+# This is a reader role scoped for a Keystone project.
+# An identity that has this role can read anything in a project, so it is
+# basically a swiftoperator, but read-only.
+# project_reader_roles =
+
+[filter:s3api]
+use = egg:swift#s3api
+
+# s3api setup:
+#
+# With either tempauth or your custom auth:
+# - Put s3api just before your auth filter(s) in the pipeline
+# With keystone:
+# - Put s3api and s3token before keystoneauth in the pipeline, but after
+#   auth_token
+# If you have ratelimit enabled for Swift requests, you may want to place a
+# second copy after auth to also ratelimit S3 requests.
+#
+# Swift has no concept of the S3's resource owner; the resources
+# (i.e. containers and objects) created via the Swift API have no owner
+# information. This option specifies how the s3api middleware handles them
+# with the S3 API.  If this option is 'false', such kinds of resources will be
+# invisible and no users can access them with the S3 API.  If set to 'true',
+# a resource without an owner belongs to everyone and everyone can access it
+# with the S3 API.  If you care about S3 compatibility, set 'false' here.  This
+# option makes sense only when the s3_acl option is set to 'true' and your
+# Swift cluster has the resources created via the Swift API.
+# allow_no_owner = false
+#
+# Set a region name of your Swift cluster.  Note that the s3api doesn't choose
+# a region of the newly created bucket.  This value is used for the
+# GET Bucket location API and v4 signatures calculation.
+# location = us-east-1
+#
+# Set whether to enforce DNS-compliant bucket names. Note that S3 enforces
+# these conventions in all regions except the US Standard region.
+# dns_compliant_bucket_names = True
+#
+# Set the default maximum number of objects returned in the GET Bucket
+# response.
+# max_bucket_listing = 1000
+#
+# Set the maximum number of parts returned in the List Parts operation.
+# (default: 1000, the same as the S3 specification)
+# If setting it larger than 10000 (swift's container_listing_limit default),
+# make sure you also increase the container_listing_limit in swift.conf.
+# max_parts_listing = 1000
+#
+# Set the maximum number of objects we can delete with the Multi-Object Delete
+# operation.
+# max_multi_delete_objects = 1000
+#
+# Set the number of objects to delete at a time with the Multi-Object Delete
+# operation.
+# multi_delete_concurrency = 2
+#
+# If set to 'true', s3api uses its own metadata for ACLs
+# (e.g. X-Container-Sysmeta-S3Api-Acl) to achieve the best S3 compatibility.
+# If set to 'false', s3api tries to use Swift ACLs (e.g. X-Container-Read)
+# instead of S3 ACLs as far as possible.
+# There are some caveats that one should know about this setting. Firstly,
+# if it is set to 'false' after previously being set to 'true', any objects or
+# containers stored while the setting was 'true' will be accessible to all
+# users, because their S3 ACLs will be ignored while s3_acl=False. Secondly,
+# s3_acl=True mode does not keep ACL consistency between the S3 and Swift
+# APIs: with s3_acl enabled, S3 ACLs only affect objects and buckets accessed
+# via the S3 API, as this ACL information is not available via the Swift API
+# and so will not be applied there.
+# Note that s3_acl currently supports only keystone and tempauth.
+# DON'T USE THIS in production before testing it thoroughly for your use
+# cases. This feature is still under development and it might cause
+# something you don't expect.
+# s3_acl = false
+#
+# Specify a (comma-separated) list of host names for your Swift cluster.
+# This enables virtual-hosted style requests.
+# storage_domain =
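+#
+# For example (hypothetical domain):
+# storage_domain = s3.example.com
+# would let clients send virtual-hosted style requests such as
+# bucket-name.s3.example.com.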
+#
+# Enable pipeline order check for SLO, s3token, authtoken, keystoneauth
+# according to standard s3api/Swift construction using either tempauth or
+# keystoneauth. If the order is incorrect, it raises an exception to stop
+# proxy. Turn auth_pipeline_check off only when you want to bypass these
+# authenticate middlewares in order to use other 3rd party (or your
+# proprietary) authenticate middleware.
+# auth_pipeline_check = True
+#
+# Enable multi-part uploads. (default: true)
+# This is required to store files larger than Swift's max_file_size (by
+# default, 5GiB). Note that this has performance implications when deleting
+# objects, as we now have to check whether there are also segments to delete.
+# The SLO middleware must be in the pipeline after s3api for this option to have
+# effect.
+# allow_multipart_uploads = True
+#
+# Set the maximum number of parts for the Upload Part operation (default: 1000).
+# When setting it larger than the default value in order to match the
+# S3 specification (which allows 10000), also increase max_manifest_segments
+# for the slo middleware.
+# max_upload_part_num = 1000
+#
+# Enable returning only buckets whose owner is the user who requested the
+# GET Service operation. (default: false)
+# If you want to enable the above feature, set this and s3_acl to true.
+# That might cause significant performance degradation, so only set this to
+# true if your service absolutely needs this feature.
+# If you set this to false, s3api returns all buckets.
+# check_bucket_owner = false
+#
+# By default, Swift reports only S3 style access logs
+# (e.g. PUT /bucket/object). If force_swift_request_proxy_log is set
+# to 'true', Swift will also output Swift style logs
+# (e.g. PUT /v1/account/container/object) in addition to the S3 style logs.
+# Note that requests will then be reported twice (s3api does not deduplicate
+# them), and the Swift style logs will also include the various subrequests
+# made to achieve S3 compatibility.
+# force_swift_request_proxy_log = false
+#
+# The AWS S3 documentation says that each part of a multipart upload, except
+# the last part, must be at least 5 MB.
+# min_segment_size = 5242880
+#
+# AWS allows clock skew up to 15 mins; note that older versions of swift/swift3
+# allowed at most 5 mins.
+# allowable_clock_skew = 900
+#
+# CORS preflight requests don't contain enough information for us to
+# identify the account that should be used for the real request, so
+# the allowed origins must be set cluster-wide. (default: blank; all
+# preflight requests will be denied)
+# cors_preflight_allow_origin =
+#
+# AWS will return a 503 Slow Down when clients are making too many requests,
+# but that can make client logs confusing if they only log/give metrics on
+# status ints. Turn this on to return 429 instead.
+# ratelimit_as_client_error = false
+
+# You can override the default log routing for this filter here:
+# log_name = s3api
+
+[filter:s3token]
+# s3token middleware authenticates with keystone using the s3 credentials
+# provided in the request header. Please put s3token between s3api
+# and keystoneauth if you're using keystoneauth.
+use = egg:swift#s3token
+
+# Prefix that will be prepended to the tenant to form the account
+reseller_prefix = AUTH_
+
+# By default, s3token will reject all invalid S3-style requests. Set this to
+# True to delegate that decision to downstream WSGI components. This may be
+# useful if there are multiple auth systems in the proxy pipeline.
+delay_auth_decision = False
+
+# Keystone server details. Note that this differs from how swift3 was
+# configured: in particular, the Keystone API version must be included.
+auth_uri = http://keystonehost:5000/v3
+
+# Connect/read timeout (in seconds) to use when communicating with Keystone
+http_timeout = 10.0
+
+# SSL-related options
+# insecure = False
+# certfile =
+# keyfile =
+
+# You can override the default log routing for this filter here:
+# log_name = s3token
+
+# Secrets may be cached to reduce latency for the client and load on Keystone.
+# This configures the duration that secrets may be cached; set to zero to
+# disable caching and prevent Swift from retrieving secrets from Keystone.
+# secret_cache_duration = 60
+# Note that caching is required to enable signed aws-chunked transfers.
+
+# Recent Keystone deployments require credentials similar to the authtoken
+# middleware; these credentials require access to the s3tokens endpoint.
+# Additionally, if secret caching is enabled, the credentials should have
+# access to view all project credentials.
+# auth_url = http://keystonehost:5000
+# auth_type = password
+# project_domain_id = default
+# project_name = service
+# user_domain_id = default
+# username = swift
+# password = password
 
 [filter:healthcheck]
 use = egg:swift#healthcheck
@@ -177,25 +826,32 @@ use = egg:swift#memcache
 # set log_name = cache
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
+#
 # If not set here, the value for memcache_servers will be read from
 # memcache.conf (see memcache.conf-sample) or lacking that file, it will
 # default to the value below. You can specify multiple servers separated with
-# commas, as in: 10.1.2.3:11211,10.1.2.4:11211
+# commas, as in: 10.1.2.3:11211,10.1.2.4:11211 (IPv6 addresses must
+# follow rfc3986 section-3.2.2, i.e. [::1]:11211)
 # memcache_servers = 127.0.0.1:11211
 #
-# Sets how memcache values are serialized and deserialized:
-# 0 = older, insecure pickle serialization
-# 1 = json serialization but pickles can still be read (still insecure)
-# 2 = json serialization only (secure and the default)
-# If not set here, the value for memcache_serialization_support will be read
-# from /etc/swift/memcache.conf (see memcache.conf-sample).
-# To avoid an instant full cache flush, existing installations should
-# upgrade with 0, then set to 1 and reload, then after some time (24 hours)
-# set to 2 and reload.
-# In the future, the ability to use pickle serialization will be removed.
-# memcache_serialization_support = 2
+# Sets the maximum number of connections to each memcached server per worker
+# memcache_max_connections = 2
+#
+# How long (in seconds) without an error before a server's error count is
+# reset. This will also be how long before a server is reenabled after
+# suppression is triggered.  Set to 0 to disable error-limiting.
+# error_suppression_interval = 60.0
+#
+# How many errors can accumulate before a server is temporarily ignored.
+# error_suppression_limit = 10
+#
+# (Optional) Global toggle for TLS usage when communicating with
+# the caching servers.
+# tls_enabled =
+#
+# More options documented in memcache.conf-sample
 
 [filter:ratelimit]
 use = egg:swift#ratelimit
@@ -203,44 +859,91 @@ use = egg:swift#ratelimit
 # set log_name = ratelimit
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
+#
 # clock_accuracy should represent how accurate the proxy servers' system clocks
 # are with each other. 1000 means that all the proxies' clock are accurate to
 # each other within 1 millisecond.  No ratelimit should be higher than the
 # clock accuracy.
 # clock_accuracy = 1000
+#
 # max_sleep_time_seconds = 60
+#
 # log_sleep_time_seconds of 0 means disabled
 # log_sleep_time_seconds = 0
+#
 # allows for slow rates (e.g. running up to 5 sec's behind) to catch up.
 # rate_buffer_seconds = 5
+#
 # account_ratelimit of 0 means disabled
 # account_ratelimit = 0
 
+# DEPRECATED- these will continue to work but will be replaced
+# by the X-Account-Sysmeta-Global-Write-Ratelimit flag.
+# Please see ratelimiting docs for details.
 # these are comma separated lists of account names
 # account_whitelist = a,b
 # account_blacklist = c,d
 
 # with container_limit_x = r
-# for containers of size x limit requests per second to r.  The container
+# for containers of size x limit write requests per second to r.  The container
 # rate will be linearly interpolated from the values given. With the values
 # below, a container of size 5 will get a rate of 75.
 # container_ratelimit_0 = 100
 # container_ratelimit_10 = 50
 # container_ratelimit_50 = 20
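+# For illustration, the linear interpolation above works out as:
+#   rate(5) = 100 + (50 - 100) * (5 - 0) / (10 - 0) = 75 requests/sec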
 
+# Similarly to the above container-level write limits, the following will limit
+# container GET (listing) requests.
+# container_listing_ratelimit_0 = 100
+# container_listing_ratelimit_10 = 50
+# container_listing_ratelimit_50 = 20
+
+[filter:read_only]
+use = egg:swift#read_only
+# Set read_only to true to turn global read-only mode on.
+# read_only = false
+# Set allow_deletes to true to allow deletes.
+# allow_deletes = false
+# Note: Put after ratelimit in the pipeline.
+
+# Note: needs to be placed before listing_formats;
+# otherwise remapped listings will always be JSON
 [filter:domain_remap]
 use = egg:swift#domain_remap
 # You can override the default log routing for this filter here:
 # set log_name = domain_remap
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
+#
+# Specify the storage_domain that matches your cloud; multiple domains
+# can be specified, separated by commas
 # storage_domain = example.com
+
+# Specify a root path part that will be added to the start of paths if not
+# already present.
 # path_root = v1
+
+# Browsers can convert a host header to lowercase, so this middleware checks
+# that the reseller prefix on the account is the correct case. This is done by
+# comparing the items in the reseller_prefixes config option to the found
+# prefix. If they match except for case, the item from reseller_prefixes is
+# used instead of the found reseller prefix. When none match, the default
+# reseller prefix is used. When no default reseller prefix is configured, any
+# request with an account prefix not in that list is ignored by this
+# middleware.
 # reseller_prefixes = AUTH
+# default_reseller_prefix =
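+# For illustration, with storage_domain = example.com and the default
+# reseller_prefixes = AUTH, a request like
+#   c.a.example.com/o  ->  /v1/AUTH_a/c/o
+# (container c, account a, object o; the names here are placeholders)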
+
+# Enable legacy remapping behavior for versioned path requests:
+#   c.a.example.com/v1/o -> /v1/AUTH_a/c/o
+# instead of
+#   c.a.example.com/v1/o -> /v1/AUTH_a/c/v1/o
+# ... by default all path parts after a remapped domain are considered part of
+# the object name with no special case for the path "v1"
+# mangle_client_paths = False
 
 [filter:catch_errors]
 use = egg:swift#catch_errors
@@ -248,7 +951,7 @@ use = egg:swift#catch_errors
 # set log_name = catch_errors
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
 
 [filter:cname_lookup]
@@ -258,35 +961,54 @@ use = egg:swift#cname_lookup
 # set log_name = cname_lookup
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
-# set log_headers = False
+# set log_headers = false
 # set log_address = /dev/log
+#
+# Specify the storage_domain that matches your cloud; multiple domains
+# can be specified, separated by commas
 # storage_domain = example.com
+#
 # lookup_depth = 1
+#
+# Specify the nameservers to use to do the CNAME resolution. If unset, the
+# system configuration is used. Multiple nameservers can be specified
+# separated by a comma. Default port 53 can be overridden. IPv6 is accepted.
+# Example: 127.0.0.1, 127.0.0.2, 127.0.0.3:5353, [::1], [::1]:5353
+# nameservers =
 
 # Note: Put staticweb just after your auth filter(s) in the pipeline
 [filter:staticweb]
 use = egg:swift#staticweb
-# Seconds to cache container x-container-meta-web-* header values.
-# cache_timeout = 300
 # You can override the default log routing for this filter here:
 # set log_name = staticweb
 # set log_facility = LOG_LOCAL0
 # set log_level = INFO
+# set log_headers = false
 # set log_address = /dev/log
-# set access_log_name = staticweb
-# set access_log_facility = LOG_LOCAL0
-# set access_log_level = INFO
-# set log_headers = False
+#
+# At times when it's impossible for staticweb to guess the outside
+# endpoint correctly, the url_base may be used to supply the URL
+# scheme and/or the host name (and port number) in order to generate
+# redirects.
+# Example values:
+#    http://www.example.com    - redirect to www.example.com
+#    https:                    - changes the scheme only
+#    https://                  - same, changes the scheme only
+#    //www.example.com:8080    - redirect www.example.com on port 8080
+#                                (scheme unchanged)
+# url_base =
 
-# Note: Put tempurl just before your auth filter(s) in the pipeline
+# Note: Put tempurl before dlo, slo and your auth filter(s) in the pipeline
 [filter:tempurl]
 use = egg:swift#tempurl
+# The methods allowed with Temp URLs.
+# methods = GET HEAD PUT POST DELETE
 #
 # The headers to remove from incoming requests. Simply a whitespace delimited
 # list of header names and names can optionally end with '*' to indicate a
 # prefix match. incoming_allow_headers is a list of exceptions to these
 # removals.
-# incoming_remove_headers = x-timestamp
+# incoming_remove_headers = x-timestamp x-open-expired
 #
 # The headers allowed as exceptions to incoming_remove_headers. Simply a
 # whitespace delimited list of header names and names can optionally end with
@@ -303,10 +1025,17 @@ use = egg:swift#tempurl
 # whitespace delimited list of header names and names can optionally end with
 # '*' to indicate a prefix match.
 # outgoing_allow_headers = x-object-meta-public-*
+#
+# The digest algorithm(s) supported for generating signatures;
+# whitespace-delimited.
+# allowed_digests = sha1 sha256 sha512
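+# For example, to stop accepting sha1 signatures you might restrict this to
+# the stronger digests only (illustrative, not the default):
+#   allowed_digests = sha256 sha512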
 
 # Note: Put formpost just before your auth filter(s) in the pipeline
 [filter:formpost]
 use = egg:swift#formpost
+# The digest algorithm(s) supported for generating signatures;
+# whitespace-delimited.
+# allowed_digests = sha1 sha256 sha512
 
 # Note: Just needs to be placed before the proxy-server in the pipeline.
 [filter:name_check]
@@ -315,6 +1044,26 @@ use = egg:swift#name_check
 # maximum_length = 255
 # forbidden_regexp = /\./|/\.\./|/\.$|/\.\.$
 
+# Note: Etag quoter should be placed just after cache in the pipeline.
+[filter:etag-quoter]
+use = egg:swift#etag_quoter
+# Historically, Swift has emitted bare MD5 hex digests as ETags, which is not
+# RFC compliant. With this middleware in the pipeline, users can opt-in to
+# RFC-compliant ETags on a per-account or per-container basis.
+#
+# Set to true to enable RFC-compliant ETags cluster-wide by default. Users
+# can still opt-out by setting appropriate account or container metadata.
+# enable_by_default = false
+
+[filter:list-endpoints]
+use = egg:swift#list_endpoints
+# list_endpoints_path = /endpoints/
+
+# Note: The double proxy-logging in the pipeline is not a mistake. The
+# left-most proxy-logging is there to log requests that were handled in
+# middleware and never made it through to the right-most middleware (and
+# proxy server). Double logging is prevented for normal requests. See
+# proxy-logging docs.
 [filter:proxy-logging]
 use = egg:swift#proxy_logging
 # If not set, logging directives from [DEFAULT] without "access_" will be used
@@ -322,15 +1071,354 @@ use = egg:swift#proxy_logging
 # access_log_facility = LOG_LOCAL0
 # access_log_level = INFO
 # access_log_address = /dev/log
+#
+# Log route for this filter. Useful if you want to have different configs for
+# the two proxy-logging filters.
+# access_log_route = proxy-server
+#
 # If set, access_log_udp_host will override access_log_address
 # access_log_udp_host =
 # access_log_udp_port = 514
+#
 # You can use log_statsd_* from [DEFAULT] or override them here:
-# access_log_statsd_host = localhost
+# access_log_statsd_host =
 # access_log_statsd_port = 8125
-# access_log_statsd_default_sample_rate = 1
+# access_log_statsd_default_sample_rate = 1.0
+# access_log_statsd_sample_rate_factor = 1.0
 # access_log_statsd_metric_prefix =
-# access_log_headers = False
+# access_log_headers = false
+#
+# If access_log_headers is true and access_log_headers_only is set, only
+# these headers are logged. Multiple headers can be defined as a comma-separated
+# list like this: access_log_headers_only = Host, X-Object-Meta-Mtime
+# access_log_headers_only =
+#
+# The default log format includes several sensitive values in logs:
+#   * X-Auth-Token header
+#   * temp_url_sig query parameter
+#   * Authorization header
+#   * X-Amz-Signature query parameter
+# To prevent an unauthorized access of the log file leading to an unauthorized
+# access of cluster data, only a portion of these values are written, with the
+# remainder replaced by '...' in the log. Set reveal_sensitive_prefix to the
+# number of characters to log.  Set to 0 to suppress the values entirely; set
+# to something large (1000, say) to write full values. Note that some values
+# may start appearing in full at values as low as 33.
+# reveal_sensitive_prefix = 16
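+# For illustration, with the default of 16 a (made-up) token value such as
+# AUTH_tk1234567890abcdef1234567890abcdef would be logged as
+# AUTH_tk123456789... (first 16 characters, remainder replaced by '...').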
+#
 # What HTTP methods are allowed for StatsD logging (comma-sep); request methods
 # not in this list will have "BAD_METHOD" for the <method> portion of the metric.
-# log_statsd_valid_http_methods = GET,HEAD,POST,PUT,DELETE,COPY,OPTIONS
+# log_statsd_valid_http_methods = GET,HEAD,POST,PUT,DELETE,COPY,OPTIONS,UPDATE
+#
+# Hashing algorithm for log anonymization. Must be one of the algorithms
+# supported by Python's hashlib.
+# log_anonymization_method = MD5
+#
+# Salt added during log anonymization
+# log_anonymization_salt =
+#
+# Template used to format access logs. All words surrounded by curly brackets
+# will be substituted with the appropriate values. For more information, see
+# https://docs.openstack.org/swift/latest/logs.html
+# log_msg_template = {client_ip} {remote_addr} {end_time.datetime} {method} {path} {protocol} {status_int} {referer} {user_agent} {auth_token} {bytes_recvd} {bytes_sent} {client_etag} {transaction_id} {headers} {request_time} {source} {log_info} {start_time} {end_time} {policy_index} {access_user_id}
+# The proxy_logging middleware attempts to translate s3api request paths to
+# swift paths. To do so it needs to know which, if any, storage domains are
+# configured for virtual-hosted style requests. This option should be a
+# comma-separated list of host names, exactly the same as that set for the
+# s3api middleware.
+# storage_domain =
+#
+# A float value in seconds for how often the proxy server should emit statsd
+# real-time buffer transfer bytes counter metrics for WSGI input or output.
+# A negative value disables these metrics; 0 emits them on every input or
+# output call. Default is -1.
+# statsd_emit_buffer_xfer_bytes_seconds = -1
+#
+# Note: Put before both ratelimit and auth in the pipeline.
+[filter:bulk]
+use = egg:swift#bulk
+# max_containers_per_extraction = 10000
+# max_failed_extractions = 1000
+# max_deletes_per_request = 10000
+# max_failed_deletes = 1000
+#
+# In order to keep a connection active during a potentially long bulk request,
+# Swift may return whitespace prepended to the actual response body. This
+# whitespace will be yielded no more than every yield_frequency seconds.
+# yield_frequency = 10
+#
+# Note: The following parameter is used during a bulk delete of objects and
+# their container. Such a delete would frequently fail because it is very
+# likely that not all replicated objects have been deleted by the time the
+# middleware got a successful response, so the number of retries can be
+# configured here. The wait between retries is 1.5**retry seconds.
+# delete_container_retry_count = 0
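+# For illustration (assuming the retry counter starts at 1), with
+# delete_container_retry_count = 3 the waits between retries would be
+# 1.5**1 = 1.5s, 1.5**2 = 2.25s and 1.5**3 = 3.375s.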
+#
+# To speed up the bulk delete process, multiple deletes may be executed in
+# parallel. Avoid setting this too high, as it gives clients a force multiplier
+# which may be used in DoS attacks. The suggested range is between 2 and 10.
+# delete_concurrency = 2
+
+# Note: Put after auth and staticweb in the pipeline.
+[filter:slo]
+use = egg:swift#slo
+# max_manifest_segments = 1000
+# max_manifest_size = 8388608
+#
+# Rate limiting applies only to segments smaller than this size (bytes).
+# rate_limit_under_size = 1048576
+#
+# Start rate-limiting SLO segment serving after the Nth small segment of a
+# segmented object.
+# rate_limit_after_segment = 10
+#
+# Once segment rate-limiting kicks in for an object, limit segments served
+# to N per second. 0 means no rate-limiting.
+# rate_limit_segments_per_sec = 1
+#
+# Time limit on GET requests (seconds)
+# max_get_time = 86400
+#
+# When creating an SLO, multiple segment validations may be executed in
+# parallel. Further, multiple deletes may be executed in parallel when deleting
+# with ?multipart-manifest=delete. Use this setting to limit how many
+# subrequests may be executed concurrently. Avoid setting it too high, as it
+# gives clients a force multiplier which may be used in DoS attacks. The
+# suggested range is between 2 and 10.
+# concurrency = 2
+#
+# This may be used to separately tune validation and delete concurrency values.
+# Default is to use the concurrency value from above; all of the same caveats
+# apply regarding recommended ranges.
+# delete_concurrency = 2
+#
+# In order to keep a connection active during a potentially long PUT request,
+# clients may request that Swift send whitespace ahead of the final response
+# body. This whitespace will be yielded at most every yield_frequency seconds.
+# yield_frequency = 10
+#
+# Since SLOs may have thousands of segments, clients may request that the
+# object-expirer handle the deletion of segments using query params like
+# `?multipart-manifest=delete&async=on`. You may want to keep this off if it
+# negatively impacts your expirers; in that case, the deletes will still
+# be done as part of the client request.
+# allow_async_delete = true
+
+# Note: Put after auth and staticweb in the pipeline.
+# If you don't put it in the pipeline, it will be inserted for you.
+[filter:dlo]
+use = egg:swift#dlo
+# Start rate-limiting DLO segment serving after the Nth segment of a
+# segmented object.
+# rate_limit_after_segment = 10
+#
+# Once segment rate-limiting kicks in for an object, limit segments served
+# to N per second. 0 means no rate-limiting.
+# rate_limit_segments_per_sec = 1
+#
+# Time limit on GET requests (seconds)
+# max_get_time = 86400
+
+# Note: Put after auth and server-side copy in the pipeline.
+[filter:container-quotas]
+use = egg:swift#container_quotas
+
+# Note: Put after auth and server-side copy in the pipeline.
+[filter:account-quotas]
+use = egg:swift#account_quotas
+
+[filter:gatekeeper]
+use = egg:swift#gatekeeper
+# Set this to false if you want to allow clients to set arbitrary X-Timestamps
+# on uploaded objects. This may be used to preserve timestamps when migrating
+# from a previous storage system, but risks allowing users to upload
+# difficult-to-delete data.
+# shunt_inbound_x_timestamp = true
+#
+# Set this to true if you want to allow clients to access and manipulate the
+# (normally internal-to-swift) null namespace by including a header like
+#    X-Allow-Reserved-Names: true
+# allow_reserved_names_header = false
+#
+# You can override the default log routing for this filter here:
+# set log_name = gatekeeper
+# set log_facility = LOG_LOCAL0
+# set log_level = INFO
+# set log_headers = false
+# set log_address = /dev/log
+
+[filter:container_sync]
+use = egg:swift#container_sync
+# Set this to false if you want to disallow any full URL values to be set for
+# any new X-Container-Sync-To headers. This will keep any new full URLs from
+# coming in, but won't change any existing values already in the cluster.
+# Updating those will have to be done manually, as knowing what the true realm
+# endpoint should be cannot always be guessed.
+# allow_full_urls = true
+# Set this to specify this cluster's //realm/cluster as "current" in /info
+# current = //REALM/CLUSTER
+
+# Note: Put it at the beginning of the pipeline to profile all middleware. But
+# it is safer to put this after catch_errors, gatekeeper and healthcheck.
+# Not intended for production environments!
+[filter:xprofile]
+use = egg:swift#xprofile
+# This option enables you to switch profilers; the profiler should inherit
+# from the Python standard profiler. Currently supported values include
+# 'cProfile' and 'eventlet.green.profile'.
+# profile_module = eventlet.green.profile
+#
+# This prefix will be used to combine process ID and timestamp to name the
+# profile data file.  Make sure the executing user has permission to write
+# into this path (missing path segments will be created, if necessary).
+# If you enable profiling in more than one type of daemon, you must override
+# it with a unique value like: /var/log/swift/profile/proxy.profile
+# log_filename_prefix = /tmp/log/swift/profile/default.profile
+#
+# The profile data will be dumped to local disk based on the above naming rule
+# at this interval (in seconds).
+# dump_interval = 5.0
+#
+# Be careful: this option makes the profiler dump data into timestamped files,
+# which means lots of files will pile up in the directory.
+# dump_timestamp = false
+#
+# This is the path of the URL to access the mini web UI.
+# path = /__profile__
+#
+# Clear the data when the wsgi server shuts down.
+# flush_at_shutdown = false
+#
+# unwind the iterator of applications
+# unwind = false
+
+# Note: Put after slo, dlo in the pipeline.
+# If you don't put it in the pipeline, it will be inserted automatically.
+[filter:versioned_writes]
+use = egg:swift#versioned_writes
+# Enables using versioned writes middleware and exposing configuration
+# settings via HTTP GET /info.
+# WARNING: Setting this option bypasses the "allow_versions" option
+# in the container configuration file, which will eventually be
+# deprecated. See documentation for more details.
+# allow_versioned_writes = false
+# Enables Swift object-versioning API
+# allow_object_versioning = false
+
+# Note: Put after auth and before dlo and slo middlewares.
+# If you don't put it in the pipeline, it will be inserted for you.
+[filter:copy]
+use = egg:swift#copy
+
+# Note: To enable encryption, add the following 2 dependent pieces of crypto
+# middleware to the proxy-server pipeline. They should be to the right of all
+# other middleware apart from the final proxy-logging middleware, and in the
+# order shown in this example:
+#  keymaster encryption proxy-logging proxy-server
+[filter:keymaster]
+use = egg:swift#keymaster
+
+# Over time, the format of crypto metadata on disk may change slightly to resolve
+# ambiguities. In general, you want to be writing the newest version, but to
+# ensure that all writes can still be read during rolling upgrades, there's the
+# option to write older formats as well.
+# Before upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, ensure this is set to 1
+# Before upgrading from Swift 2.25.0 or earlier, ensure this is set to at most 2
+# After upgrading all proxy servers, set this to 3 (currently the highest version)
+#
+# The default is currently 2 to support upgrades with no configuration changes,
+# but may change to 3 in the future.
+meta_version_to_write = 2
+
+# Sets the root secret from which encryption keys are derived. This must be set
+# before first use to a value that is a base64 encoding of at least 32 bytes.
+# The security of all encrypted data critically depends on this key, therefore
+# it should be set to a high-entropy value. For example, a suitable value may
+# be obtained by base-64 encoding a 32 byte (or longer) value generated by a
+# cryptographically secure random number generator. Changing the root secret is
+# likely to result in data loss.
+encryption_root_secret = changeme
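+# For example, one possible way (among others) to generate such a value is:
+#   openssl rand -base64 32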
+
+# Multiple root secrets may be configured using options named
+# 'encryption_root_secret_<secret_id>' where 'secret_id' is a unique
+# identifier. This enables the root secret to be changed from time to time.
+# Only one root secret is used for object PUTs or POSTs at any moment in time.
+# This is specified by the 'active_root_secret_id' option. If
+# 'active_root_secret_id' is not specified then the root secret specified by
+# 'encryption_root_secret' is considered to be the default. Once a root secret
+# has been used as the default root secret it must remain in the config file in
+# order that any objects that were encrypted with it may be subsequently
+# decrypted. The secret_id used to identify the key cannot change.
+# encryption_root_secret_myid = changeme
+# active_root_secret_id = myid
+
+# Sets the path from which the keymaster config options should be read. This
+# allows multiple processes which need to be encryption-aware (for example,
+# proxy-server and container-sync) to share the same config file, ensuring
+# that the encryption keys used are the same. The format expected is similar
+# to other config files, with a single [keymaster] section and a single
+# encryption_root_secret option. If this option is set, the root secret
+# MUST NOT be set in proxy-server.conf.
+# keymaster_config_path =
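+# For illustration, such an external file (e.g. /etc/swift/keymaster.conf;
+# the path is an example) might contain just:
+#   [keymaster]
+#   encryption_root_secret = changeme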
+
+# To store the encryption root secret in a remote key management system (KMS)
+# such as Barbican, replace the keymaster middleware with the kms_keymaster
+# middleware in the proxy-server pipeline. They should be to the right of all
+# other middleware apart from the final proxy-logging middleware, and in the
+# order shown in this example:
+#  kms_keymaster encryption proxy-logging proxy-server
+[filter:kms_keymaster]
+use = egg:swift#kms_keymaster
+
+# Sets the path from which the keymaster config options should be read. This
+# allows multiple processes which need to be encryption-aware (for example,
+# proxy-server and container-sync) to share the same config file, ensuring
+# that the encryption keys used are the same. The format expected is similar
+# to other config files, with a single [kms_keymaster] section. See the
+# keymaster.conf-sample file for details on the kms_keymaster configuration
+# options.
+# keymaster_config_path =
+
+# kmip_keymaster middleware may be used to fetch an encryption root secret from
+# a KMIP service. It should replace, in the same position, any other keymaster
+# middleware in the proxy-server pipeline, so that the middleware order is as
+# shown in this example:
+#  kmip_keymaster encryption proxy-logging proxy-server
+[filter:kmip_keymaster]
+use = egg:swift#kmip_keymaster
+
+# Sets the path from which the keymaster config options should be read. This
+# allows multiple processes which need to be encryption-aware (for example,
+# proxy-server and container-sync) to share the same config file, ensuring
+# that the encryption keys used are the same. As an added benefit the
+# keymaster configuration file can have different permissions than the
+# `proxy-server.conf` file. The format expected is similar
+# to other config files, with a single [kmip_keymaster] section. See the
+# keymaster.conf-sample file for details on the kmip_keymaster configuration
+# options.
+# keymaster_config_path =
+
+[filter:encryption]
+use = egg:swift#encryption
+
+# By default all PUT or POST'ed object data and/or metadata will be encrypted.
+# Encryption of new data and/or metadata may be disabled by setting
+# disable_encryption to True. However, all encryption middleware should remain
+# in the pipeline in order for existing encrypted data to be read.
+# disable_encryption = False
+
+# listing_formats should be just right of the first proxy-logging middleware,
+# and left of most other middlewares. If it is not already present, it will
+# be automatically inserted for you.
+[filter:listing_formats]
+use = egg:swift#listing_formats
+
+# Note: Put after slo, dlo, versioned_writes, but before encryption in the
+# pipeline.
+[filter:symlink]
+use = egg:swift#symlink
+# Symlinks can point to other symlinks provided the number of symlinks in a
+# chain does not exceed the symloop_max value. If the number of chained
+# symlinks exceeds the limit symloop_max a 409 (HTTPConflict) error
+# response will be produced.
+# symloop_max = 2
diff --git a/etc/rsyncd.conf-sample b/etc/rsyncd.conf-sample
index c3b9952b16..00d205e1dc 100644
--- a/etc/rsyncd.conf-sample
+++ b/etc/rsyncd.conf-sample
@@ -2,6 +2,9 @@ uid = swift
 gid = swift
 log file = /var/log/rsyncd.log
 pid file = /var/run/rsyncd.pid
+# Since the rsync default for reverse lookup is true, you have to set it to
+# false here globally, or after a few hundred nodes your DNS team will fuss at you.
+reverse lookup = false
 
 [account]
 max connections = 2
@@ -20,3 +23,59 @@ max connections = 8
 path = /srv/node
 read only = false
 lock file = /var/lock/object.lock
+
+
+# If rsync_module includes the device, you can tune rsyncd to permit 4
+# connections per device instead of simply allowing 8 connections for all
+# devices:
+# rsync_module = {replication_ip}::object_{device}
+#
+# (if devices in your object ring are named sda, sdb and sdc)
+#
+#[object_sda]
+#max connections = 4
+#path = /srv/node
+#read only = false
+#lock file = /var/lock/object_sda.lock
+#
+#[object_sdb]
+#max connections = 4
+#path = /srv/node
+#read only = false
+#lock file = /var/lock/object_sdb.lock
+#
+#[object_sdc]
+#max connections = 4
+#path = /srv/node
+#read only = false
+#lock file = /var/lock/object_sdc.lock
+
+
+# On a swift-all-in-one VM, you might tune rsync by replication port instead:
+# rsync_module = {replication_ip}::object{replication_port}
+#
+# So, on your SAIO, you have to set the following rsyncd configuration:
+#
+#[object6210]
+#max connections = 25
+#path = /srv/1/node/
+#read only = false
+#lock file = /var/lock/object6210.lock
+#
+#[object6220]
+#max connections = 25
+#path = /srv/2/node/
+#read only = false
+#lock file = /var/lock/object6220.lock
+#
+#[object6230]
+#max connections = 25
+#path = /srv/3/node/
+#read only = false
+#lock file = /var/lock/object6230.lock
+#
+#[object6240]
+#max connections = 25
+#path = /srv/4/node/
+#read only = false
+#lock file = /var/lock/object6240.lock
diff --git a/etc/swift-bench.conf-sample b/etc/swift-bench.conf-sample
deleted file mode 100644
index 423608ec48..0000000000
--- a/etc/swift-bench.conf-sample
+++ /dev/null
@@ -1,60 +0,0 @@
-[bench]
-# auth = http://localhost:8080/auth/v1.0
-# user = test:tester
-# key = testing
-# auth_version = 1.0
-# log-level = INFO
-# timeout = 10
-
-# You can configure PUT, GET, and DELETE concurrency independently or set all
-# three with "concurrency"
-# put_concurrency = 10
-# get_concurrency = 10
-# del_concurrency = 10
-# concurrency =
-
-# A space-sep list of files whose contents will be read and randomly chosen
-# as the body (object contents) for each PUT.
-# object_sources =
-
-# If object_sources is not set and lower_object_size != upper_object_size,
-# each PUT will randomly select an object size between the two values.  Units
-# are bytes.
-# lower_object_size = 10
-# upper_object_size = 10
-
-# If object_sources is not set and lower_object_size == upper_object_size,
-# every object PUT will contain this many bytes.
-# object_size = 1
-
-# num_objects = 1000
-# num_gets = 10000
-# num_containers = 20
-
-# The base name for created containers.
-# container_name = (randomly-chosen uuid4)
-
-# Should swift-bench benchmark DELETEing the created objects and then delete
-# all created containers?
-# delete = yes
-
-# Without use_proxy, swift-bench will talk directly to the backend Swift
-# servers.  Doing that will require "url", "account", and at least one
-# "devices" entry.
-# use_proxy = yes
-
-# If use_proxy = yes, this will override any returned X-Storage-Url returned
-# by authenticaion (the account name will still be extracted from
-# X-Storage-Url though and may NOT be set with the "account" conf var).  If
-# use_proxy = no, this setting is required and used as the X-Storage-Url when
-# deleting containers and as a source for IP and port for back-end Swift server
-# connections.  The IP and port specified in this setting must have local
-# storage access to every device specified in "devices".
-# url =
-
-# Only used (and required) when use_proxy = no.
-# account =
-
-# A space-sep list of devices names; only relevant (and required) when
-# use_proxy = no.
-# devices = sdb1
diff --git a/etc/swift-rsyslog.conf-sample b/etc/swift-rsyslog.conf-sample
new file mode 100644
index 0000000000..4d9c2f3ea1
--- /dev/null
+++ b/etc/swift-rsyslog.conf-sample
@@ -0,0 +1,40 @@
+# Uncomment the following to have a log containing all logs together
+#local.* /var/log/swift/all.log
+
+# Uncomment the following to have hourly swift logs.
+#$template HourlyProxyLog,"/var/log/swift/hourly/%$YEAR%%$MONTH%%$DAY%%$HOUR%"
+#local0.* ?HourlyProxyLog
+
+# Use the following to have separate log files for each of the main servers:
+# account-server, container-server, object-server, proxy-server. Note:
+# object-updater's output will be stored in object.log.
+if $programname contains 'swift' then /var/log/swift/swift.log
+if $programname contains 'account' then /var/log/swift/account.log
+if $programname contains 'container' then /var/log/swift/container.log
+if $programname contains 'object' then /var/log/swift/object.log
+if $programname contains 'proxy' then /var/log/swift/proxy.log
+
+# Uncomment the following to have specific log via program name.
+#if $programname == 'swift' then /var/log/swift/swift.log
+#if $programname == 'account-server' then /var/log/swift/account-server.log
+#if $programname == 'account-replicator' then /var/log/swift/account-replicator.log
+#if $programname == 'account-auditor' then /var/log/swift/account-auditor.log
+#if $programname == 'account-reaper' then /var/log/swift/account-reaper.log
+#if $programname == 'container-server' then /var/log/swift/container-server.log
+#if $programname == 'container-replicator' then /var/log/swift/container-replicator.log
+#if $programname == 'container-updater' then /var/log/swift/container-updater.log
+#if $programname == 'container-auditor' then /var/log/swift/container-auditor.log
+#if $programname == 'container-sync' then /var/log/swift/container-sync.log
+#if $programname == 'container-sharder' then /var/log/swift/container-sharder.log
+#if $programname == 'container-reconciler' then /var/log/swift/container-reconciler.log
+#if $programname == 'object-server' then /var/log/swift/object-server.log
+#if $programname == 'object-replicator' then /var/log/swift/object-replicator.log
+#if $programname == 'object-updater' then /var/log/swift/object-updater.log
+#if $programname == 'object-auditor' then /var/log/swift/object-auditor.log
+#if $programname == 'object-expirer' then /var/log/swift/object-expirer.log
+#if $programname == 'object-reconstructor' then /var/log/swift/object-reconstructor.log
+#if $programname == 'object-relinker' then /var/log/swift/object-relinker.log
+
+# Use the following to discard logs that don't match any of the above to avoid
+# them filling up /var/log/messages.
+local0.* ~
diff --git a/etc/swift.conf-sample b/etc/swift.conf-sample
index 2f4192a3c1..84454ec4f8 100644
--- a/etc/swift.conf-sample
+++ b/etc/swift.conf-sample
@@ -1,15 +1,116 @@
 [swift-hash]
 
-# swift_hash_path_suffix is used as part of the hashing algorithm
-# when determining data placement in the cluster. This value should
-# remain secret and MUST NOT change once a cluster has been deployed.
+# swift_hash_path_suffix and swift_hash_path_prefix are used as part of the
+# hashing algorithm when determining data placement in the cluster.
+# These values should remain secret and MUST NOT change
+# once a cluster has been deployed.
+# Use only printable chars (python -c "import string; print(string.printable)")
 
 swift_hash_path_suffix = changeme
-
-
+swift_hash_path_prefix = changeme
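+# For example, one possible way (among others) to generate suitable random
+# values is:
+#   python -c "import secrets; print(secrets.token_hex(32))"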
+
+# Storage policies are defined here and determine various characteristics
+# about how objects are stored and treated. More documentation can be found at
+# https://docs.openstack.org/swift/latest/overview_policies.html.
+
+# Client requests specify a policy on a per container basis using the policy
+# name. Internally the policy name is mapped to the policy index specified in
+# the policy's section header in this config file. Policy names are
+# case-insensitive and, to avoid confusion with indexes names, should not be
+# numbers.
+#
+# The policy with index 0 is always used for legacy containers and can be given
+# a name for use in metadata however the ring file name will always be
+# 'object.ring.gz' for backwards compatibility.  If no policies are defined a
+# policy with index 0 will be automatically created for backwards compatibility
+# and given the name Policy-0.  A default policy is used when creating new
+# containers when no policy is specified in the request.  If no other policies
+# are defined the policy with index 0 will be declared the default.  If
+# multiple policies are defined you must define a policy with index 0 and you
+# must specify a default.  It is recommended you always define a section for
+# storage-policy:0.
+#
+# A 'policy_type' argument is also supported but is not mandatory.  Default
+# policy type 'replication' is used when 'policy_type' is unspecified.
+#
+# A 'diskfile_module' optional argument lets you specify an alternate backend
+# object storage plug-in architecture. The default is
+# "egg:swift#replication.fs", or "egg:swift#erasure_coding.fs", depending on
+# the policy type.
+#
+# Aliases for the storage policy name may be defined, but are not required.
+#
+[storage-policy:0]
+name = Policy-0
+default = yes
+#policy_type = replication
+#diskfile_module = egg:swift#replication.fs
+aliases = yellow, orange
+
+# The following section would declare a policy called 'silver', the number of
+# replicas will be determined by how the ring is built.  In this example the
+# 'silver' policy could have a lower or higher # of replicas than the
+# 'Policy-0' policy above.  The ring filename will be 'object-1.ring.gz'.  You
+# may only specify one storage policy section as the default.  If you change
+# this section to specify 'silver' as the default, when a client creates a new
+# container without a policy specified, it will get the 'silver' policy because
+# this config has specified it as the default.  However if a legacy container
+# (one created with a pre-policy version of swift) is accessed, it is known
+# implicitly to be assigned to the policy with index 0 as opposed to the
+# current default. Note that even without specifying any aliases, a policy
+# always has at least the default name stored in aliases because this field is
+# used to contain all human readable names for a storage policy.
+#
+#[storage-policy:1]
+#name = silver
+#policy_type = replication
+#diskfile_module = egg:swift#replication.fs
+
+# The following declares a storage policy of type 'erasure_coding' which uses
+# Erasure Coding for data reliability. Please refer to Swift documentation for
+# details on how the 'erasure_coding' storage policy is implemented.
+#
+# Swift uses PyECLib, a Python Erasure coding API library, for encode/decode
+# operations.  Please refer to Swift documentation for details on how to
+# install PyECLib.
+#
+# When defining an EC policy, 'policy_type' needs to be 'erasure_coding' and
+# EC configuration parameters 'ec_type', 'ec_num_data_fragments' and
+# 'ec_num_parity_fragments' must be specified.  'ec_type' is chosen from the
+# list of EC backends supported by PyECLib.  The ring configured for the
+# storage policy must have its "replica" count configured to
+# 'ec_num_data_fragments' + 'ec_num_parity_fragments' - this requirement is
+# validated when services start.  'ec_object_segment_size' is the amount of
+# data that will be buffered up before feeding a segment into the
+# encoder/decoder.  More information about these configuration options and
+# supported 'ec_type' schemes is available in the Swift documentation.  See
+# https://docs.openstack.org/swift/latest/overview_erasure_code.html
+# for more information on how to configure EC policies.
+#
+# The example 'deepfreeze10-4' policy defined below is a _sample_
+# configuration with an alias of 'df10-4' as well as 10 'data' and 4 'parity'
+# fragments. 'ec_type' defines the Erasure Coding scheme.
+# 'liberasurecode_rs_vand' (Reed-Solomon Vandermonde) is used as an example
+# below.
+#
+#[storage-policy:2]
+#name = deepfreeze10-4
+#aliases = df10-4
+#policy_type = erasure_coding
+#diskfile_module = egg:swift#erasure_coding.fs
+#ec_type = liberasurecode_rs_vand
+#ec_num_data_fragments = 10
+#ec_num_parity_fragments = 4
+#ec_object_segment_size = 1048576
+#
+# EC fragment duplication is proof-of-concept, experimental support to enable
+# Global Erasure Coding policies with multiple regions acting as independent
+# failure domains.  Do not change the default except in development/testing.
+#ec_duplication_factor = 1
 
 # The swift-constraints section sets the basic constraints on data
-# saved in the swift cluster.
+# saved in the swift cluster. These constraints are automatically
+# published by the proxy server in responses to /info requests.
 
 [swift-constraints]
 
@@ -46,6 +147,35 @@ swift_hash_path_suffix = changeme
 
 #max_meta_overall_size = 4096
 
+# max_header_size is the max number of bytes in the utf8 encoding of each
+# header. The default is 8192 because eventlet uses 8192 as the max size of a
+# header line. This value may need to be increased when using identity
+# v3 API tokens that include more than 7 catalog entries.
+# See also include_service_catalog in proxy-server.conf-sample
+# (documented at https://docs.openstack.org/swift/latest/overview_auth.html)
+
+#max_header_size = 8192
+
+
+# max_request_line constrains the max number of characters in the request
+# line; a request line '<method> <path> <version>\r\n' must have fewer
+# than this number of characters.
+# The default is 8192 because eventlet uses 8192 as its default max request line.
+#
+# Note: Request urls are concatenated with the request method to form
+# the Referer header that is sent to backend servers, so it may be
+# necessary to increase max_header_size when increasing max_request_line.
+
+#max_request_line = 8192
+
+# By default the maximum number of allowed headers depends on the number of max
+# allowed metadata settings plus a default value of 36 for swift internally
+# generated headers and regular http headers.  If for some reason this is not
+# enough (custom middleware for example) it can be increased with the
+# extra_header_count constraint.
+
+#extra_header_count = 0
+
 
 # max_object_name_length is the max number of bytes in the utf8 encoding
 # of an object name
@@ -74,3 +204,19 @@ swift_hash_path_suffix = changeme
 # of a container name
 
 #max_container_name_length = 256
+
+
+# By default all REST API calls should use "v1" or "v1.0" as the version string,
+# for example "/v1/account". This can be manually overridden to make this
+# backward-compatible, in case a different version string has been used before.
+# Use a comma-separated list in case of multiple allowed versions, for example
+# valid_api_versions = v0,v1,v2
+# This is only enforced for account, container and object requests. The allowed
+# api versions are by default excluded from /info.
+
+# valid_api_versions = v1,v1.0
+
+# The prefix used for hidden auto-created accounts, for example accounts in
+# which shard containers are created. It defaults to '.'; don't change it.
+
+# auto_create_account_prefix = .
diff --git a/examples/apache2/account-server.template b/examples/apache2/account-server.template
new file mode 100644
index 0000000000..d336a130d8
--- /dev/null
+++ b/examples/apache2/account-server.template
@@ -0,0 +1,25 @@
+# Account Server VHOST Template For Apache2
+#
+# Change %PORT% to the port that you wish to use on your system
+# Change %SERVICENAME% to the service name you are using
+# Change %USER% to the system user that will run the daemon process
+# Change the debug level as you see fit
+#
+# For example:
+#     Replace %PORT% by 6212
+#     Replace %SERVICENAME% by account-server-1
+#     Replace %USER% with apache (or remove it for default)
+
+NameVirtualHost *:%PORT%
+Listen %PORT%
+
+<VirtualHost *:%PORT%>
+    WSGIDaemonProcess %SERVICENAME% processes=5 threads=1 user=%USER% display-name=%{GROUP}
+    WSGIProcessGroup %SERVICENAME%
+    WSGIScriptAlias / /var/www/swift/%SERVICENAME%.wsgi
+    WSGIApplicationGroup %{GLOBAL}
+    LimitRequestFields 200
+    ErrorLog /var/log/%APACHE_NAME%/%SERVICENAME%
+    LogLevel debug
+    CustomLog /var/log/%APACHE_NAME%/access.log combined
+</VirtualHost>
diff --git a/examples/apache2/container-server.template b/examples/apache2/container-server.template
new file mode 100644
index 0000000000..5b504b1958
--- /dev/null
+++ b/examples/apache2/container-server.template
@@ -0,0 +1,25 @@
+# Container Server VHOST Template For Apache2
+#
+# Change %PORT% to the port that you wish to use on your system
+# Change %SERVICENAME% to the service name you are using
+# Change %USER% to the system user that will run the daemon process
+# Change the debug level as you see fit
+#
+# For example:
+#     Replace %PORT% by 6211
+#     Replace %SERVICENAME% by container-server-1
+#     Replace %USER% with apache (or remove it for default)
+
+NameVirtualHost *:%PORT%
+Listen %PORT%
+
+<VirtualHost *:%PORT%>
+    WSGIDaemonProcess %SERVICENAME% processes=5 threads=1 user=%USER% display-name=%{GROUP}
+    WSGIProcessGroup %SERVICENAME%
+    WSGIScriptAlias / /var/www/swift/%SERVICENAME%.wsgi
+    WSGIApplicationGroup %{GLOBAL}
+    LimitRequestFields 200
+    ErrorLog /var/log/%APACHE_NAME%/%SERVICENAME%
+    LogLevel debug
+    CustomLog /var/log/%APACHE_NAME%/access.log combined
+</VirtualHost>
diff --git a/examples/apache2/object-server.template b/examples/apache2/object-server.template
new file mode 100644
index 0000000000..8e8ed65670
--- /dev/null
+++ b/examples/apache2/object-server.template
@@ -0,0 +1,25 @@
+# Object Server VHOST Template For Apache2
+#
+# Change %PORT% to the port that you wish to use on your system
+# Change %SERVICENAME% to the service name you are using
+# Change %USER% to the system user that will run the daemon process
+# Change the debug level as you see fit
+#
+# For example:
+#     Replace %PORT% by 6210
+#     Replace %SERVICENAME% by object-server-1
+#     Replace %USER% with apache (or remove it for default)
+
+NameVirtualHost *:%PORT%
+Listen %PORT%
+
+<VirtualHost *:%PORT%>
+    WSGIDaemonProcess %SERVICENAME% processes=5 threads=1 user=%USER% display-name=%{GROUP}
+    WSGIProcessGroup %SERVICENAME%
+    WSGIScriptAlias / /var/www/swift/%SERVICENAME%.wsgi
+    WSGIApplicationGroup %{GLOBAL}
+    LimitRequestFields 200
+    ErrorLog /var/log/%APACHE_NAME%/%SERVICENAME%
+    LogLevel debug
+    CustomLog /var/log/%APACHE_NAME%/access.log combined
+</VirtualHost>
diff --git a/examples/apache2/proxy-server.template b/examples/apache2/proxy-server.template
new file mode 100644
index 0000000000..f2b500be65
--- /dev/null
+++ b/examples/apache2/proxy-server.template
@@ -0,0 +1,27 @@
+# Proxy Server VHOST Template For Apache2
+#
+# Change %PORT% to the port that you wish to use on your system
+# Change %SERVICENAME% to the service name you are using
+# Change %USER% to the system user that will run the daemon process
+# Change the debug level as you see fit
+#
+# For example:
+#     Replace %PORT% by 8080
+#     Replace %SERVICENAME% by proxy-server
+#     Replace %USER% with apache (or remove it for default)
+
+NameVirtualHost *:%PORT%
+Listen %PORT%
+
+<VirtualHost *:%PORT%>
+    # The limit of an object size
+    LimitRequestBody 5368709122
+    WSGIDaemonProcess %SERVICENAME% processes=5 threads=1 user=%USER% display-name=%{GROUP}
+    WSGIProcessGroup %SERVICENAME%
+    WSGIScriptAlias / /var/www/swift/%SERVICENAME%.wsgi
+    WSGIApplicationGroup %{GLOBAL}
+    LimitRequestFields 200
+    ErrorLog /var/log/%APACHE_NAME%/%SERVICENAME%
+    LogLevel debug
+    CustomLog /var/log/%APACHE_NAME%/access.log combined
+</VirtualHost>
diff --git a/examples/wsgi/account-server.wsgi.template b/examples/wsgi/account-server.wsgi.template
new file mode 100644
index 0000000000..51bcc00d92
--- /dev/null
+++ b/examples/wsgi/account-server.wsgi.template
@@ -0,0 +1,14 @@
+# Account Server wsgi Template
+#
+# Change %SERVICECONF% to the service conf file you are using
+#
+# For example:
+#     Replace %SERVICECONF% by account-server/1.conf
+#
+# This file then needs to be saved under /var/www/swift/%SERVICENAME%.wsgi
+# * Replace %SERVICENAME% with the service name you use on your system
+#   E.g. Replace %SERVICENAME% by account-server-1
+
+from swift.common.wsgi import init_request_processor
+application, conf, logger, log_name = \
+    init_request_processor('/etc/swift/%SERVICECONF%','account-server')
diff --git a/examples/wsgi/container-server.wsgi.template b/examples/wsgi/container-server.wsgi.template
new file mode 100644
index 0000000000..bebd3e6a41
--- /dev/null
+++ b/examples/wsgi/container-server.wsgi.template
@@ -0,0 +1,14 @@
+# Container Server wsgi Template
+#
+# Change %SERVICECONF% to the service conf file you are using
+#
+# For example:
+#     Replace %SERVICECONF% by container-server/1.conf
+#
+# This file then needs to be saved under /var/www/swift/%SERVICENAME%.wsgi
+# * Replace %SERVICENAME% with the service name you use on your system
+#   E.g. Replace %SERVICENAME% by container-server-1
+
+from swift.common.wsgi import init_request_processor
+application, conf, logger, log_name = \
+    init_request_processor('/etc/swift/%SERVICECONF%','container-server')
diff --git a/examples/wsgi/object-server.wsgi.template b/examples/wsgi/object-server.wsgi.template
new file mode 100644
index 0000000000..afcb14324c
--- /dev/null
+++ b/examples/wsgi/object-server.wsgi.template
@@ -0,0 +1,14 @@
+# Object Server wsgi Template
+#
+# Change %SERVICECONF% to the service conf file you are using
+#
+# For example:
+#     Replace %SERVICECONF% by object-server/1.conf
+#
+# This file then needs to be saved under /var/www/swift/%SERVICENAME%.wsgi
+# * Replace %SERVICENAME% with the service name you use on your system
+#   E.g. Replace %SERVICENAME% by object-server-1
+
+from swift.common.wsgi import init_request_processor
+application, conf, logger, log_name = \
+    init_request_processor('/etc/swift/%SERVICECONF%','object-server')
diff --git a/examples/wsgi/proxy-server.wsgi.template b/examples/wsgi/proxy-server.wsgi.template
new file mode 100644
index 0000000000..1a48afeca6
--- /dev/null
+++ b/examples/wsgi/proxy-server.wsgi.template
@@ -0,0 +1,14 @@
+# Proxy Server wsgi Template
+#
+# Change %SERVICECONF% to the service conf file you are using
+#
+# For example:
+#     Replace %SERVICECONF% by proxy-server.conf
+#
+# This file then needs to be saved under /var/www/swift/%SERVICENAME%.wsgi
+# * Replace %SERVICENAME% with the service name you use on your system
+#   E.g. Replace %SERVICENAME% by proxy-server
+
+from swift.common.wsgi import init_request_processor
+application, conf, logger, log_name = \
+    init_request_processor('/etc/swift/%SERVICECONF%','proxy-server')
diff --git a/locale/swift.pot b/locale/swift.pot
deleted file mode 100644
index 7f905f2940..0000000000
--- a/locale/swift.pot
+++ /dev/null
@@ -1,1030 +0,0 @@
-# Translations template for swift.
-# Copyright (C) 2011 ORGANIZATION
-# This file is distributed under the same license as the swift project.
-# FIRST AUTHOR , 2011.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: swift 1.2.0\n"
-"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
-"POT-Creation-Date: 2011-01-26 23:59+0000\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME \n"
-"Language-Team: LANGUAGE \n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 0.9.4\n"
-
-#: swift/account/auditor.py:52 swift/account/auditor.py:75
-#, python-format
-msgid ""
-"Since %(time)s: Account audits: %(passed)s passed audit, %(failed)s "
-"failed audit"
-msgstr ""
-
-#: swift/account/auditor.py:100 swift/container/auditor.py:103
-#, python-format
-msgid "Audit passed for %s"
-msgstr ""
-
-#: swift/account/auditor.py:103
-#, python-format
-msgid "ERROR Could not get account info %s"
-msgstr ""
-
-#: swift/account/reaper.py:80 swift/container/updater.py:64
-#, python-format
-msgid "Loading account ring from %s"
-msgstr ""
-
-#: swift/account/reaper.py:88 swift/obj/updater.py:57
-#, python-format
-msgid "Loading container ring from %s"
-msgstr ""
-
-#: swift/account/reaper.py:96
-#, python-format
-msgid "Loading object ring from %s"
-msgstr ""
-
-#: swift/account/reaper.py:106
-msgid "Daemon started."
-msgstr ""
-
-#: swift/account/reaper.py:122
-#, python-format
-msgid "Begin devices pass: %s"
-msgstr ""
-
-#: swift/account/reaper.py:128 swift/common/utils.py:805
-#: swift/obj/updater.py:74 swift/obj/updater.py:113
-#, python-format
-msgid "Skipping %s as it is not mounted"
-msgstr ""
-
-#: swift/account/reaper.py:132
-#, python-format
-msgid "Devices pass completed: %.02fs"
-msgstr ""
-
-#: swift/account/reaper.py:215
-#, python-format
-msgid "Beginning pass on account %s"
-msgstr ""
-
-#: swift/account/reaper.py:238
-#, python-format
-msgid "Exception with containers for account %s"
-msgstr ""
-
-#: swift/account/reaper.py:243
-#, python-format
-msgid "Exception with account %s"
-msgstr ""
-
-#: swift/account/reaper.py:244
-#, python-format
-msgid "Incomplete pass on account %s"
-msgstr ""
-
-#: swift/account/reaper.py:246
-#, python-format
-msgid ", %s containers deleted"
-msgstr ""
-
-#: swift/account/reaper.py:248
-#, python-format
-msgid ", %s objects deleted"
-msgstr ""
-
-#: swift/account/reaper.py:250
-#, python-format
-msgid ", %s containers remaining"
-msgstr ""
-
-#: swift/account/reaper.py:253
-#, python-format
-msgid ", %s objects remaining"
-msgstr ""
-
-#: swift/account/reaper.py:255
-#, python-format
-msgid ", %s containers possibly remaining"
-msgstr ""
-
-#: swift/account/reaper.py:258
-#, python-format
-msgid ", %s objects possibly remaining"
-msgstr ""
-
-#: swift/account/reaper.py:261
-msgid ", return codes: "
-msgstr ""
-
-#: swift/account/reaper.py:265
-#, python-format
-msgid ", elapsed: %.02fs"
-msgstr ""
-
-#: swift/account/reaper.py:320 swift/account/reaper.py:355
-#: swift/account/reaper.py:406 swift/container/updater.py:277
-#, python-format
-msgid "Exception with %(ip)s:%(port)s/%(device)s"
-msgstr ""
-
-#: swift/account/reaper.py:333
-#, python-format
-msgid "Exception with objects for container %(container)s for account %(account)s"
-msgstr ""
-
-#: swift/account/server.py:309 swift/container/server.py:397
-#: swift/obj/server.py:597
-#, python-format
-msgid "ERROR __call__ error with %(method)s %(path)s "
-msgstr ""
-
-#: swift/auth/server.py:96 swift/common/middleware/swauth.py:94
-msgid "No super_admin_key set in conf file! Exiting."
-msgstr ""
-
-#: swift/auth/server.py:152
-#, python-format
-msgid ""
-"\n"
-"THERE ARE ACCOUNTS IN YOUR auth.db THAT DO NOT BEGIN WITH YOUR NEW "
-"RESELLER\n"
-"PREFIX OF \"%(reseller)s\".\n"
-"YOU HAVE A FEW OPTIONS:\n"
-"    1. RUN \"swift-auth-update-reseller-prefixes %(db_file)s "
-"%(reseller)s\",\n"
-"       \"swift-init auth-server restart\", AND\n"
-"       \"swift-auth-recreate-accounts -K ...\" TO CREATE FRESH ACCOUNTS.\n"
-"    OR\n"
-"    2. REMOVE %(db_file)s, RUN \"swift-init auth-server restart\", AND "
-"RUN\n"
-"       \"swift-auth-add-user ...\" TO CREATE BRAND NEW ACCOUNTS THAT WAY."
-"\n"
-"    OR\n"
-"    3. ADD \"reseller_prefix = %(previous)s\" (WITHOUT THE QUOTES) TO "
-"YOUR\n"
-"       proxy-server.conf IN THE [filter:auth] SECTION AND TO YOUR\n"
-"       auth-server.conf IN THE [app:auth-server] SECTION AND RUN\n"
-"       \"swift-init proxy-server restart\" AND \"swift-init auth-server "
-"restart\"\n"
-"       TO REVERT BACK TO YOUR PREVIOUS RESELLER PREFIX.\n"
-"\n"
-"    %(note)s\n"
-"                    "
-msgstr ""
-
-#: swift/auth/server.py:173
-msgid ""
-"\n"
-"    SINCE YOUR PREVIOUS RESELLER PREFIX WAS AN EMPTY STRING, IT IS NOT\n"
-"    RECOMMENDED TO PERFORM OPTION 3 AS THAT WOULD MAKE SUPPORTING "
-"MULTIPLE\n"
-"    RESELLERS MORE DIFFICULT.\n"
-"                    "
-msgstr ""
-
-#: swift/auth/server.py:178
-msgid "CRITICAL: "
-msgstr ""
-
-#: swift/auth/server.py:213
-#, python-format
-msgid "ERROR attempting to create account %(url)s: %(status)s %(reason)s"
-msgstr ""
-
-#: swift/auth/server.py:346
-#, python-format
-msgid ""
-"ALREADY EXISTS create_user(%(account)s, %(user)s, _, %(admin)s, "
-"%(reseller_admin)s) [%(elapsed).02f]"
-msgstr ""
-
-#: swift/auth/server.py:364
-#, python-format
-msgid ""
-"FAILED create_user(%(account)s, %(user)s, _, %(admin)s, "
-"%(reseller_admin)s) [%(elapsed).02f]"
-msgstr ""
-
-#: swift/auth/server.py:381
-#, python-format
-msgid ""
-"SUCCESS create_user(%(account)s, %(user)s, _, %(admin)s, "
-"%(reseller_admin)s) = %(url)s [%(elapsed).02f]"
-msgstr ""
-
-#: swift/auth/server.py:656
-msgid "ERROR Unhandled exception in ReST request"
-msgstr ""
-
-#: swift/common/bench.py:85
-#, python-format
-msgid "%(complete)s %(title)s [%(fail)s failures], %(rate).01f/s"
-msgstr ""
-
-#: swift/common/bench.py:97
-msgid "CannotSendRequest.  Skipping..."
-msgstr ""
-
-#: swift/common/bufferedhttp.py:96
-#, python-format
-msgid "HTTP PERF: %(time).5f seconds to %(method)s %(host)s:%(port)s %(path)s)"
-msgstr ""
-
-#: swift/common/db.py:299
-msgid "Broker error trying to rollback locked connection"
-msgstr ""
-
-#: swift/common/db.py:754 swift/common/db.py:1221
-#, python-format
-msgid "Invalid pending entry %(file)s: %(entry)s"
-msgstr ""
-
-#: swift/common/db_replicator.py:84
-#, python-format
-msgid "ERROR reading HTTP response from %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:123
-#, python-format
-msgid "Attempted to replicate %(count)d dbs in %(time).5f seconds (%(rate).5f/s)"
-msgstr ""
-
-#: swift/common/db_replicator.py:129
-#, python-format
-msgid "Removed %(remove)d dbs"
-msgstr ""
-
-#: swift/common/db_replicator.py:130
-#, python-format
-msgid "%(success)s successes, %(failure)s failures"
-msgstr ""
-
-#: swift/common/db_replicator.py:155
-#, python-format
-msgid "ERROR rsync failed with %(code)s: %(args)s"
-msgstr ""
-
-#: swift/common/db_replicator.py:205
-#, python-format
-msgid "Syncing chunks with %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:213
-#, python-format
-msgid "ERROR Bad response %(status)s from %(host)s"
-msgstr ""
-
-#: swift/common/db_replicator.py:278
-#, python-format
-msgid "ERROR Unable to connect to remote server: %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:316
-#, python-format
-msgid "Replicating db %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:325 swift/common/db_replicator.py:479
-#, python-format
-msgid "Quarantining DB %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:328
-#, python-format
-msgid "ERROR reading db %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:361
-#, python-format
-msgid "ERROR Remote drive not mounted %s"
-msgstr ""
-
-#: swift/common/db_replicator.py:363
-#, python-format
-msgid "ERROR syncing %(file)s with node %(node)s"
-msgstr ""
-
-#: swift/common/db_replicator.py:405
-msgid "ERROR Failed to get my own IPs?"
-msgstr ""
-
-#: swift/common/db_replicator.py:412
-#, python-format
-msgid "Skipping %(device)s as it is not mounted"
-msgstr ""
-
-#: swift/common/db_replicator.py:420
-msgid "Beginning replication run"
-msgstr ""
-
-#: swift/common/db_replicator.py:425
-msgid "Replication run OVER"
-msgstr ""
-
-#: swift/common/db_replicator.py:436
-msgid "ERROR trying to replicate"
-msgstr ""
-
-#: swift/common/memcached.py:69
-#, python-format
-msgid "Timeout %(action)s to memcached: %(server)s"
-msgstr ""
-
-#: swift/common/memcached.py:72
-#, python-format
-msgid "Error %(action)s to memcached: %(server)s"
-msgstr ""
-
-#: swift/common/memcached.py:81
-#, python-format
-msgid "Error limiting server %s"
-msgstr ""
-
-#: swift/common/utils.py:88
-#, python-format
-msgid "Unable to locate %s in libc.  Leaving as a no-op."
-msgstr ""
-
-#: swift/common/utils.py:255
-msgid "STDOUT: Connection reset by peer"
-msgstr ""
-
-#: swift/common/utils.py:257 swift/common/utils.py:260
-#, python-format
-msgid "STDOUT: %s"
-msgstr ""
-
-#: swift/common/utils.py:324
-msgid "Connection refused"
-msgstr ""
-
-#: swift/common/utils.py:326
-msgid "Host unreachable"
-msgstr ""
-
-#: swift/common/utils.py:328
-msgid "Connection timeout"
-msgstr ""
-
-#: swift/common/utils.py:464
-msgid "UNCAUGHT EXCEPTION"
-msgstr ""
-
-#: swift/common/utils.py:511
-msgid "Error: missing config file argument"
-msgstr ""
-
-#: swift/common/utils.py:516
-#, python-format
-msgid "Error: unable to locate %s"
-msgstr ""
-
-#: swift/common/utils.py:743
-#, python-format
-msgid "Unable to read config file %s"
-msgstr ""
-
-#: swift/common/utils.py:749
-#, python-format
-msgid "Unable to find %s config section in %s"
-msgstr ""
-
-#: swift/common/middleware/catch_errors.py:39
-#, python-format
-msgid "Error: %s"
-msgstr ""
-
-#: swift/common/middleware/cname_lookup.py:91
-#, python-format
-msgid "Mapped %(given_domain)s to %(found_domain)s"
-msgstr ""
-
-#: swift/common/middleware/cname_lookup.py:102
-#, python-format
-msgid "Following CNAME chain for  %(given_domain)s to %(found_domain)s"
-msgstr ""
-
-#: swift/common/middleware/ratelimit.py:172
-msgid "Returning 497 because of blacklisting"
-msgstr ""
-
-#: swift/common/middleware/ratelimit.py:185
-#, python-format
-msgid "Ratelimit sleep log: %(sleep)s for %(account)s/%(container)s/%(object)s"
-msgstr ""
-
-#: swift/common/middleware/ratelimit.py:192
-#, python-format
-msgid "Returning 498 because of ops rate limiting (Max Sleep) %s"
-msgstr ""
-
-#: swift/common/middleware/ratelimit.py:212
-msgid "Warning: Cannot ratelimit without a memcached client"
-msgstr ""
-
-#: swift/common/middleware/swauth.py:635
-#, python-format
-msgid ""
-"ERROR: Exception while trying to communicate with "
-"%(scheme)s://%(host)s:%(port)s/%(path)s"
-msgstr ""
-
-#: swift/container/auditor.py:54 swift/container/auditor.py:78
-#, python-format
-msgid ""
-"Since %(time)s: Container audits: %(pass)s passed audit, %(fail)s failed "
-"audit"
-msgstr ""
-
-#: swift/container/auditor.py:68
-msgid "Begin container audit \"once\" mode"
-msgstr ""
-
-#: swift/container/auditor.py:88
-#, python-format
-msgid "Container audit \"once\" mode completed: %.02fs"
-msgstr ""
-
-#: swift/container/auditor.py:106
-#, python-format
-msgid "ERROR Could not get container info %s"
-msgstr ""
-
-#: swift/container/server.py:114
-#, python-format
-msgid ""
-"ERROR Account update failed with %(ip)s:%(port)s/%(device)s (will retry "
-"later): Response %(status)s %(reason)s"
-msgstr ""
-
-#: swift/container/server.py:122
-#, python-format
-msgid ""
-"ERROR account update failed with %(ip)s:%(port)s/%(device)s (will retry "
-"later)"
-msgstr ""
-
-#: swift/container/updater.py:78 swift/obj/replicator.py:492
-#, python-format
-msgid "%s is not mounted"
-msgstr ""
-
-#: swift/container/updater.py:97
-#, python-format
-msgid "ERROR with loading suppressions from %s: "
-msgstr ""
-
-#: swift/container/updater.py:107
-msgid "Begin container update sweep"
-msgstr ""
-
-#: swift/container/updater.py:140
-#, python-format
-msgid ""
-"Container update sweep of %(path)s completed: %(elapsed).02fs, "
-"%(success)s successes, %(fail)s failures, %(no_change)s with no changes"
-msgstr ""
-
-#: swift/container/updater.py:154
-#, python-format
-msgid "Container update sweep completed: %.02fs"
-msgstr ""
-
-#: swift/container/updater.py:164
-msgid "Begin container update single threaded sweep"
-msgstr ""
-
-#: swift/container/updater.py:172
-#, python-format
-msgid ""
-"Container update single threaded sweep completed: %(elapsed).02fs, "
-"%(success)s successes, %(fail)s failures, %(no_change)s with no changes"
-msgstr ""
-
-#: swift/container/updater.py:224
-#, python-format
-msgid "Update report sent for %(container)s %(dbfile)s"
-msgstr ""
-
-#: swift/container/updater.py:232
-#, python-format
-msgid "Update report failed for %(container)s %(dbfile)s"
-msgstr ""
-
-#: swift/container/updater.py:266
-#, python-format
-msgid ""
-"ERROR account update failed with %(ip)s:%(port)s/%(device)s (will retry "
-"later): "
-msgstr ""
-
-#: swift/obj/auditor.py:61
-#, python-format
-msgid "Begin object audit \"%s\" mode"
-msgstr ""
-
-#: swift/obj/auditor.py:73
-#, python-format
-msgid ""
-"Since %(start_time)s: Locally: %(passes)d passed audit, %(quars)d "
-"quarantined, %(errors)d errors files/sec: %(frate).2f , bytes/sec: "
-"%(brate).2f"
-msgstr ""
-
-#: swift/obj/auditor.py:90
-#, python-format
-msgid ""
-"Object audit \"%(mode)s\" mode completed: %(elapsed).02fs. Total "
-"files/sec: %(frate).2f , Total bytes/sec: %(brate).2f "
-msgstr ""
-
-#: swift/obj/auditor.py:141
-#, python-format
-msgid "ERROR Object %(obj)s failed audit and will be quarantined: %(err)s"
-msgstr ""
-
-#: swift/obj/auditor.py:150
-#, python-format
-msgid "ERROR Trying to audit %s"
-msgstr ""
-
-#: swift/obj/replicator.py:182
-msgid "Error hashing suffix"
-msgstr ""
-
-#: swift/obj/replicator.py:246
-#, python-format
-msgid "Killing long-running rsync: %s"
-msgstr ""
-
-#: swift/obj/replicator.py:257
-#, python-format
-msgid "Bad rsync return code: %(args)s -> %(ret)d"
-msgstr ""
-
-#: swift/obj/replicator.py:261 swift/obj/replicator.py:265
-#, python-format
-msgid "Successful rsync of %(src)s at %(dst)s (%(time).03f)"
-msgstr ""
-
-#: swift/obj/replicator.py:350
-#, python-format
-msgid "Removing partition: %s"
-msgstr ""
-
-#: swift/obj/replicator.py:353
-msgid "Error syncing handoff partition"
-msgstr ""
-
-#: swift/obj/replicator.py:383
-#, python-format
-msgid "%(ip)s/%(device)s responded as unmounted"
-msgstr ""
-
-#: swift/obj/replicator.py:388
-#, python-format
-msgid "Invalid response %(resp)s from %(ip)s"
-msgstr ""
-
-#: swift/obj/replicator.py:410
-#, python-format
-msgid "Error syncing with node: %s"
-msgstr ""
-
-#: swift/obj/replicator.py:414
-msgid "Error syncing partition"
-msgstr ""
-
-#: swift/obj/replicator.py:424
-#, python-format
-msgid ""
-"%(replicated)d/%(total)d (%(percentage).2f%%) partitions replicated in "
-"%(time).2fs (%(rate).2f/sec, %(remaining)s remaining)"
-msgstr ""
-
-#: swift/obj/replicator.py:433
-#, python-format
-msgid ""
-"%(checked)d suffixes checked - %(hashed).2f%% hashed, %(synced).2f%% "
-"synced"
-msgstr ""
-
-#: swift/obj/replicator.py:439
-#, python-format
-msgid "Partition times: max %(max).4fs, min %(min).4fs, med %(med).4fs"
-msgstr ""
-
-#: swift/obj/replicator.py:446
-#, python-format
-msgid "Nothing replicated for %s seconds."
-msgstr ""
-
-#: swift/obj/replicator.py:475
-msgid "Lockup detected.. killing live coros."
-msgstr ""
-
-#: swift/obj/replicator.py:530
-msgid "Ring change detected. Aborting current replication pass."
-msgstr ""
-
-#: swift/obj/replicator.py:540
-msgid "Exception in top-level replication loop"
-msgstr ""
-
-#: swift/obj/replicator.py:549
-msgid "Running object replicator in script mode."
-msgstr ""
-
-#: swift/obj/replicator.py:553 swift/obj/replicator.py:565
-#, python-format
-msgid "Object replication complete. (%.02f minutes)"
-msgstr ""
-
-#: swift/obj/replicator.py:560
-msgid "Starting object replication pass."
-msgstr ""
-
-#: swift/obj/replicator.py:566
-#, python-format
-msgid "Replication sleeping for %s seconds."
-msgstr ""
-
-#: swift/obj/server.py:313
-#, python-format
-msgid ""
-"ERROR Container update failed (saving for async update later): %(status)d"
-" response from %(ip)s:%(port)s/%(dev)s"
-msgstr ""
-
-#: swift/obj/server.py:319
-#, python-format
-msgid ""
-"ERROR container update failed with %(ip)s:%(port)s/%(dev)s (saving for "
-"async update later)"
-msgstr ""
-
-#: swift/obj/updater.py:65
-msgid "Begin object update sweep"
-msgstr ""
-
-#: swift/obj/updater.py:89
-#, python-format
-msgid ""
-"Object update sweep of %(device)s completed: %(elapsed).02fs, %(success)s"
-" successes, %(fail)s failures"
-msgstr ""
-
-#: swift/obj/updater.py:98
-#, python-format
-msgid "Object update sweep completed: %.02fs"
-msgstr ""
-
-#: swift/obj/updater.py:105
-msgid "Begin object update single threaded sweep"
-msgstr ""
-
-#: swift/obj/updater.py:117
-#, python-format
-msgid ""
-"Object update single threaded sweep completed: %(elapsed).02fs, "
-"%(success)s successes, %(fail)s failures"
-msgstr ""
-
-#: swift/obj/updater.py:157
-#, python-format
-msgid "ERROR Pickle problem, quarantining %s"
-msgstr ""
-
-#: swift/obj/updater.py:177
-#, python-format
-msgid "Update sent for %(obj)s %(path)s"
-msgstr ""
-
-#: swift/obj/updater.py:182
-#, python-format
-msgid "Update failed for %(obj)s %(path)s"
-msgstr ""
-
-#: swift/obj/updater.py:206
-#, python-format
-msgid "ERROR with remote server %(ip)s:%(port)s/%(device)s"
-msgstr ""
-
-#: swift/proxy/server.py:165 swift/proxy/server.py:629
-#: swift/proxy/server.py:696 swift/proxy/server.py:712
-#: swift/proxy/server.py:721 swift/proxy/server.py:1004
-#: swift/proxy/server.py:1044 swift/proxy/server.py:1089
-msgid "Object"
-msgstr ""
-
-#: swift/proxy/server.py:170
-#, python-format
-msgid "Could not load object segment %(path)s: %(status)s"
-msgstr ""
-
-#: swift/proxy/server.py:177 swift/proxy/server.py:210
-#: swift/proxy/server.py:257
-#, python-format
-msgid "ERROR: While processing manifest /%(acc)s/%(cont)s/%(obj)s"
-msgstr ""
-
-#: swift/proxy/server.py:292
-#, python-format
-msgid "%(msg)s %(ip)s:%(port)s"
-msgstr ""
-
-#: swift/proxy/server.py:304
-#, python-format
-msgid "ERROR with %(type)s server %(ip)s:%(port)s/%(device)s re: %(info)s"
-msgstr ""
-
-#: swift/proxy/server.py:328
-#, python-format
-msgid "Node error limited %(ip)s:%(port)s (%(device)s)"
-msgstr ""
-
-#: swift/proxy/server.py:388 swift/proxy/server.py:1451
-#: swift/proxy/server.py:1497 swift/proxy/server.py:1545
-#: swift/proxy/server.py:1590
-msgid "Account"
-msgstr ""
-
-#: swift/proxy/server.py:389
-#, python-format
-msgid "Trying to get account info for %s"
-msgstr ""
-
-#: swift/proxy/server.py:466 swift/proxy/server.py:740
-#: swift/proxy/server.py:772 swift/proxy/server.py:1214
-#: swift/proxy/server.py:1301 swift/proxy/server.py:1356
-#: swift/proxy/server.py:1413
-msgid "Container"
-msgstr ""
-
-#: swift/proxy/server.py:467
-#, python-format
-msgid "Trying to get container info for %s"
-msgstr ""
-
-#: swift/proxy/server.py:552
-#, python-format
-msgid "%(type)s returning 503 for %(statuses)s"
-msgstr ""
-
-#: swift/proxy/server.py:598 swift/proxy/server.py:697
-#, python-format
-msgid "Trying to %(method)s %(path)s"
-msgstr ""
-
-#: swift/proxy/server.py:627
-msgid "Client disconnected on read"
-msgstr ""
-
-#: swift/proxy/server.py:630
-#, python-format
-msgid "Trying to read during GET of %s"
-msgstr ""
-
-#: swift/proxy/server.py:653
-#, python-format
-msgid "ERROR %(status)d %(body)s From %(type)s Server"
-msgstr ""
-
-#: swift/proxy/server.py:692
-#, python-format
-msgid "ERROR %(status)d %(body)s From Object Server"
-msgstr ""
-
-#: swift/proxy/server.py:776 swift/proxy/server.py:783
-#, python-format
-msgid "Object manifest GET could not continue listing: %s %s"
-msgstr ""
-
-#: swift/proxy/server.py:905
-msgid "Object POST"
-msgstr ""
-
-#: swift/proxy/server.py:1005
-#, python-format
-msgid "Expect: 100-continue on %s"
-msgstr ""
-
-#: swift/proxy/server.py:1017
-#, python-format
-msgid "Object PUT returning 503, %(conns)s/%(nodes)s required connections"
-msgstr ""
-
-#: swift/proxy/server.py:1045
-#, python-format
-msgid "Trying to write to %s"
-msgstr ""
-
-#: swift/proxy/server.py:1049
-#, python-format
-msgid ""
-"Object PUT exceptions during send, %(conns)s/%(nodes)s required "
-"connections"
-msgstr ""
-
-#: swift/proxy/server.py:1058
-#, python-format
-msgid "ERROR Client read timeout (%ss)"
-msgstr ""
-
-#: swift/proxy/server.py:1063
-msgid "ERROR Exception causing client disconnect"
-msgstr ""
-
-#: swift/proxy/server.py:1068
-msgid "Client disconnected without sending enough data"
-msgstr ""
-
-#: swift/proxy/server.py:1083
-#, python-format
-msgid "ERROR %(status)d %(body)s From Object Server re: %(path)s"
-msgstr ""
-
-#: swift/proxy/server.py:1090
-#, python-format
-msgid "Trying to get final status of PUT to %s"
-msgstr ""
-
-#: swift/proxy/server.py:1093
-#, python-format
-msgid "Object servers returned %s mismatched etags"
-msgstr ""
-
-#: swift/proxy/server.py:1101
-msgid "Object PUT"
-msgstr ""
-
-#: swift/proxy/server.py:1153
-msgid "Object DELETE"
-msgstr ""
-
-#: swift/proxy/server.py:1302 swift/proxy/server.py:1498
-#, python-format
-msgid "Trying to PUT to %s"
-msgstr ""
-
-#: swift/proxy/server.py:1314
-msgid "Container PUT"
-msgstr ""
-
-#: swift/proxy/server.py:1357 swift/proxy/server.py:1546
-#, python-format
-msgid "Trying to POST %s"
-msgstr ""
-
-#: swift/proxy/server.py:1369
-msgid "Container POST"
-msgstr ""
-
-#: swift/proxy/server.py:1414 swift/proxy/server.py:1591
-#, python-format
-msgid "Trying to DELETE %s"
-msgstr ""
-
-#: swift/proxy/server.py:1426
-msgid "Container DELETE"
-msgstr ""
-
-#: swift/proxy/server.py:1433
-msgid "Returning 503 because not all container nodes confirmed DELETE"
-msgstr ""
-
-#: swift/proxy/server.py:1508
-msgid "Account PUT"
-msgstr ""
-
-#: swift/proxy/server.py:1556
-msgid "Account POST"
-msgstr ""
-
-#: swift/proxy/server.py:1601
-msgid "Account DELETE"
-msgstr ""
-
-#: swift/proxy/server.py:1757
-msgid "ERROR Unhandled exception in request"
-msgstr ""
-
-#: swift/stats/access_processor.py:63 swift/stats/stats_processor.py:40
-#, python-format
-msgid "Bad line data: %s"
-msgstr ""
-
-#: swift/stats/access_processor.py:67
-#, python-format
-msgid "Bad server name: found \"%(found)s\" expected \"%(expected)s\""
-msgstr ""
-
-#: swift/stats/access_processor.py:75
-#, python-format
-msgid "Invalid path: %(error)s from data: %(log)s"
-msgstr ""
-
-#: swift/stats/access_processor.py:199
-#, python-format
-msgid "I found a bunch of bad lines in %(name)s (%(bad)d bad, %(total)d total)"
-msgstr ""
-
-#: swift/stats/account_stats.py:55
-msgid "Gathering account stats"
-msgstr ""
-
-#: swift/stats/account_stats.py:59
-#, python-format
-msgid "Gathering account stats complete (%0.2f minutes)"
-msgstr ""
-
-#: swift/stats/account_stats.py:75
-#, python-format
-msgid "Device %s is not mounted, skipping."
-msgstr ""
-
-#: swift/stats/account_stats.py:81
-#, python-format
-msgid "Path %s does not exist, skipping."
-msgstr ""
-
-#: swift/stats/log_processor.py:62
-#, python-format
-msgid "Loaded plugin \"%s\""
-msgstr ""
-
-#: swift/stats/log_processor.py:79
-#, python-format
-msgid "Processing %(obj)s with plugin \"%(plugin)s\""
-msgstr ""
-
-#: swift/stats/log_processor.py:179
-#, python-format
-msgid "Bad compressed data for %s"
-msgstr ""
-
-#: swift/stats/log_processor.py:240
-msgid "Beginning log processing"
-msgstr ""
-
-#: swift/stats/log_processor.py:278
-#, python-format
-msgid "found %d processed files"
-msgstr ""
-
-#: swift/stats/log_processor.py:283
-#, python-format
-msgid "loaded %d files to process"
-msgstr ""
-
-#: swift/stats/log_processor.py:286 swift/stats/log_processor.py:360
-#, python-format
-msgid "Log processing done (%0.2f minutes)"
-msgstr ""
-
-#: swift/stats/log_uploader.py:71
-msgid "Uploading logs"
-msgstr ""
-
-#: swift/stats/log_uploader.py:74
-#, python-format
-msgid "Uploading logs complete (%0.2f minutes)"
-msgstr ""
-
-#: swift/stats/log_uploader.py:129
-#, python-format
-msgid "Unexpected log: %s"
-msgstr ""
-
-#: swift/stats/log_uploader.py:135
-#, python-format
-msgid "Skipping log: %(file)s (< %(cutoff)d seconds old)"
-msgstr ""
-
-#: swift/stats/log_uploader.py:142
-#, python-format
-msgid "Log %s is 0 length, skipping"
-msgstr ""
-
-#: swift/stats/log_uploader.py:144
-#, python-format
-msgid "Processing log: %s"
-msgstr ""
-
-#: swift/stats/log_uploader.py:165
-#, python-format
-msgid "Uploaded log %(file)s to %(target)s"
-msgstr ""
-
-#: swift/stats/log_uploader.py:170
-#, python-format
-msgid "ERROR: Upload of log %s failed!"
-msgstr ""
-
diff --git a/lower-constraints.txt b/lower-constraints.txt
new file mode 100644
index 0000000000..ee9acd4d94
--- /dev/null
+++ b/lower-constraints.txt
@@ -0,0 +1,81 @@
+alabaster==0.7.10
+asn1crypto==0.24.0
+attrs==21.4.0
+Babel==2.5.3
+bandit==1.1.0
+boto==2.32.1
+boto3==1.9
+botocore==1.12
+castellan==0.13.0
+certifi==2018.1.18
+cffi==1.11.5
+chardet==3.0.4
+cliff==2.11.0
+cmd2==0.8.1
+coverage==5.2.1
+cryptography==2.0.2
+debtcollector==1.19.0
+dnspython==1.15.0
+docutils==0.15
+dulwich==0.19.0
+enum-compat==0.0.2
+eventlet==0.25.0
+extras==1.0.0
+fixtures==3.0.0
+future==0.16.0
+gitdb2==2.0.3
+GitPython==2.1.8
+greenlet==0.4.14
+idna==2.6
+imagesize==1.0.0
+iso8601==0.1.12
+Jinja2==2.10
+keystoneauth1==3.4.0
+keystonemiddleware==4.17.0
+linecache2==1.0.0
+lxml==4.2.3
+MarkupSafe==1.0
+mock==3.0
+monotonic==1.4
+msgpack==0.5.6
+netaddr==0.7.19
+netifaces==0.10.4
+oslo.config==5.2.0
+oslo.i18n==3.20.0
+oslo.log==3.22.0
+oslo.serialization==2.25.0
+oslo.utils==3.36.0
+PasteDeploy==2.0.0
+pbr==3.1.1
+prettytable==0.7.2
+pycparser==2.18
+pyeclib==1.3.1
+pykmip==0.7.0
+Pygments==2.2.0
+pyparsing==2.2.0
+pyperclip==1.6.0
+pytest==4.6.11
+pytest-cov==2.12.1
+pytest-subtests==0.3.0
+python-keystoneclient==3.19.0
+python-mimeparse==1.6.0
+python-subunit==1.2.0
+python-swiftclient==3.2.0
+pytz==2018.3
+PyYAML==3.12
+requests==2.14.2
+requests-mock==1.2.0
+rfc3986==1.1.0
+smmap2==2.0.3
+snowballstemmer==1.2.1
+stestr==2.0.0
+stevedore==1.28.0
+testtools==2.3.0
+traceback2==1.4.0
+unittest2==1.1.0
+urllib3==1.22
+voluptuous==0.11.1
+wrapt==1.10.11
+xattr==0.7.2
+pycadf===2.10.0
+python-dateutil===2.4.2
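
For reference, a lower-constraints file like the one above is applied with pip's -c flag. A minimal sketch, assuming a Swift checkout whose requirements.txt and test-requirements.txt sit next to this file:

    # Install dependencies pinned to the minimum versions the gate tests against.
    python3 -m venv .venv && . .venv/bin/activate
    pip install -r requirements.txt -r test-requirements.txt -c lower-constraints.txt
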
diff --git a/py3-constraints.txt b/py3-constraints.txt
new file mode 100644
index 0000000000..e947dda139
--- /dev/null
+++ b/py3-constraints.txt
@@ -0,0 +1,205 @@
+GitPython===3.1.45;python_version>='3.7'
+PasteDeploy===3.1.0;python_version>='3.7'
+PyJWT===2.10.1;python_version>='3.9'
+PyJWT===2.9.0;python_version=='3.8'
+# PyJWT only required for keystonemiddleware>10, which requires python>=3.8
+PyYAML===6.0.2;python_version>='3.8'
+PyYAML===6.0.1;python_version=='3.7'
+Pygments===2.19.2;python_version>='3.8'
+Pygments===2.17.2;python_version=='3.7'
+WebOb===1.8.9
+attrs===25.3.0;python_version>='3.8'
+attrs===24.2.0;python_version=='3.7'
+autopage===0.5.2
+bandit===1.8.6;python_version>='3.9'
+bandit===1.7.10;python_version=='3.8'
+bandit===1.7.5;python_version=='3.7'
+boto3===1.40.19;python_version>='3.9'
+boto3===1.37.38;python_version=='3.8'
+boto3===1.33.13;python_version=='3.7'
+botocore===1.40.19;python_version>='3.9'
+botocore===1.37.38;python_version=='3.8'
+botocore===1.33.13;python_version=='3.7'
+certifi===2025.8.3
+cffi===1.17.1;python_version>='3.8'
+cffi===1.15.1;python_version=='3.7'
+charset-normalizer===3.4.3;python_version>='3.7'
+cliff===4.11.0;python_version>='3.10'
+cliff===4.9.1;python_version=='3.9'
+cliff===4.7.0;python_version=='3.8'
+cliff===3.10.1;python_version=='3.7'
+cmd2===2.7.0;python_version>='3.9'
+cmd2===2.5.11;python_version=='3.8'
+cmd2===2.4.3;python_version=='3.7'
+coverage===7.10.5;python_version>='3.9'
+coverage===7.6.1;python_version=='3.8'
+coverage===7.2.7;python_version=='3.7'
+cryptography===45.0.6;python_version>='3.7'
+debtcollector===3.0.0;python_version>='3.8'
+debtcollector===2.5.0;python_version=='3.7'
+decorator===5.2.1;python_version>='3.8'
+decorator===5.1.1;python_version=='3.7'
+dnspython===2.7.0;python_version>='3.9'
+dnspython===2.6.1;python_version=='3.8'
+dnspython===2.3.0;python_version=='3.7'
+docutils===0.22;python_version>='3.9'
+docutils===0.20.1;python_version=='3.8'
+docutils===0.20.1;python_version=='3.7'
+dogpile.cache===1.4.0;python_version>='3.9'
+dogpile.cache===1.3.4;python_version=='3.8'
+dogpile.cache===1.2.2;python_version=='3.7'
+eventlet===0.40.3;python_version>='3.9'
+eventlet===0.39.1;python_version=='3.8'
+eventlet===0.38.2;python_version=='3.7'
+exceptiongroup===1.3.0;python_version>='3.7'
+extras===1.0.0
+fixtures===4.2.6;python_version>='3.9'
+fixtures===4.2.5;python_version=='3.8'
+fixtures===4.1.0;python_version=='3.7'
+flake8===7.1.2;python_version>='3.8'
+flake8===3.8.4;python_version=='3.7'
+gitdb===4.0.12;python_version>='3.7'
+greenlet===3.2.4;python_version>='3.9'
+greenlet===3.1.1;python_version=='3.8'
+greenlet===3.1.1;python_version=='3.7'
+hacking===7.0.0;python_version>='3.8'
+hacking===4.1.0;python_version=='3.7'
+idna===3.10
+importlib-metadata===8.7.0;python_version>='3.9'
+importlib-metadata===8.5.0;python_version=='3.8'
+importlib-metadata===6.7.0;python_version=='3.7'
+importlib-resources===6.5.2;python_version>='3.9'
+importlib-resources===6.4.5;python_version=='3.8'
+importlib-resources===5.12.0;python_version=='3.7'
+iniconfig===2.1.0;python_version>='3.8'
+iniconfig===2.0.0;python_version=='3.7'
+iso8601===2.1.0;python_version>='3.7'
+jmespath===1.0.1;python_version>='3.7'
+keystoneauth1===5.12.0;python_version>='3.10'
+keystoneauth1===5.11.1;python_version=='3.9'
+keystoneauth1===5.8.1;python_version=='3.8'
+keystoneauth1===5.1.3;python_version=='3.7'
+keystonemiddleware===10.12.0;python_version>='3.10'
+keystonemiddleware===10.11.0;python_version=='3.9'
+keystonemiddleware===10.7.1;python_version=='3.8'
+keystonemiddleware===9.5.0;python_version=='3.7'
+lxml===6.0.1;python_version>='3.8'
+lxml===5.4.0;python_version=='3.7'
+markdown-it-py===4.0.0;python_version>='3.10'
+markdown-it-py===3.0.0;python_version=='3.9'
+markdown-it-py===3.0.0;python_version=='3.8'
+markdown-it-py===2.2.0;python_version=='3.7'
+mccabe===0.7.0;python_version>='3.8'
+mccabe===0.6.1;python_version=='3.7'
+mdurl===0.1.2;python_version>='3.7'
+mock===5.2.0
+msgpack===1.1.1;python_version>='3.8'
+msgpack===1.0.5;python_version=='3.7'
+netaddr===1.3.0;python_version>='3.7'
+netifaces===0.11.0
+os-service-types===1.8.0;python_version>='3.10'
+os-service-types===1.7.0;python_version=='3.9'
+os-service-types===1.7.0;python_version=='3.8'
+os-service-types===1.7.0;python_version=='3.7'
+oslo.cache===3.12.0;python_version>='3.9'
+oslo.cache===3.8.0;python_version=='3.8'
+oslo.cache===2.11.0;python_version=='3.7'
+oslo.config===10.0.0;python_version>='3.9'
+oslo.config===9.6.0;python_version=='3.8'
+oslo.config===8.8.1;python_version=='3.7'
+oslo.context===6.1.0;python_version>='3.9'
+oslo.context===5.6.0;python_version=='3.8'
+oslo.context===4.1.0;python_version=='3.7'
+oslo.i18n===6.6.0;python_version>='3.9'
+oslo.i18n===6.4.0;python_version=='3.8'
+oslo.i18n===5.1.0;python_version=='3.7'
+oslo.log===7.2.1;python_version>='3.9'
+oslo.log===6.1.2;python_version=='3.8'
+oslo.log===4.8.0;python_version=='3.7'
+oslo.serialization===5.8.0;python_version>='3.9'
+oslo.serialization===5.5.0;python_version=='3.8'
+oslo.serialization===4.3.0;python_version=='3.7'
+oslo.utils===9.1.0;python_version>='3.9'
+oslo.utils===7.3.0;python_version=='3.8'
+oslo.utils===4.13.0;python_version=='3.7'
+packaging===25.0;python_version>='3.8'
+packaging===24.0;python_version=='3.7'
+pbr===7.0.1
+pluggy===1.6.0;python_version>='3.9'
+pluggy===1.5.0;python_version=='3.8'
+pluggy===1.2.0;python_version=='3.7'
+prettytable===3.16.0;python_version>='3.9'
+prettytable===3.11.0;python_version=='3.8'
+prettytable===3.7.0;python_version=='3.7'
+py===1.11.0
+pycadf===4.0.1;python_version>='3.9'
+pycadf===4.0.0;python_version=='3.8'
+pycadf===3.1.1;python_version=='3.7'
+pycodestyle===2.12.1;python_version>='3.8'
+pycodestyle===2.6.0;python_version=='3.7'
+pycparser===2.22;python_version>='3.8'
+pycparser===2.21;python_version=='3.7'
+pyeclib===1.6.4
+pyflakes===3.2.0;python_version>='3.8'
+pyflakes===2.2.0;python_version=='3.7'
+pyinotify===0.9.6
+pyparsing===3.2.3;python_version>='3.9'
+pyparsing===3.1.4;python_version=='3.8'
+pyparsing===3.1.4;python_version=='3.7'
+pyperclip===1.9.0
+pytest===8.4.1;python_version>='3.9'
+pytest===8.3.5;python_version=='3.8'
+pytest===7.4.4;python_version=='3.7'
+pytest-cov===6.2.1;python_version>='3.9'
+pytest-cov===5.0.0;python_version=='3.8'
+pytest-cov===4.1.0;python_version=='3.7'
+python-dateutil===2.9.0.post0
+python-keystoneclient===5.7.0;python_version>='3.10'
+python-keystoneclient===5.6.0;python_version=='3.9'
+python-keystoneclient===5.5.0;python_version=='3.8'
+python-keystoneclient===4.5.0;python_version=='3.7'
+python-subunit===1.4.4;python_version>='3.7'
+python-swiftclient===4.8.0
+pytz===2025.2
+requests===2.32.5;python_version>='3.9'
+requests===2.32.4;python_version=='3.8'
+requests===2.31.0;python_version=='3.7'
+requests-mock===1.12.1
+rfc3986===2.0.0;python_version>='3.7'
+rich===14.1.0;python_version>='3.8'
+rich===13.8.1;python_version=='3.7'
+s3transfer===0.13.1;python_version>='3.9'
+s3transfer===0.11.5;python_version=='3.8'
+s3transfer===0.8.2;python_version=='3.7'
+setuptools===75.3.0;python_version>='3.12'
+smmap===5.0.2;python_version>='3.7'
+stestr===4.2.0;python_version>='3.8'
+stestr===4.1.0;python_version=='3.7'
+stevedore===5.5.0;python_version>='3.9'
+stevedore===5.3.0;python_version=='3.8'
+stevedore===3.5.2;python_version=='3.7'
+testtools===2.7.2;python_version>='3.8'
+testtools===2.7.1;python_version=='3.7'
+tomli===2.2.1;python_version>='3.8'
+tomli===2.0.1;python_version=='3.7'
+tomlkit===0.13.3;python_version>='3.8'
+tomlkit===0.12.5;python_version=='3.7'
+typing_extensions===4.15.0;python_version>='3.9'
+typing_extensions===4.13.2;python_version=='3.8'
+typing_extensions===4.7.1;python_version=='3.7'
+tzdata===2025.2;python_version>='3.9'
+urllib3===2.5.0;python_version>='3.10'
+urllib3===1.26.20;python_version=='3.9'
+urllib3===1.26.20;python_version=='3.8'
+urllib3===1.26.20;python_version=='3.7'
+voluptuous===0.15.2;python_version>='3.9'
+voluptuous===0.14.2;python_version=='3.8'
+voluptuous===0.14.1;python_version=='3.7'
+wcwidth===0.2.13
+wrapt===1.17.3;python_version>='3.8'
+wrapt===1.16.0;python_version=='3.7'
+xattr===1.2.0;python_version>='3.8'
+xattr===0.10.1;python_version=='3.7'
+zipp===3.23.0;python_version>='3.9'
+zipp===3.20.2;python_version=='3.8'
+zipp===3.15.0;python_version=='3.7'
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..e63322f962
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["pbr>=6.0.0", "setuptools>=64"]
+build-backend = "pbr.build"
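
The pyproject.toml above only declares the PEP 517 build backend (pbr.build). A sketch of building the package with it, assuming the generic `build` frontend is installed:

    # The frontend creates an isolated build env, fetches pbr>=6.0.0 and
    # setuptools>=64 as declared above, and asks pbr.build for an sdist and wheel.
    pip install build
    python -m build
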
diff --git a/releasenotes/notes/2_10_0_release-666a76f4975657a5.yaml b/releasenotes/notes/2_10_0_release-666a76f4975657a5.yaml
new file mode 100644
index 0000000000..4b2e889112
--- /dev/null
+++ b/releasenotes/notes/2_10_0_release-666a76f4975657a5.yaml
@@ -0,0 +1,59 @@
+---
+features:
+ - >
+   Object versioning now supports a "history" mode in addition to
+   the older "stack" mode. The difference is in how DELETE requests
+   are handled. For full details, please read
+   https://docs.openstack.org/swift/latest/overview_object_versioning.html.
+ - >
+   New config variables to change the schedule priority and I/O
+   scheduling class. Servers and daemons now understand
+   `nice_priority`, `ionice_class`, and `ionice_priority` to
+   schedule their relative importance. Please read
+   https://docs.openstack.org/swift/latest/deployment_guide.html
+   for full config details.
+ - >
+   On newer kernels (3.15+ when using xfs), Swift will use the O_TMPFILE
+   flag when opening a file instead of creating a temporary file
+   and renaming it on commit. This makes the data path simpler and
+   allows the filesystem to more efficiently optimize the files on
+   disk, resulting in better performance.
+ - >
+   Erasure code GET performance has been significantly
+   improved in clusters that are not completely healthy.
+ - >
+   Significant improvements to the api-ref doc available at
+   https://developer.openstack.org/api-ref/object-storage/.
+ - >
+   A PUT or POST to a container will now update the container's
+   Last-Modified time, and that value will be included in a
+   GET/HEAD response.
+ - >
+   Include object sysmeta in POST responses. Sysmeta is still
+   stripped from the response before being sent to the client, but
+   this allows middleware to make use of the information.
+upgrade:
+ - >
+   Update dnspython dependency to 1.14, removing the need to have
+   separate dnspython dependencies for Py2 and Py3.
+ - >
+   Deprecate swift-temp-url and call python-swiftclient's
+   implementation instead. This adds python-swiftclient as an
+   optional dependency of Swift.
+ - >
+   Moved other-requirements.txt to bindep.txt. bindep.txt lists
+   non-python dependencies of Swift.
+fixes:
+ - >
+   Fixed a bug where a container listing delimiter wouldn't work
+   with encryption.
+ - >
+   Fixed a bug where some headers weren't being copied correctly
+   in a COPY request.
+ - >
+   Container sync can now copy SLOs more efficiently by allowing
+   the manifest to be synced before all of the referenced segments.
+   This fixes a bug where container sync would not copy SLO manifests.
+ - Fixed a bug where some tombstone files might never be reclaimed.
+other:
+  - Various other minor bug fixes and improvements.
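
The nice_priority / ionice_class / ionice_priority options mentioned in the notes above are plain per-server config settings. A minimal sketch for one server, with purely illustrative values (consult the deployment guide for the accepted classes and ranges):

    [object-server]
    # Illustrative only: slightly lower CPU priority, best-effort I/O class,
    # lowest best-effort I/O priority.
    nice_priority = 10
    ionice_class = IOPRIO_CLASS_BE
    ionice_priority = 7
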
diff --git a/releasenotes/notes/2_11_0_release-ac1d256e455d347e.yaml b/releasenotes/notes/2_11_0_release-ac1d256e455d347e.yaml
new file mode 100644
index 0000000000..f07982bbc0
--- /dev/null
+++ b/releasenotes/notes/2_11_0_release-ac1d256e455d347e.yaml
@@ -0,0 +1,54 @@
+---
+features:
+  - >
+    The improvements to EC reads made in Swift 2.10.0 have also been
+    applied to the reconstructor. This allows fragments to be rebuilt
+    in more circumstances, resulting in faster recovery from failures.
+  - >
+    Instead of using a separate .durable file to indicate the
+    durable status of an EC fragment archive, we rename the .data
+    to include a durable marker in the filename. This saves one
+    inode for every EC .data file. Existing .durable files will not
+    be removed, and they will continue to work just fine.
+  - >
+    Closed a bug where ssync may have written bad fragment data in
+    some circumstances. A check was added to ensure the correct number
+    of bytes is written for a fragment before finalizing the write.
+    Also, erasure coded fragment metadata will now be validated on read
+    requests and, if bad data is found, the fragment will be quarantined.
+  - Added a configurable URL base to staticweb.
+  - Support multi-range GETs for static large objects.
+  - >
+    TempURLs using the "inline" parameter can now also set the
+    "filename" parameter. Both are used in the Content-Disposition
+    response header.
+  - Mirror X-Trans-Id to X-Openstack-Request-Id.
+  - >
+    SLO will now concurrently HEAD segments, resulting in much faster
+    manifest validation and object creation. By default, two HEAD requests
+    will be done at a time, but this can be changed by the operator via
+    the new `concurrency` setting in the "[filter:slo]" section of
+    the proxy server config.
+  - Suppressed the KeyError message when auditor finds an expired object.
+  - Daemons using InternalClient can now be properly killed with SIGTERM.
+  - >
+    Added a "user" option to the drive-audit config file. Its value is
+    used to set the owner of the drive-audit recon cache.
+  - >
+    Throttle update_auditor_status calls so it updates no more than once
+    per minute.
+  - Suppress unexpected-file warnings for rsync temp files.
+upgrade:
+  - Updated the PyECLib dependency to 1.3.1.
+  - >
+    Note that after writing EC data with Swift 2.11.0 or later, that
+    data will not be accessible to earlier versions of Swift.
+critical:
+  - >
+    WARNING: If you are using the ISA-L library for erasure codes,
+    please upgrade to liberasurecode 1.3.1 (or later) as soon as
+    possible. If you are using isa_l_rs_vand with more than 4 parity,
+    please read https://bugs.launchpad.net/swift/+bug/1639691 and take
+    necessary action.
+other:
+  - Various other minor bug fixes and improvements.
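
The SLO `concurrency` setting described above lives in the proxy pipeline's SLO filter section. A sketch with an illustrative value:

    [filter:slo]
    use = egg:swift#slo
    # Illustrative: HEAD four segments at a time instead of the default two.
    concurrency = 4
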
diff --git a/releasenotes/notes/2_12_0_release-06af226abc7b91ef.yaml b/releasenotes/notes/2_12_0_release-06af226abc7b91ef.yaml
new file mode 100644
index 0000000000..0246fe5f3a
--- /dev/null
+++ b/releasenotes/notes/2_12_0_release-06af226abc7b91ef.yaml
@@ -0,0 +1,57 @@
+---
+features:
+  - >
+    Ring files now include byteorder information about the endian of
+    the machine used to generate the file, and the values are
+    appropriately byteswapped if deserialized on a machine with a
+    different endianness.
+
+    Newly created ring files will be byteorder agnostic, but
+    previously generated ring files will still fail on different
+    endian architectures. Regenerating older ring files will cause
+    them to become byteorder agnostic. The regeneration of the ring
+    files will not cause any new data movement. Newer ring files
+    will still be usable by older versions of Swift (on machines
+    with the same endianness--this maintains existing behavior).
+  - >
+    All 416 responses will now include a Content-Range header with
+    an unsatisfied-range value. This allows the caller to know the
+    valid range request value for an object.
+  - >
+    TempURLs now support a validation against a common prefix. A
+    prefix-based signature grants access to all objects which share the
+    same prefix. This avoids the creation of a large amount of signatures,
+    when a whole container or pseudofolder is shared.
+  - >
+    In SLO manifests, the `etag` and `size_bytes` keys are now fully
+    optional and not required. Previously, the keys needed to exist
+    but the values were optional. The only required key is `path`.
+  - Respect server type for --md5 check in swift-recon.
+fixes:
+  - Correctly handle deleted files with if-none-match requests.
+  - >
+    Correctly send 412 Precondition Failed if a user sends an
+    invalid copy destination. Previously Swift would send a 500
+    Internal Server Error.
+  - Fixed a rare infinite loop in `swift-ring-builder` while placing parts.
+  - >
+    Ensure update of the container by object-updater, removing a rare
+    possibility that objects would never be added to a container listing.
+  - >
+    Fixed non-deterministic suffix updates in hashes.pkl where a partition
+    may be updated much less often than expected.
+  - >
+    Fixed regression in consolidate_hashes that occurred when a new
+    file was stored to a new suffix in a non-empty partition. This bug
+    was introduced in 2.7.0 and could cause an increase in rsync
+    replication stats during and after upgrade, due to inconsistent
+    hashing of partition suffixes.
+  - >
+    Account and container databases will now be quarantined if the
+    database schema has been corrupted.
+  - Remove empty db hash and suffix directories if a db gets quarantined.
+other:
+  - >
+    Removed "in-process-" from func env tox name to work with
+    upstream CI.
+  - Various other minor bug fixes and improvements.
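
To illustrate the relaxed SLO manifest schema noted above (only `path` is required per segment), a hedged sketch of a manifest PUT; the host, account, container and segment names are placeholders:

    curl -i -X PUT \
      -H "X-Auth-Token: $TOKEN" \
      -d '[{"path": "/segments/report.part1"}, {"path": "/segments/report.part2"}]' \
      "https://swift.example.com/v1/AUTH_test/docs/report?multipart-manifest=put"
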
diff --git a/releasenotes/notes/2_13_0_release-875e1fb1ef59f015.yaml b/releasenotes/notes/2_13_0_release-875e1fb1ef59f015.yaml
new file mode 100644
index 0000000000..cd922910d7
--- /dev/null
+++ b/releasenotes/notes/2_13_0_release-875e1fb1ef59f015.yaml
@@ -0,0 +1,83 @@
+---
+features:
+  - >
+    Improved performance by eliminating an unneeded directory
+    structure hash.
+  - >
+    Optimized the common case for hashing filesystem trees, thus
+    eliminating a lot of extraneous disk I/O.
+  - >
+    Updated the `hashes.pkl` file format to include timestamp information
+    for race detection. Also simplified hashing logic to prevent race
+    conditions and optimize for the common case.
+  - >
+    The erasure code reconstructor will now shuffle work jobs across all
+    disks instead of going disk-by-disk. This eliminates single-disk I/O
+    contention and allows continued scaling as concurrency is increased.
+  - >
+    Erasure code reconstruction handles moving data from handoff nodes
+    better. Instead of moving the data to another handoff, it waits
+    until it can be moved to a primary node.
+  - >
+    Temporary URLs now support one common form of ISO 8601 timestamps in
+    addition to Unix seconds-since-epoch timestamps. The ISO 8601 format
+    accepted is '%Y-%m-%dT%H:%M:%SZ'. This makes TempURLs more
+    user-friendly to produce and consume.
+  - >
+    Listing containers in accounts with json or xml now includes a
+    `last_modified` time. This does not change any on-disk data, but simply
+    exposes the value to offer consistency with the object listings on
+    containers.
+  - I/O priority is now supported on AArch64 architecture.
+upgrade:
+  - If you upgrade and roll back, you must delete all `hashes.pkl` files.
+deprecations:
+  - >
+    If using erasure coding with ISA-L in rs_vand mode and 5 or more parity
+    fragments, Swift will emit a warning. This is a configuration that is
+    known to harm data durability. In a future release, this warning will be
+    upgraded to an error unless the policy is marked as deprecated. All data
+    in an erasure code storage policy using isa_l_rs_vand with 5 or more
+    parity should be migrated as soon as possible. Please see
+    https://bugs.launchpad.net/swift/+bug/1639691 for more information.
+  - >
+    The erasure code reconstructor `handoffs_first` option has been
+    deprecated in favor of `handoffs_only`. `handoffs_only` is far more
+    useful, and just like `handoffs_first` mode in the replicator, it gives
+    the operator the option of forcing the consistency engine to focus
+    solely on revert (handoff) jobs, thus improving the speed of
+    rebalances.  The `handoffs_only` behavior is somewhat consistent with
+    the replicator's `handoffs_first` option (any error on any handoff in
+    the replicator will make it essentially handoff only forever) but the
+    `handoffs_only` option does what you want and is named correctly in the
+    reconstructor.
+  - >
+    The default for `object_post_as_copy` has been changed to False. The
+    option is now deprecated and will be removed in a future release. If
+    your cluster is still running with post-as-copy enabled, please update
+    it to use the "fast-post" method. Future versions of Swift will not
+    support post-as-copy, and future features will not be supported under
+    post-as-copy. ("Fast-post" is where `object_post_as_copy` is false).
+fixes:
+  - >
+    Fixed a bug where the ring builder would not allow removal of a device
+    when min_part_seconds_left was greater than zero.
+  - >
+    PUT subrequests generated from a client-side COPY will now properly log
+    the SSC (server-side copy) Swift source field. See
+    https://docs.openstack.org/developer/swift/logs.html#swift-source for
+    more information.
+  - >
+    Fixed a bug where an SLO download with a range request may have resulted
+    in a 5xx series response.
+  - >
+    SLO manifest PUT requests can now be properly validated by sending an
+    ETag header of the md5 sum of the concatenated md5 sums of the
+    referenced segments.
+  - Fixed the stats calculation in the erasure code reconstructor.
+  - >
+    Rings with min_part_hours set to zero will now only move one partition
+    replica per rebalance, thus matching behavior when min_part_hours is
+    greater than zero.
+other:
+  - Various other minor bug fixes and improvements.
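
A sketch of a TempURL request using the ISO 8601 expiry form described above; the host, path and signature are placeholders, and the query parameters are the standard temp_url_sig / temp_url_expires:

    # The expiry uses the newly accepted '%Y-%m-%dT%H:%M:%SZ' form instead of
    # Unix seconds-since-epoch; $SIG stands in for the computed signature.
    curl -i "https://swift.example.com/v1/AUTH_test/container/obj?temp_url_sig=$SIG&temp_url_expires=2017-06-01T12:00:00Z"
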
diff --git a/releasenotes/notes/2_14_0_release-7c3ef515ebded888.yaml b/releasenotes/notes/2_14_0_release-7c3ef515ebded888.yaml
new file mode 100644
index 0000000000..0307a4985a
--- /dev/null
+++ b/releasenotes/notes/2_14_0_release-7c3ef515ebded888.yaml
@@ -0,0 +1,41 @@
+---
+features:
+  - EC Fragment Duplication - Foundational Global EC Cluster Support.
+  - name_check and cname_lookup keys have been added to `/info`.
+  - Add Vary headers for CORS responses.
+  - Always set Swift processes to use UTC.
+  - >
+    Removed per-device reconstruction stats. Now that the reconstructor
+    is shuffling parts before going through them, those stats no longer
+    make sense.
+  - domain_remap now accepts a list of domains in "storage_domain".
+  - Do not follow CNAME when host is in storage_domain.
+  - >
+    Enable cluster-wide CORS Expose-Headers setting via
+    "cors_expose_headers".
+  - Cache all answers from nameservers in cname_lookup.
+fixes:
+  - >
+    Fixed an error where a container drive failure resulted in double space
+    usage on the remaining drives. When a drive with a container or account
+    database is unmounted, the bug would create handoff replicas on all
+    remaining drives, increasing the drive space used and filling the cluster.
+  - >
+    Fixed UnicodeDecodeError in the object reconstructor that would
+    prevent objects with non-ascii names from being reconstructed and
+    caused the reconstructor process to hang.
+  - >
+    Fixed encoding issue in ssync where a mix of ascii and non-ascii
+    metadata values would cause an error.
+  - Log the correct request type of a subrequest downstream of copy.
+  - >
+    Prevent logged traceback in object-server on client disconnect for
+    chunked transfers to replicated policies.
+  - >
+    Fixed a race condition in updating hashes.pkl where a partition
+    suffix invalidation may have been skipped.
+  - Include received fragment index in reconstructor log warnings.
+  - Log correct status code for conditional requests.
+other:
+  - Drop support for auth-server from common/manager.py and `swift-init`.
+  - Various other minor bug fixes and improvements.
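
The storage_domain list support called out above is configured on the domain_remap filter. A minimal sketch with placeholder domains:

    [filter:domain_remap]
    use = egg:swift#domain_remap
    # Illustrative: multiple storage domains, comma-separated.
    storage_domain = example.com, storage.example.org
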
diff --git a/releasenotes/notes/2_15_0_release-0a05a011fb85a9c9.yaml b/releasenotes/notes/2_15_0_release-0a05a011fb85a9c9.yaml
new file mode 100644
index 0000000000..e0b239ddbc
--- /dev/null
+++ b/releasenotes/notes/2_15_0_release-0a05a011fb85a9c9.yaml
@@ -0,0 +1,102 @@
+---
+features:
+  - |
+    Add Composite Ring Functionality
+
+    A composite ring comprises two or more component rings that are
+    combined to form a single ring with a replica count equal to the
+    sum of the component rings. The component rings are built
+    independently, using distinct devices in distinct regions, which
+    means that the dispersion of replicas between the components can
+    be guaranteed.
+
+    Composite rings can be used for explicit replica placement and
+    "replicated EC" for global erasure codes policies.
+
+    Composite rings support 'cooperative' rebalance which means that
+    during rebalance all component rings will be consulted before a
+    partition is moved in any component ring. This avoids the same
+    partition being simultaneously moved in multiple components.
+
+    We do not yet have CLI tools for creating composite rings, but
+    the functionality has been enabled in the ring modules to
+    support this advanced functionality. CLI tools will be delivered
+    in a subsequent release.
+
+    For further information see the
+    `docs `__
+  - |
+    The EC reconstructor process has been dramatically improved by
+    adding support for multiple concurrent workers. Multiple
+    processes are required to get high concurrency, and this change
+    results in much faster rebalance times on servers with many
+    drives.
+
+    Currently the default is still only one process, and no workers.
+    Set ``reconstructor_workers`` in the ``[object-reconstructor]``
+    section to some whole number <= the number of devices on a node
+    to get that many reconstructor workers.
+  - |
+    Add support to increase object ring partition power transparently
+    to end users and with no cluster downtime. Increasing the ring
+    part power allows for incremental adjustment to the upper bound
+    of the cluster size. Please review the
+    `full docs `__
+    for more information.
+  - |
+    Added support for per-policy proxy config options. This allows
+    per-policy affinity options to be set for use with duplicated EC
+    policies and composite rings. Certain options found in per-policy
+    conf sections will override their equivalents that may be set
+    in the [app:proxy-server] section. Currently the options handled that
+    way are ``sorting_method``, ``read_affinity``, ``write_affinity``,
+    ``write_affinity_node_count``, and ``write_affinity_handoff_delete_count``.
+  - Enabled versioned writes on Dynamic Large Objects (DLOs).
+  - |
+    Write-affinity aware object deletion
+
+    Previously, when deleting objects in a multi-region Swift
+    deployment with write affinity configured, users always got a 404
+    when deleting an object before it was replicated to the appropriate nodes.
+
+    Now Swift will use ``write_affinity_handoff_delete_count`` to
+    determine how many local handoff nodes it should query to
+    get more candidates for the final response. The default value
+    "auto" means Swift will calculate the number automatically based
+    on the number of replicas and current cluster topology.
+  - |
+    Require that known-bad EC schemes be deprecated
+
+    Erasure-coded storage policies using ``isa_l_rs_vand`` and ``nparity``
+    >= 5 must be configured as deprecated, preventing any new
+    containers from being created with such a policy. This
+    configuration is known to harm data durability. Any data in such
+    policies should be migrated to a new policy. See
+    `Launchpad bug 1639691 <https://bugs.launchpad.net/swift/+bug/1639691>`__
+    for more information.
+  - |
+    Optimize the Erasure Code reconstructor protocol to reduce IO
+    load on servers.
+  - Fixed a bug where SSYNC would fail to replicate unexpired objects.
+  - Fixed a bug in domain_remap when obj starts/ends with slash.
+  - Fixed a socket leak in copy middleware when a large object was copied.
+  - Fixed a few areas where the ``swiftdir`` option was not respected.
+  - swift-recon now respects storage policy aliases.
+  - |
+    cname_lookup middleware now accepts a ``nameservers`` config
+    variable that, if defined, will be used for DNS lookups instead of
+    the system default.
+  - |
+    Make mount_check option usable in containerized environments by
+    adding a check for an ".ismount" file at the root directory of
+    a device.
+  - Remove deprecated ``vm_test_mode`` option.
+  - |
+    The object and container server config option ``slowdown`` has been
+    deprecated in favor of the new ``objects_per_second`` and
+    ``containers_per_second`` options.
+  - |
+    The output of devices from ``swift-ring-builder`` has been reordered
+    by region, zone, ip, and device.
+  - Imported docs content from openstack-manuals project.
+  - Various other minor bug fixes and improvements.
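
The reconstructor_workers option described above goes in the object server config's reconstructor section. A sketch with an illustrative value:

    [object-reconstructor]
    # Illustrative: four worker processes; the notes above suggest a whole
    # number no larger than the number of devices on the node.
    reconstructor_workers = 4
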
diff --git a/releasenotes/notes/2_15_1_release-be25e67bfc5e886a.yaml b/releasenotes/notes/2_15_1_release-be25e67bfc5e886a.yaml
new file mode 100644
index 0000000000..5c8ae8787a
--- /dev/null
+++ b/releasenotes/notes/2_15_1_release-be25e67bfc5e886a.yaml
@@ -0,0 +1,19 @@
+---
+fixes:
+  - |
+    Fixed a bug introduced in 2.15.0 where the object reconstructor
+    would exit with a traceback if no EC policy was configured.
+  - |
+    Fixed deadlock when logging from a tpool thread.
+
+    The object server runs certain IO-intensive methods outside the
+    main pthread for performance. Previously, if one of those methods
+    tried to log, it could cause a crash that eventually led to an
+    object server with hundreds or thousands of greenthreads, all
+    deadlocked. The fix is to use a mutex that works across different
+    greenlets and different pthreads.
+  - |
+    The object reconstructor can now rebuild an EC fragment for an
+    expired object.
+other:
+  - Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_15_2_release-6996eccabba558b4.yaml b/releasenotes/notes/2_15_2_release-6996eccabba558b4.yaml
new file mode 100644
index 0000000000..74bb146012
--- /dev/null
+++ b/releasenotes/notes/2_15_2_release-6996eccabba558b4.yaml
@@ -0,0 +1,22 @@
+---
+fixes:
+  - >
+    Fixed a cache invalidation issue related to GET and PUT requests to
+    containers that would occasionally cause object PUTs to a container to
+    404 after the container had been successfully created.
+
+  - >
+    Removed a race condition where a POST to an SLO could modify the
+    X-Static-Large-Object metadata.
+
+  - Fixed rare socket leak on range requests to erasure-coded objects.
+
+  - Fixed SLO delete for accounts with non-ASCII names.
+
+  - >
+    Fixed an issue in COPY where concurrent requests may have copied the
+    wrong data.
+
+  - Fixed time skew when using X-Delete-After.
+
+  - Send ETag header in 206 Partial Content responses to SLO reads.
diff --git a/releasenotes/notes/2_16_0_release-d48cb9b2629df8ab.yaml b/releasenotes/notes/2_16_0_release-d48cb9b2629df8ab.yaml
new file mode 100644
index 0000000000..51b819b9c9
--- /dev/null
+++ b/releasenotes/notes/2_16_0_release-d48cb9b2629df8ab.yaml
@@ -0,0 +1,107 @@
+---
+features:
+  - Add checksum to object extended attributes.
+
+  - |
+    Let clients request heartbeats during SLO PUTs by including
+    the query parameter ``heartbeat=on``.
+
+    With heartbeating turned on, the proxy will start its response
+    immediately with 202 Accepted then send a single whitespace
+    character periodically until the request completes. At that
+    point, a final summary chunk will be sent which includes a
+    "Response Status" key indicating success or failure and (if
+    successful) an "Etag" key indicating the Etag of the resulting
+    SLO.
+
+  - |
+    Added support for retrieving the encryption root secret from an
+    external key management system. In practice, this is currently limited
+    to Barbican.
+
+  - |
+    Move listing formatting out to a new proxy middleware named
+    ``listing_formats``. ``listing_formats`` should be just right of the
+    first proxy-logging middleware, and left of most other
+    middlewares. If it is not already present, it will be
+    automatically inserted for you.
+
+    Note: if you have a custom middleware that makes account or
+    container listings, it will only receive listings in JSON format.
+
+  - |
+    Log deprecation warning for ``allow_versions`` in the container
+    server config. Configure the ``versioned_writes`` middleware in
+    the proxy server instead. This option will be ignored in a
+    future release.
+
+  - |
+    Replaced ``replication_one_per_device`` by custom count defined by
+    ``replication_concurrency_per_device``. The original config value
+    is deprecated, but continues to function for now. If both values
+    are defined, the old ``replication_one_per_device`` is ignored.
+
+  - |
+    Fixed a rare issue where multiple backend timeouts could result
+    in bad data being returned to the client.
+
+  - Cleaned up logged tracebacks when talking to memcached servers.
+
+  - |
+    Account and container replication stats logs now include
+    ``remote_merges``, the number of times a whole database was sent
+    to another node.
+
+  - |
+    Respond 400 Bad Request when Accept headers fail to parse
+    instead of returning 406 Not Acceptable.
+
+  - |
+    The ``domain_remap`` middleware now supports the
+    ``mangle_client_paths`` option. Its default "false" value changes
+    ``domain_remap`` parsing to stop stripping the ``path_root`` value
+    from URL paths. If users depend on this path mangling, operators
+    should set ``mangle_client_paths`` to "True" before upgrading.
+
+  - |
+    Remove ``swift-temp-url`` script. The functionality has been in
+    swiftclient for a long time and this script has been deprecated
+    since 2.10.0.
+
+  - |
+    Removed all ``post_as_copy`` related code and configs. The option
+    has been deprecated since 2.13.0.
+
+  - |
+    Fixed XML responses (eg on bulk extractions and SLO upload
+    failures) to be more correct. The enclosing "delete" tag was
+    removed where it doesn't make sense and replaced with "extract"
+    or "upload" depending on the context.
+
+  - |
+    Static Large Object (SLO) manifests may now (again) have zero-byte
+    last segments.
+
+  - |
+    Fixed an issue where background consistency daemon child
+    processes would deadlock waiting on the same file descriptor.
+
+  - |
+    Removed a race condition where a POST to an SLO could modify the
+    X-Static-Large-Object metadata.
+
+  - |
+    Accept a trade off of dispersion for balance in the ring builder
+    that will result in getting to balanced rings much more quickly
+    in some cases.
+
+  - |
+    Fixed using ``swift-ring-builder set_weight`` with more than one
+    device.
+
+  - |
+    When requesting objects, return 404 if a tombstone is found and
+    is newer than any data found. Previous behavior was to return
+    stale data.
+other:
+  - Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_17_0_release-bd35f18c41c5ef18.yaml b/releasenotes/notes/2_17_0_release-bd35f18c41c5ef18.yaml
new file mode 100644
index 0000000000..cdb9ff04c2
--- /dev/null
+++ b/releasenotes/notes/2_17_0_release-bd35f18c41c5ef18.yaml
@@ -0,0 +1,119 @@
+---
+features:
+  - |
+    Added symlink objects support.
+
+    Symlink objects reference one other object. They are created by
+    creating an empty object with an X-Symlink-Target header. The value of
+    the header is of the format <container>/<object>, and the target does
+    not need to exist at the time of symlink creation. Cross-account
+    symlinks can be created by including the
+    X-Symlink-Target-Account header.
+
+    GET and HEAD requests to a symlink will operate on the
+    referenced object and require appropriate permission in the
+    target container. DELETE and PUT requests will operate on the
+    symlink object itself. POST requests are not forwarded to the
+    referenced object. POST requests sent to a symlink will result
+    in a 307 Temporary Redirect response.
+
+  - |
+    Added support for inline data segments in SLO manifests.
+
+    Upgrade impact -- during a rolling upgrade, an updated proxy server
+    may write a manifest that an out-of-date proxy server will not be
+    able to read. This will resolve itself once the upgrade completes
+    on all nodes.
+
+  - |
+    The tempurl digest algorithm is now configurable, and Swift added
+    support for both SHA-256 and SHA-512. Supported tempurl digests
+    are exposed to clients in ``/info``. Additionally, tempurl signatures
+    can now be base64 encoded.
+
+  - |
+    Object expiry improvements
+
+    - Disallow X-Delete-At header values equal to the X-Timestamp header.
+
+    - X-Delete-At computation now uses X-Timestamp instead of
+      system time. This prevents clock skew causing inconsistent
+      expiry data.
+
+    - Deleting an expiring object will now cause less work in the system.
+      The number of async pending files written has been reduced for all
+      objects and greatly reduced for erasure-coded objects. This
+      dramatically reduces the burden on container servers.
+
+    - Stopped logging tracebacks when receiving an unexpected response.
+
+    - Allow the expirer to gracefully move past updating stale work items.
+
+  - |
+    When the object auditor examines an object, it will now add any
+    missing metadata checksums.
+
+  - |
+    ``swift-ring-builder`` improvements
+
+    - Save the ring when dispersion improves, even if balance
+      doesn't improve.
+
+    - Improved the granularity of the ring dispersion metric so that
+      small improvements after a rebalance can show changes in the
+      dispersion number. Dispersion in existing and new rings can be
+      recalculated using the new ``--recalculate`` option to
+      ``swift-ring-builder``.
+
+    - Display more info on empty rings.
+
+  - |
+    Fixed rare socket leak on range requests to erasure-coded objects.
+
+  - |
+    The number of container updates on object PUTs (ie to update listings)
+    has been recomputed to be far more efficient while maintaining
+    durability guarantees. Specifically, object PUTs to erasure-coded
+    policies will now normally result in far fewer container updates.
+
+  - |
+    Moved Zuul v3 tox jobs into the Swift code repo.
+
+  - |
+    Changed where liberasurecode-devel for CentOS 7 is referenced and
+    installed as a dependency.
+
+  - |
+    Added container/object listing with prefix to InternalClient.
+
+  - |
+    Added ``--swift-versions`` to ``swift-recon`` CLI to compare installed
+    versions in the cluster.
+
+  - |
+    Stop logging tracebacks in the ``object-replicator`` when it runs
+    out of handoff locations.
+
+  - |
+    Send ETag header in 206 Partial Content responses to SLO reads.
+
+  - |
+    Now ``swift-recon-cron`` works with conf.d configs.
+
+  - |
+    Improved ``object-updater`` stats logging. It now tells you all of
+    its stats (successes, failures, quarantines due to bad pickles,
+    unlinks, and errors), and it tells you incremental progress every
+    five minutes. The logging at the end of a pass remains and has
+    been expanded to also include all stats.
+
+  - |
+    If a proxy server is configured to autocreate accounts and the
+    account create fails, it will now return a server error (500)
+    instead of Not Found (404).
+
+  - |
+    Fractional replicas are no longer allowed for erasure code policies.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_17_1_release-dd6e6879cbb94f85.yaml b/releasenotes/notes/2_17_1_release-dd6e6879cbb94f85.yaml
new file mode 100644
index 0000000000..b1fbf2cf37
--- /dev/null
+++ b/releasenotes/notes/2_17_1_release-dd6e6879cbb94f85.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - Fix SLO delete for accounts with non-ASCII names.
+
+  - >
+    Fixed an issue in COPY where concurrent requests may have copied the
+    wrong data.
+
+  - >
+    Fixed a bug in how Swift uses eventlet that was exposed under high
+    concurrency.
diff --git a/releasenotes/notes/2_18_0_release-3acf63cfe2475c65.yaml b/releasenotes/notes/2_18_0_release-3acf63cfe2475c65.yaml
new file mode 100644
index 0000000000..1b98c48100
--- /dev/null
+++ b/releasenotes/notes/2_18_0_release-3acf63cfe2475c65.yaml
@@ -0,0 +1,85 @@
+---
+features:
+  - |
+    Added container sharding, an operator controlled feature that
+    may be used to shard very large container databases into a
+    number of smaller shard containers. This mitigates the issues
+    with one large DB by distributing the data across multiple
+    smaller databases throughout the cluster. Please read the full
+    overview at
+    https://docs.openstack.org/swift/latest/overview_container_sharding.html
+
+  - |
+    Provide an S3 API compatibility layer. The external "swift3"
+    project has been imported into Swift's codebase as the "s3api"
+    middleware.
+
+  - |
+    Added "emergency mode" hooks in the account and container replicators.
+    These options may be used to prioritize moving handoff
+    partitions to primary locations more quickly. This helps when
+    adding capacity to a ring.
+
+    - Added ``-d <devs>`` and ``-p <partitions>`` command line options.
+
+    - Added a handoffs-only mode.
+
+  - |
+    Add a multiprocess mode to the object replicator. Setting the
+    ``replicator_workers`` setting to a positive value N will result
+    in the replicator using up to N worker processes to perform
+    replication tasks. At most one worker per disk will be spawned.
+
+    Worker process logs will have a bit of information prepended so
+    operators can tell which messages came from which worker. The
+    prefix is "[worker M/N pid=P] ", where M is the worker's index,
+    N is the total number of workers, and P is the process ID. Every
+    message from the replicator's logger will have the prefix.
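+
+    A minimal sketch (the worker count is only an example)::
+
+        [object-replicator]
+        replicator_workers = 4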
+
+  - |
+    The object reconstructor will now fork all available worker
+    processes when operating on a subset of local devices.
+
+  - |
+    Add support for PROXY protocol v1 to the proxy server. This
+    allows the Swift proxy server to log accurate client IP
+    addresses when there is a proxy or SSL-terminator between the
+    client and the Swift proxy server.  Example servers supporting
+    this PROXY protocol include stunnel, haproxy, hitch, and
+    varnish. See the sample proxy server config file for the
+    appropriate config setting to enable or disable this
+    functionality.
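+
+    A sketch, assuming the option name used in the sample proxy config::
+
+        [DEFAULT]
+        require_proxy_protocol = true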
+
+  - |
+    In the ratelimit middleware, account whitelist and blacklist
+    settings have been deprecated and may be removed in a future
+    release. When found, a deprecation message will be logged.
+    Instead of these config file values, set
+    X-Account-Sysmeta-Global-Write-Ratelimit:WHITELIST and
+    X-Account-Sysmeta-Global-Write-Ratelimit:BLACKLIST on the
+    particular accounts that need
+    to be whitelisted or blacklisted. System metadata cannot be added
+    or modified by standard clients. Use the internal client to set sysmeta.
+
+  - |
+    Add a ``--drop-prefixes`` flag to swift-account-info,
+    swift-container-info, and swift-object-info. This makes the
+    output between the three more consistent.
+
+  - |
+    statsd error messages correspond to 5xx responses only. This
+    makes monitoring more useful because actual errors (5xx) will
+    not be hidden by common user requests (4xx). Previously, some 4xx
+    responses would be included in timing information in the statsd
+    error messages.
+
+  - |
+    Truncate error logs to prevent log handler from running out of buffer.
+
+  - |
+    Updated requirements.txt to match global exclusions and formatting.
+
+  - |
+    tempauth user names now support unicode characters.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_19_0_release-3e6ee3e6a1fcc6bb.yaml b/releasenotes/notes/2_19_0_release-3e6ee3e6a1fcc6bb.yaml
new file mode 100644
index 0000000000..d06e263952
--- /dev/null
+++ b/releasenotes/notes/2_19_0_release-3e6ee3e6a1fcc6bb.yaml
@@ -0,0 +1,103 @@
+---
+features:
+  - |
+    TempURLs now support IP range restrictions. Please see
+    https://docs.openstack.org/swift/latest/middleware.html#client-usage
+    for more information on how to use this additional restriction.
+
+  - |
+    Add support for multiple root encryption secrets for the trivial
+    and KMIP keymasters. This allows operators to rotate encryption
+    keys over time without needing to re-encrypt all existing data
+    in the cluster. Please see the included sample config files for
+    instructions on how to configure multiple encryption keys.
+
+  - |
+    The object updater now supports two configuration settings:
+    "concurrency" and "updater_workers". The latter controls how many
+    worker processes are spawned, while the former controls how many
+    concurrent container updates are performed by each worker
+    process. This should speed the processing of async_pendings.
+
+    On upgrade, a node configured with concurrency=N will still handle
+    async updates N-at-a-time, but will do so using only one process
+    instead of N.
+
+    If you have a config file like this::
+
+        [object-updater]
+        concurrency = <N>
+
+    and you want to take advantage of faster updates, then do this::
+
+        [object-updater]
+        concurrency = 8  # the default; you can omit this line
+        updater_workers = <N>
+
+    If you want updates to be processed exactly as before, do this::
+
+        [object-updater]
+        concurrency = 1
+        updater_workers = <N>
+
+  - |
+    When listing objects in a container in json format, static large
+    objects (SLOs) will now include an additional new "slo_etag" key
+    that matches the etag returned when requesting the SLO. The
+    existing "hash" key remains unchanged as the MD5 of the SLO
+    manifest. Text and XML listings are unaffected by this change.
+
+  - |
+    Log deprecation warnings for ``run_pause``. This setting was
+    deprecated in Swift 2.4.0 and is replaced by ``interval``.
+    It may be removed in a future release.
+
+  - |
+    Object reconstructor logs are now prefixed with information
+    about the specific worker process logging the message. This
+    makes reading the logs and understanding the messages much simpler.
+
+  - |
+    Lower bounds of dependencies have been updated to reflect what
+    is actually tested.
+
+  - |
+    SSYNC replication mode now removes as much of the directory
+    structure as possible as soon as it observes that the directory
+    is empty. This reduces the work needed for subsequent replication
+    passes.
+
+  - |
+    The container-updater now reports zero objects and bytes used for
+    child DBs in sharded containers. This prevents double-counting in
+    utilization reports.
+
+  - |
+    Add fallocate_reserve to account and container servers. This
+    allows disks shared between account/container and object rings to
+    avoid getting 100% full. The default value of 1% matches the
+    existing default on object servers.
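+
+    For example, to set the reserve explicitly to the stated default::
+
+        [DEFAULT]
+        fallocate_reserve = 1%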
+
+  - |
+    Added an experimental ``swift-ring-composer`` CLI tool to build
+    composite rings.
+
+  - |
+    Added an optional ``read_only`` middleware to make an entire cluster
+    or individual accounts read only.
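+
+    A minimal sketch of enabling it cluster-wide (option and section names
+    assumed to follow the sample proxy config)::
+
+        [filter:read_only]
+        use = egg:swift#read_only
+        read_only = true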
+
+  - |
+    Fixed a bug where zero-byte PUTs would not work properly
+    with "If-None-Match: \*" conditional requests.
+
+  - ACLs now work with unicode in user/account names.
+
+  - COPY now works with unicode account names.
+
+  - Improved S3 API compatibility.
+
+  - |
+    Lock timeouts in the container updater are now logged at INFO
+    level, not ERROR.
+
+  - Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_19_1_release-5072dd72557f5708.yaml b/releasenotes/notes/2_19_1_release-5072dd72557f5708.yaml
new file mode 100644
index 0000000000..0e1a6d259c
--- /dev/null
+++ b/releasenotes/notes/2_19_1_release-5072dd72557f5708.yaml
@@ -0,0 +1,16 @@
+---
+fixes:
+  - >
+    Prevent PyKMIP's kmip_protocol logger from logging at DEBUG.
+    Previously, some versions of PyKMIP would include all wire
+    data when the root logger was configured to log at DEBUG; this
+    could expose key material in logs. Only the kmip_keymaster was
+    affected.
+
+  - >
+    Fixed an issue where a failed drive could prevent the container sharder
+    from making progress.
+
+  - >
+    Fixed a bug in how Swift uses eventlet that was exposed under high
+    concurrency.
diff --git a/releasenotes/notes/2_20_0_release-7b090a5f4bd916e4.yaml b/releasenotes/notes/2_20_0_release-7b090a5f4bd916e4.yaml
new file mode 100644
index 0000000000..7d15183f30
--- /dev/null
+++ b/releasenotes/notes/2_20_0_release-7b090a5f4bd916e4.yaml
@@ -0,0 +1,116 @@
+---
+features:
+  - |
+    S3 API compatibility updates
+
+    - Swift can now cache the S3 secret from Keystone to use for
+      subsequent requests. This functionality is disabled by default but
+      can be enabled by setting the ``secret_cache_duration`` in the
+      ``[filter:s3token]`` section of the proxy server config to a number
+      greater than 0.
+
+    - s3api now mimics the AWS S3 behavior of periodically sending
+      whitespace characters on a Complete Multipart Upload request to keep
+      the connection from timing out. Note that since a request could fail
+      after the initial 200 OK response has been sent, it is important to
+      check the response body to determine if the request succeeded.
+
+    - s3api now properly handles ``x-amz-metadata-directive`` headers on
+      COPY operations.
+
+    - s3api now uses concurrency (default 2) to handle multi-delete
+      requests. This allows multi-delete requests to be processed much
+      more quickly.
+
+    - s3api now mimics some forms of AWS server-side encryption
+      based on whether Swift's at-rest encryption functionality is enabled.
+      Note that S3 API users are now able to know more about how the
+      cluster is configured than they were previously, ie knowledge of
+      encryption at-rest functionality being enabled or not.
+
+    - s3api responses now include a '-' in multipart ETags.
+
+      For new multipart-uploads via the S3 API, the ETag that is
+      stored will be calculated in the same way that AWS uses. This
+      ETag will be used in GET/HEAD responses, bucket listings, and
+      conditional requests via the S3 API. Accessing the same object
+      via the Swift API will use the SLO Etag; however, in JSON
+      container listings the multipart upload etag will be exposed
+      in a new "s3_etag" key. Previously, some S3 clients would complain
+      about download corruption when the ETag did not have a '-'.
+
+    - S3 ETags for SLOs now include a '-'.
+
+      Ordinary objects in S3 use the MD5 of the object as the ETag,
+      just like Swift. Multipart Uploads follow a different format, notably
+      including a dash followed by the number of segments. To that end
+      (and for S3 API requests *only*), SLO responses via the S3 API have a
+      literal '-N' added on the end of the ETag.
+
+    - The default location is now set to "us-east-1". This is more likely
+      to be the default region that a client will try when using v4
+      signatures.
+
+      Deployers with clusters that relied on the old implicit default
+      location of "US" should explicitly set ``location = US`` in the
+      ``[filter:s3api]`` section of proxy-server.conf before upgrading.
+
+    - Add basic support for ?versions bucket listings. We still do not
+      have support for toggling S3 bucket versioning, but we can at least
+      support getting the latest versions of all objects.
+
+  - |
+    Fixed an issue with SSYNC requests to ensure that only one request
+    can be running on a partition at a time.
+
+  - |
+    Data encryption updates
+
+    - The ``kmip_keymaster`` middleware can now be configured directly in the
+      proxy-server config file. The existing behavior of using an external
+      config file is still supported.
+
+    - Multiple keymaster middlewares are now supported. This allows
+      migration from one key provider to another.
+
+      Note that ``secret_id`` values must remain unique across all keymasters
+      in a given pipeline. If they are not unique, the right-most keymaster
+      will take precedence.
+
+      When looking for the active root secret, only the right-most
+      keymaster is used.
+
+    - Prevent PyKMIP's kmip_protocol logger from logging at DEBUG.
+      Previously, some versions of PyKMIP would include all wire
+      data when the root logger was configured to log at DEBUG; this
+      could expose key material in logs. Only the ``kmip_keymaster`` was
+      affected.
+
+  - |
+    Fixed an issue where a failed drive could prevent the container sharder
+    from making progress.
+
+  - |
+    Storage policy definitions in swift.conf can now define the diskfile
+    to use to access objects. See the included swift.conf-sample file for
+    a description of usage.
+
+  - |
+    The EC reconstructor will now attempt to remove empty directories
+    immediately, while the inodes are still cached, rather than waiting
+    until the next run.
+
+  - |
+    Added a ``keep_idle`` config option to configure KEEPIDLE time for TCP
+    sockets. The default value is the old constant of 600.
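+
+    For example, setting it explicitly to the stated default in a server's
+    ``[DEFAULT]`` section::
+
+        [DEFAULT]
+        keep_idle = 600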
+
+  - |
+    Add ``databases_per_second`` to the account-replicator,
+    container-replicator, and container-sharder. This prevents them from
+    using a full CPU core when they are not IO limited.
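+
+    A sketch for one of the daemons (the value is only an example)::
+
+        [container-replicator]
+        databases_per_second = 50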
+
+  - |
+    Allow direct_client users to overwrite the ``X-Timestamp`` header.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_21_0_release-d8ae33ef18b7be3a.yaml b/releasenotes/notes/2_21_0_release-d8ae33ef18b7be3a.yaml
new file mode 100644
index 0000000000..1c9c06a1a9
--- /dev/null
+++ b/releasenotes/notes/2_21_0_release-d8ae33ef18b7be3a.yaml
@@ -0,0 +1,69 @@
+---
+features:
+  - |
+    Change the behavior of the EC reconstructor to perform a
+    fragment rebuild to a handoff node when a primary peer responds
+    with 507 to the REPLICATE request. This changes EC to match the
+    existing behavior of replication when drives fail. After a
+    rebalance of EC rings (potentially removing unmounted/failed
+    devices), it's most IO efficient to run in handoffs_only mode to
+    avoid unnecessary rebuilds.
+
+  - |
+    O_TMPFILE support is now detected by attempting to use it
+    instead of looking at the kernel version. This allows older
+    kernels with backported patches to take advantage of the
+    O_TMPFILE functionality.
+
+  - |
+    Add slo_manifest_hook callback to allow other middlewares to
+    impose additional constraints on or make edits to SLO manifests
+    before being written. For example, a middleware could enforce
+    minimum segment size or insert data segments.
+
+  - |
+    Fixed an issue with multi-region EC policies that caused the EC
+    reconstructor to constantly attempt cross-region rebuild
+    traffic.
+
+  - |
+    Fixed an issue where S3 API v4 signatures would not be validated
+    against the body of the request, allowing a replay attack if
+    request headers were captured by a malicious third party.
+
+  - Display crypto data/metadata details in swift-object-info.
+
+  - formpost can now accept a content-encoding parameter.
+
+  - |
+    Fixed an issue where multipart uploads with the S3 API would
+    sometimes report an error despite all segments being upload
+    successfully.
+
+  - |
+    Multipart object segments are now actually deleted when the
+    multipart object is deleted via the S3 API.
+
+  - |
+    Swift now returns a 503 (instead of a 500) when an account
+    auto-create fails.
+
+  - |
+    Fixed a bug where encryption would store the incorrect key
+    metadata if the object name starts with a slash.
+
+  - |
+    Fixed an issue where an object server failure during a client
+    download could leave an open socket between the proxy and
+    client.
+
+  - |
+    Fixed an issue where deleted EC objects didn't have their
+    on-disk directories cleaned up. This would cause extra resource
+    usage on the object servers.
+
+  - |
+    Fixed an issue where bulk requests using XML and expect
+    100-continue would return a malformed HTTP response.
+
+  - Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_22_0_release-f60d29508b3c1283.yaml b/releasenotes/notes/2_22_0_release-f60d29508b3c1283.yaml
new file mode 100644
index 0000000000..1e6ea3b1e4
--- /dev/null
+++ b/releasenotes/notes/2_22_0_release-f60d29508b3c1283.yaml
@@ -0,0 +1,87 @@
+---
+features:
+  - |
+    Experimental support for Python 3.6 and 3.7 is now available.
+    Note that this requires ``eventlet>=0.25.0``. All unit tests pass,
+    and running functional tests under Python 2 will pass against
+    services running under Python 3. Expect full support in the
+    next minor release.
+
+  - |
+    Log formats are now more configurable and include support for
+    anonymization. See the ``log_msg_template`` option in ``proxy-server.conf``
+    and the Swift documentation for more information.
+
+  - |
+    Added an operator tool, ``swift-container-deleter``, to asynchronously
+    delete some or all objects in a container using the object expirers.
+
+  - |
+    Swift-all-in-one Docker images are now built and published to
+    https://hub.docker.com/r/openstackswift/saio. These are intended
+    for use as development targets, but will hopefully be useful as a
+    starting point for other work involving containerizing Swift.
+
+upgrade:
+  - |
+    The ``object-expirer`` may now be configured in ``object-server.conf``.
+    This is in anticipation of a future change to allow the ``object-expirer``
+    to be deployed on all nodes that run the ``object-server``.
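+
+    A minimal sketch of such a configuration (the option shown is only an
+    example)::
+
+        [object-expirer]
+        interval = 300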
+
+  - |
+    **Dependency updates**: we've increased our minimum supported version
+    of ``cryptography`` to 2.0.2 and ``netifaces`` to 0.8. This is largely due
+    to the difficulty of continuing to test with the old versions.
+
+    If running Swift under Python 3, ``eventlet`` must be at least 0.25.0.
+
+fixes:
+  - |
+    Correctness improvements
+
+    * The ``proxy-server`` now ignores 404 responses from handoffs without
+      databases when deciding on the correct response for account and
+      container requests.
+
+    * Object writes to a container whose existence cannot be verified
+      now 503 instead of 404.
+
+  - |
+    Sharding improvements
+
+    * The ``container-replicator`` now only attempts to fetch shard ranges if
+      the remote indicates that it has shard ranges. Further, it does so
+      with a timeout to prevent the process from hanging in certain cases.
+
+    * The ``proxy-server`` now caches 'updating' shards, improving write
+      performance for sharded containers. A new config option,
+      ``recheck_updating_shard_ranges``, controls the cache time; set it to
+      0 to disable caching.
+
+    * The ``container-replicator`` now correctly enqueues
+      ``container-reconciler`` work for sharded containers.
+
+  - |
+    S3 API improvements
+
+    * Unsigned payloads work with v4 signatures once more.
+
+    * Multipart upload parts may now be copied from other multipart uploads.
+
+    * CompleteMultipartUpload requests with a ``Content-MD5`` now work.
+
+    * ``Content-Type`` can now be updated when copying an object.
+
+    * Fixed v1 listings that end with a non-ASCII object name.
+
+  - |
+    Background corruption-detection improvements
+
+    * Detect and remove invalid entries from ``hashes.pkl``
+
+    * When object path is not a directory, just quarantine it,
+      rather than the whole suffix.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_23_0_release-2a2d11c1934f0b61.yaml b/releasenotes/notes/2_23_0_release-2a2d11c1934f0b61.yaml
new file mode 100644
index 0000000000..656950ee71
--- /dev/null
+++ b/releasenotes/notes/2_23_0_release-2a2d11c1934f0b61.yaml
@@ -0,0 +1,74 @@
+---
+features:
+  - |
+    Python 3.6 and 3.7 are now fully supported. If you've been testing Swift
+    on Python 3, upgrade at your earliest convenience.
+
+  - |
+    Added "static symlinks", which perform some validation as they
+    follow redirects and include more information about their target
+    in container listings. For more information, see the symlink
+    middleware section of the documentation.
+
+  - |
+    Multi-character strings may now be used as delimiters in account
+    and container listings.
+
+upgrade:
+  - |
+    **Dependency update**: ``eventlet`` must be at least 0.25.0. This also
+    dragged forward minimum-supported versions of ``dnspython`` (1.15.0),
+    ``greenlet`` (0.3.2), and ``six`` (1.10.0).
+
+fixes:
+  - |
+    Python 3 fixes:
+
+    * Removed a request-smuggling vector when running a mixed
+      py2/py3 cluster.
+
+    * Allow ``fallocate_reserve`` to be specified as a percentage.
+
+    * Fixed listings for sharded containers.
+
+    * Fixed non-ASCII account metadata handling.
+
+    * Fixed ``rsync`` output parsing.
+
+    * Fixed some title-casing of headers.
+
+    If you've been testing Swift on Python 3, upgrade at your earliest
+    convenience.
+
+  - |
+    Sharding improvements
+
+    * Container metadata related to sharding are now removed when no
+      longer needed.
+
+    * Empty container databases (such as might be created on handoffs)
+      now shard much more quickly.
+
+  - |
+    The ``proxy-server`` now ignores 404 responses from handoffs that have
+    no data when deciding on the correct response for object requests,
+    similar to what it already does for account and container requests.
+
+  - |
+    Static Large Object sizes in listings for versioned containers are
+    now more accurate.
+
+  - |
+    When refetching Static Large Object manifests, non-manifest responses
+    are now handled better.
+
+  - |
+    S3 API now translates ``503 Service Unavailable`` responses to a more
+    S3-like response instead of raising an error.
+
+  - |
+    Improved proxy-to-backend requests to be more RFC-compliant.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_24_0_release-1ca244cc959922fc.yaml b/releasenotes/notes/2_24_0_release-1ca244cc959922fc.yaml
new file mode 100644
index 0000000000..0155cb2c03
--- /dev/null
+++ b/releasenotes/notes/2_24_0_release-1ca244cc959922fc.yaml
@@ -0,0 +1,92 @@
+---
+features:
+  - |
+    Added a new object versioning mode, with APIs for querying and
+    accessing old versions. For more information, see the documentation.
+
+  - |
+    Added support for S3 versioning using the above new mode.
+
+  - |
+    Added a new middleware to allow accounts and containers to opt-in to
+    RFC-compliant ETags. For more information, see the documentation.
+    Clients should be aware of the fact that ETags may be quoted for RFC
+    compliance; this may become the default behavior in some future release.
+
+  - |
+    Proxy, account, container, and object servers now support "seamless
+    reloads" via ``SIGUSR1``. This is similar to the existing graceful
+    restarts but keeps the server socket open the whole time, reducing
+    service downtime.
+
+  - |
+    New buckets created via the S3 API will now store multi-part upload
+    data in the same storage policy as other data rather than the
+    cluster's default storage policy.
+
+  - |
+    Device region and zone can now be changed via ``swift-ring-builder``.
+    Note that this may cause a lot of data movement on the next rebalance
+    as the builder tries to reach full dispersion.
+
+  - |
+    Added support for Python 3.8.
+
+
+deprecations:
+  - |
+    Per-service ``auto_create_account_prefix`` settings are now deprecated
+    and may be ignored in a future release; if you need to use this, please
+    set it in the ``[swift-constraints]`` section of ``/etc/swift/swift.conf``.
+
+fixes:
+  - |
+    The container sharder can now handle containers with special
+    characters in their names.
+
+  - |
+    Internal client no longer logs object DELETEs as status 499.
+
+  - |
+    Objects with an ``X-Delete-At`` value in the far future no longer cause
+    backend server errors.
+
+  - |
+    The bulk extract middleware once again allows clients to specify metadata
+    (including expiration timestamps) for all objects in the archive.
+
+  - |
+    Container sync now synchronizes static symlinks in a way similar to
+    static large objects.
+
+  - |
+    ``swift_source`` is set for more sub-requests in the proxy-server. See
+    the documentation.
+
+  - |
+    Errors encountered while validating static symlink targets no longer
+    cause ``BadResponseLength`` errors in the proxy-server.
+
+  - |
+    On Python 3, the KMS keymaster now works with secrets stored
+    in Barbican with a ``text/plain`` payload-content-type.
+
+  - |
+    On Python 3, the formpost middleware now works with unicode file names.
+
+  - |
+    On Python 3, certain S3 API headers are now lower case as they
+    would be coming from AWS.
+
+  - |
+    Several utility scripts now work better on Python 3:
+
+    * ``swift-account-audit``
+
+    * ``swift-dispersion-populate``
+
+    * ``swift-drive-recon``
+
+    * ``swift-recon``
diff --git a/releasenotes/notes/2_25_0_release-09410c808881bf21.yaml b/releasenotes/notes/2_25_0_release-09410c808881bf21.yaml
new file mode 100644
index 0000000000..4782f61d0c
--- /dev/null
+++ b/releasenotes/notes/2_25_0_release-09410c808881bf21.yaml
@@ -0,0 +1,65 @@
+---
+features:
+  - |
+    WSGI server processes can now notify systemd when they are ready.
+
+  - |
+    Added a new middleware that allows users and operators to configure
+    accounts and containers to use RFC-compliant (i.e., double-quoted)
+    ETags. This may be useful when using Swift as an origin for some content
+    delivery networks. For more information, see the middleware
+    documentation.
+
+  - |
+    Added ``ttfb`` (Time to First Byte) and ``pid`` (Process ID) to the set
+    of available proxy-server log fields. For more information, see
+    the documentation.
+
+fixes:
+  - |
+    Improved proxy-server performance by reducing unnecessary locking,
+    memory copies, and eventlet scheduling.
+
+  - |
+    Reduced object-replicator and object-reconstructor CPU usage by only
+    checking that the device list is current when rings change.
+
+  - |
+    Improved performance of sharded container listings when performing
+    prefix listings.
+
+  - |
+    Improved container-sync performance when data has already been
+    deleted or overwritten.
+
+  - |
+    Account quotas are now enforced even on empty accounts.
+
+  - |
+    Getting an SLO manifest with ``?format=raw`` now responds with an ETag
+    that matches the MD5 of the generated body rather than the MD5 of
+    the manifest stored on disk.
+
+  - |
+    Provide useful status codes in logs for some versioning and symlink
+    subrequests that were previously logged as 499.
+
+  - |
+    Fixed 500 from cname_lookup middleware. Previously, if the looked-up
+    domain was used by domain_remap to update the request path, the
+    server would respond Internal Error.
+
+  - |
+    On Python 3, fixed an issue when reading or writing objects with a content
+    type like ``message/*``. Previously, Swift would fail to respond.
+
+  - |
+    On Python 3, fixed a RecursionError in swift-dispersion-report when
+    using TLS.
+
+  - |
+    Fixed a bug in the new object versioning API that would cause more
+    than ``limit`` results to be returned when listing.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_26_0_release-6548eadcba544f72.yaml b/releasenotes/notes/2_26_0_release-6548eadcba544f72.yaml
new file mode 100644
index 0000000000..a9c1ab66da
--- /dev/null
+++ b/releasenotes/notes/2_26_0_release-6548eadcba544f72.yaml
@@ -0,0 +1,216 @@
+---
+features:
+  - |
+    Extend concurrent reads to erasure coded policies. Previously, the
+    options ``concurrent_gets`` and ``concurrency_timeout`` only applied to
+    replicated policies.
+
+  - |
+    Add a new ``concurrent_ec_extra_requests`` option to allow the proxy to
+    make some extra backend requests immediately. The proxy will respond as
+    soon as there are enough responses available to reconstruct.
+
+  - |
+    The concurrent read options (``concurrent_gets``, ``concurrency_timeout``,
+    and ``concurrent_ec_extra_requests``) may now be configured per
+    storage-policy.
+
+  - |
+    Replication servers can now handle all request methods. This allows
+    ssync to work with a separate replication network.
+
+  - |
+    All background daemons now use the replication network. This allows
+    better isolation between external, client-facing traffic and internal,
+    background traffic. Note that during a rolling upgrade, replication
+    servers may respond with ``405 Method Not Allowed``. To avoid this,
+    operators should remove the config option ``replication_server = true``
+    from their replication servers; this will allow them to handle all
+    request methods before upgrading.
+
+  - |
+    S3 API improvements:
+
+    * Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK.
+
+    * Add basic read support for object tagging. This improves
+      compatibility with AWS CLI version 2. Write support is not
+      yet implemented, so the tag set will always be empty.
+
+    * CompleteMultipartUpload requests may now be safely retried.
+
+    * Improved quota-exceeded error messages.
+
+    * Improved logging and statsd metrics. Be aware that this will cause
+      an increase in the proxy-logging statsd metrics emitted for S3
+      responses. However, this should more accurately reflect the state
+      of the system.
+
+    * S3 requests are now less demanding on the container layer.
+
+  - |
+    Servers now open one listen socket per worker, ensuring each worker
+    serves roughly the same number of concurrent connections.
+
+  - |
+    Server workers may now be gracefully terminated via ``SIGHUP`` or
+    ``SIGUSR1``. The parent process will then spawn a fresh worker.
+
+  - |
+    Allow proxy-logging middlewares to be configured more independently.
+
+  - |
+    Improve performance when increasing partition power.
+
+issues:
+  - |
+    In a rolling upgrade from liberasurecode 1.5.0 or earlier to 1.6.0 or
+    later, object-servers may quarantine newly-written data, leading to
+    availability issues or even data loss. See `bug 1886088
+    <https://bugs.launchpad.net/swift/+bug/1886088>`__ for more
+    information, including how to determine whether you are affected.
+    Several mitigations are available to operators:
+
+    * If proxy and object layers can be upgraded independently and proxies
+      can be upgraded quickly:
+
+      1. Stop and disable the object-reconstructor before upgrading. This
+         ensures no upgraded object server starts writing new fragments
+         that old object servers would quarantine.
+
+      2. Upgrade liberasurecode on all object servers. Object servers can
+         now read both old and new fragments.
+
+      3. Upgrade liberasurecode on all proxy servers. Newly-written data
+         will now use new fragments. Note that not-yet-upgraded proxies
+         will not be able to read these newly-written fragments but will
+         instead respond ``500 Internal Server Error``.
+
+      4. After upgrading, re-enable and restart the object-reconstructor.
+
+    * If your users can tolerate it, consider a read-only rolling upgrade.
+      Before upgrading, enable the read-only middleware
+      cluster-wide to prevent new writes during the upgrade. Additionally,
+      stop and disable the object-reconstructor as above. Upgrade normally,
+      then disable the read-only middleware and re-enable and restart the
+      object-reconstructor.
+
+    * Avoid upgrading liberasurecode until swift and liberasurecode
+      better-support a rolling upgrade. Swift remains compatible with
+      liberasurecode 1.5.0 and earlier.
+
+    .. note::
+       Ubuntu 18.04 and RDO's CentOS 7 repos package liberasurecode 1.5.0,
+       while Ubuntu 20.04 and RDO's CentOS 8 repos currently package
+       liberasurecode 1.6.0 or 1.6.1. Take care when upgrading major distro
+       versions!
+
+upgrade:
+  - |
+    **If your cluster has encryption enabled and is still running Swift
+    under Python 2**, we recommend upgrading Swift *before* transitioning to
+    Python 3. Otherwise, new writes to objects with non-ASCII characters
+    in their paths may result in corrupted downloads when read from a
+    proxy-server still running old swift on Python 2. See `bug 1888037
+    <https://bugs.launchpad.net/swift/+bug/1888037>`__ for more information.
+    Note that new tags including a fix for the bug are planned for all
+    maintained stable branches; upgrading to any one of those should be
+    sufficient to ensure a smooth upgrade to the latest Swift.
+
+  - |
+    The above bug was caused by a difference in string types that resulted
+    in ambiguity when decrypting. To prevent the ambiguity for new data, set
+    ``meta_version_to_write = 3`` in your keymaster configuration *after*
+    upgrading all proxy servers.
+
+    If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set
+    ``meta_version_to_write = 1`` in your keymaster configuration *prior*
+    to upgrading.
+
+    See the provided ``keymaster.conf-sample`` for more information about
+    this setting.
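+
+    A sketch of the post-upgrade setting, assuming the simple keymaster
+    configured in proxy-server.conf::
+
+        [filter:keymaster]
+        meta_version_to_write = 3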
+
+  - |
+    **If your cluster is configured with a separate replication network**,
+    note that background daemons will switch to using this network for all
+    traffic. If your account, container, or object replication servers are
+    configured with ``replication_server = true``, these daemons may log a
+    flood of ``405 Method Not Allowed`` messages during a rolling upgrade.
+    To avoid this, comment out the option and restart replication servers
+    before upgrading.
+
+fixes:
+  - |
+    Python 3 bug fixes:
+
+    * Fixed an error when reading encrypted data that was written while
+      running Python 2 for a path that includes non-ASCII characters.
+
+    * Object expiration respects the ``expiring_objects_container_divisor``
+      config option.
+
+    * ``fallocate_reserve`` may be specified as a percentage in more places.
+
+    * The ETag-quoting middleware no longer raises TypeErrors.
+
+  - |
+    Sharding improvements:
+
+    * Prevent object updates from auto-creating shard containers. This
+      ensures more consistent listings for sharded containers during
+      rebalances.
+
+    * Deleted shard containers are no longer considered root containers.
+      This prevents unnecessary sharding audit failures and allows the
+      deleted shard database to actually be unlinked.
+
+    * ``swift-container-info`` now summarizes shard range information.
+      Pass ``-v``/``--verbose`` if you want to see all of them.
+
+    * Improved container-sharder stat reporting to reduce load on root
+      container databases.
+
+    * Don't inject shard ranges when user quits.
+
+  - |
+    During rebalances, clients should no longer get 404s for data that
+    exists but whose replicas are overloaded.
+
+  - |
+    Improved cache management for account and container responses.
+
+  - |
+    Allow operators to pass either raw or URL-quoted paths to
+    ``swift-get-nodes``. Notably, this allows ``swift-get-nodes`` to
+    work with the reserved namespace used for object versioning.
+
+  - |
+    Container read ACLs now work with object versioning. This only
+    allows access to the most-recent version via an unversioned URL.
+
+  - |
+    Improved how containers reclaim deleted rows to reduce locking and
+    improve object update throughput.
+
+  - |
+    Large object reads log fewer client disconnects.
+
+  - |
+    Allow ratelimit to be placed multiple times in a proxy pipeline,
+    such as both before s3api and auth (to handle swift requests without
+    needing to make an auth decision) and after (to limit S3 requests).
+
+  - |
+    Shuffle object-updater work. This somewhat reduces the impact a
+    single overloaded database has on other containers' listings.
+
+  - |
+    Fix a proxy-server error when retrieving erasure coded data when
+    there are durable fragments but not enough to reconstruct.
+
+  - |
+    Fix an error in the proxy server when finalizing data.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_27_0_release-a9ae967d6d271342.yaml b/releasenotes/notes/2_27_0_release-a9ae967d6d271342.yaml
new file mode 100644
index 0000000000..2a2231a59d
--- /dev/null
+++ b/releasenotes/notes/2_27_0_release-a9ae967d6d271342.yaml
@@ -0,0 +1,235 @@
+---
+features:
+  - |
+    Added "audit watcher" hooks to allow operators to run arbitrary code
+    against every diskfile in a cluster. For more information, see the
+    documentation.
+
+  - |
+    Added support for system-scoped "reader" roles when authenticating using
+    Keystone. Operators may configure this using the ``system_reader_roles``
+    option in the ``[filter:keystoneauth]`` section of their proxy-server.conf.
+
+    A comparable group, ``.reseller_reader``, is now available for development
+    purposes when authenticating using tempauth.
+
+  - |
+    Allow static large object segments to be deleted asynchronously.
+    Operators may opt into this new behavior by enabling the new
+    ``allow_async_delete`` option in the ``[filter:slo]`` section
+    in their proxy-server.conf. For more information, see `the documentation
+    `__.
+
+  - |
+    Added the ability to connect to memcached over TLS. See the
+    ``tls_*`` options in etc/memcache.conf-sample
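+
+    A minimal sketch (option names as in the sample memcache config; the
+    path is a placeholder)::
+
+        [memcache]
+        tls_enabled = true
+        tls_cafile = /etc/swift/memcache-ca.crt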
+
+  - |
+    The proxy-server now caches 'listing' shards, improving listing
+    performance for sharded containers. A new config option,
+    ``recheck_listing_shard_ranges``, controls the cache time and defaults to
+    10 minutes; set it to 0 to disable caching (the previous behavior).
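+
+    For example, to cache listing shards for five minutes instead of the
+    default ten (section name assumed from the sample proxy config)::
+
+        [app:proxy-server]
+        recheck_listing_shard_ranges = 300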
+
+  - |
+    Added a new optional proxy-logging field ``{wire_status_int}`` for the
+    status code returned to the client. For more information, see the
+    documentation.
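+
+    A sketch of adding it to a logging template (the other fields are just
+    examples)::
+
+        [filter:proxy-logging]
+        log_msg_template = {client_ip} {method} {path} {status_int} {wire_status_int}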
+
+  - |
+    Memcache client error-limiting is now configurable. See the
+    ``error_suppression_*`` options in etc/memcache.conf-sample
+
+  - |
+    Added ``tasks_per_second`` option to rate-limit the object-expirer.
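+
+    For example (the value is illustrative only)::
+
+        [object-expirer]
+        tasks_per_second = 50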
+
+  - |
+    Added ``usedforsecurity`` annotations for use on FIPS-compliant systems.
+
+  - |
+    S3 API improvements:
+
+    * Make allowable clock skew configurable, with a default value of
+      15 minutes to match AWS. Note that this was previously hardcoded at
+      5 minutes; operators may want to preserve the prior behavior by setting
+      ``allowable_clock_skew = 300`` in the ``[filter:s3api]`` section of their
+      proxy-server.conf.
+
+    * Container ACLs are now cloned to the ``+segments`` container when it is
+      created.
+
+    * Added the ability to configure auth region in s3token middleware.
+
+    * CORS-related headers are now passed through appropriately when using
+      the S3 API. Note that allowed origins and other container metadata
+      must still be configured through the Swift API.
+
+      Preflight requests do not contain enough information to map a
+      bucket to an account/container pair; a new cluster-wide option
+      ``cors_preflight_allow_origin`` may be configured for such OPTIONS
+      requests. The default (blank) rejects all S3 preflight requests.
+
+  - |
+    Sharding improvements:
+
+    * A ``--no-auto-shard`` option has been added to ``swift-container-sharder``.
+
+    * The sharder daemon has been enhanced to better support the shrinking
+      of shards that are no longer required. Shard containers will now
+      discover from their root container if they should be shrinking. They
+      will also discover the shards into which they should shrink, which may
+      include the root container itself.
+
+    * A 'compact' command has been added to ``swift-manage-shard-ranges`` that
+      enables sequences of contiguous shards with low object counts to be
+      compacted into another existing shard, or into the root container.
+
+    * ``swift-manage-shard-ranges`` can now accept a config file; this
+      may be used to ensure consistency of threshold values with the
+      container-sharder config.
+
+    * The sharding progress reports in recon cache now continue to be included
+      for a period of time after sharding has completed. The time period
+      may be configured using the ``recon_sharded_timeout`` option in the
+      ``[container-sharder]`` section of container-server.conf, and defaults
+      to 12 hours.
+
+    * Add root containers with compactible ranges to recon cache.
+
+    * Expose sharding statistics in the backend recon middleware.
+
+  - |
+    Replication improvements:
+
+    * The post-rsync REPLICATE call no longer recalculates hashes immediately.
+
+    * Hashes are no longer invalidated after a successful ssync; they were
+      already invalidated during the data transfer.
+
+  - |
+    Added support for Python 3.9.
+
+  - |
+    Partition power increase improvements:
+
+    * Fixed a bug where stale state files would cause misplaced data during
+      multiple partition power increases.
+
+    * Removed a race condition that could cause newly-written data to not be
+      linked into the new partition for the new partition power.
+
+    * Improved safety during cleanup to ensure files have been relinked
+      appropriately before unlinking.
+
+    * Added an option to drop privileges when running the relinker as root.
+
+    * Added an option to rate-limit how quickly data files are relinked or
+      cleaned up. This may be used to reduce I/O load during partition power
+      increases, improving end-user performance.
+
+    * Rehash partitions during the partition power increase. Previously, we
+      relied on the replication engine to perform the rehash, which could
+      cause an unexpected I/O spike after a partition power increase.
+
+    * Warn when relinking/cleaning up and any disks are unmounted.
+
+    * Log progress per partition when relinking/cleaning up.
+
+    * During clean-up, stop warning about tombstones that got reaped from
+      the new location but not the old.
+
+    * Added the ability to read options from object-server.conf, similar to
+      background daemons.
+
+issues:
+  - |
+    Operators should verify that encryption is not enabled in their reconciler
+    pipelines; having it enabled there may harm data durability. For more
+    information, see `bug 1910804
+    <https://bugs.launchpad.net/swift/+bug/1910804>`__.
+
+upgrade:
+  - |
+    Added an option to write EC fragments with legacy CRC to ensure a smooth
+    upgrade from liberasurecode<=1.5.0 to >=1.6.2. For more information, see
+    `bug 1886088 <https://bugs.launchpad.net/swift/+bug/1886088>`__.
+
+fixes:
+  - |
+    Errors downloading a Static Large Object that cause a shorter-than-expected
+    response are now logged as 500s.
+
+  - |
+    S3 API fixes:
+
+    * Fixed a bug that prevented the s3api pipeline validation described in
+      proxy-server.conf-sample from being performed. As documented, operators
+      can disable this via the ``auth_pipeline_check`` option if proxy startup
+      fails with validation errors.
+
+    * Fixed an issue where SHA mismatches in client XML payloads would cause
+      a server error. Swift now correctly responds with a client error about
+      the bad digest.
+
+    * Fixed an issue where non-base64 signatures would cause a server error.
+      Swift now correctly responds with a client error about the invalid
+      digest.
+
+    * The correct storage policy is now logged for S3 requests.
+
+  - |
+    Sharding fixes:
+
+    * Prevent shard databases from losing track of their root database when
+      deleted.
+
+    * Prevent sharded root databases from being reclaimed to ensure that
+      shards can detect that they have been deleted.
+
+    * Overlapping shrinking shards no longer generate audit warnings; these
+      are expected to sometimes overlap.
+
+  - |
+    Replication fixes:
+
+    * Fixed a race condition in ssync that could lead to a loss of data
+      durability (or even loss of data, for two-replica policies) when some
+      object servers have outdated rings. Replication via rsync is likely
+      still affected by a similar bug.
+
+    * Non-durable fragments can now be reverted from handoffs.
+
+    * Reduced log noise for common ssync errors.
+
+  - |
+    Python 3 fixes:
+
+    * Staticweb correctly handles listings when paths include non-ASCII
+      characters.
+
+    * S3 API now allows multipart uploads with non-ASCII characters in the
+      object name.
+
+    * Fixed an import-ordering issue in ``swift-dispersion-populate``.
+
+  - |
+    Turned off thread-logging when monkey-patching with eventlet. This
+    addresses a potential hang in the proxy-server while logging client
+    disconnects.
+
+  - |
+    Fixed a bug that could cause EC GET responses to return a server error.
+
+  - |
+    Fixed an issue with ``swift-drive-audit`` when run around New Year's.
+
+  - |
+    Server errors encountered when validating the first segment of a Static or
+    Dynamic Large Object now return a 503 to the client, rather than a 409.
+
+  - |
+    Errors when setting keys in memcached are now logged. This helps
+    operators detect when shard ranges for caching have gotten too large to
+    be stored, for example.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_28_0_release-f2515e07fb61cd01.yaml b/releasenotes/notes/2_28_0_release-f2515e07fb61cd01.yaml
new file mode 100644
index 0000000000..bd3fdde75d
--- /dev/null
+++ b/releasenotes/notes/2_28_0_release-f2515e07fb61cd01.yaml
@@ -0,0 +1,235 @@
+---
+features:
+  - |
+    ``swift-manage-shard-ranges`` improvements:
+
+    * Exit codes are now applied more consistently:
+
+      - 0 for success
+      - 1 for an unexpected outcome
+      - 2 for invalid options
+      - 3 for user exit
+
+      As a result, some errors that previously resulted in exit code 2
+      will now exit with code 1.
+
+    * Added a new 'repair' command to automatically identify and
+      optionally resolve overlapping shard ranges.
+
+    * Added a new 'analyze' command to automatically identify overlapping
+      shard ranges and recommend a resolution based on a JSON listing
+      of shard ranges such as produced by the 'show' command.
+
+    * Added a ``--includes`` option for the 'show' command to only output
+      shard ranges that may include a given object name.
+
+    * Added a ``--dry-run`` option for the 'compact' command.
+
+    * The 'compact' command now outputs the total number of compactible
+      sequences.
+
+  - |
+    Partition power increase improvements:
+
+    * The relinker now spawns multiple subprocesses to process disks
+      in parallel. By default, one worker is spawned per disk; use the
+      new ``--workers`` option to control how many subprocesses are used.
+      Use ``--workers=0`` to maintain the previous behavior.
+
+    * The relinker can now target specific storage policies or
+      partitions by using the new ``--policy`` and ``--partition``
+      options.
+
+  - |
+    More daemons now support systemd notify sockets.
+
+  - |
+    The container-reconciler now scales out better with new ``processes``,
+    ``process``, and ``concurrency`` options, similar to the object-expirer.
+deprecations:
+  - |
+    Container sharding deprecations:
+
+    * Added a new config option, ``shrink_threshold``, to specify the
+      absolute size below which a shard will be considered for shrinking.
+      This overrides the ``shard_shrink_point`` configuration option, which
+      expressed this as a percentage of ``shard_container_threshold``.
+      ``shard_shrink_point`` is now deprecated.
+
+    * Similar to above, ``expansion_limit`` was added as an absolute-size
+      replacement for the now-deprecated ``shard_shrink_merge_point``
+      configuration option.
+fixes:
+  - |
+    Sharding improvements:
+
+    * When building a listing from shards, any failure to retrieve
+      listings will result in a 503 response. Previously, failures
+      fetching a particular shard would result in a gap in listings.
+
+    * Container-server logs now include the shard path in the referer
+      field when receiving stat updates.
+
+    * Added a new config option, ``rows_per_shard``, to specify how many
+      objects should be in each shard when scanning for ranges. The default
+      is ``shard_container_threshold / 2``, preserving existing behavior.
+
+    * Added a new config option, ``minimum_shard_size``. When scanning
+      for shard ranges, if the final shard would otherwise contain
+      fewer than this many objects, the previous shard will instead
+      be expanded to the end of the namespace (and so may contain up
+      to ``rows_per_shard + minimum_shard_size`` objects). This reduces
+      the number of small shards generated. The default value is
+      ``rows_per_shard / 5``.
+
+    * The sharder now correctly identifies and fails audits for shard
+      ranges that overlap exactly.
+
+    * The sharder and swift-manage-shard-ranges now consider total row
+      count (instead of just object count) when deciding whether a shard
+      is a candidate for shrinking.
+
+    * If the sharder encounters shard range gaps while cleaving, it will
+      now log an error and halt sharding progress. Previously, rows may
+      not have been moved properly, leading to data loss.
+
+    * Sharding cycle time and last-completion time are now available via
+      swift-recon.
+
+    * Fixed an issue where resolving overlapping shard ranges via shrinking
+      could prematurely mark created or cleaved shards as active.
+
+  - |
+    S3 API improvements:
+
+    * Added an option, ``ratelimit_as_client_error``, to return 429s for
+      rate-limited responses. Several clients/SDKs seem to support
+      retries with backoffs on 429, and treating it as a client error
+      cleans up logging and metrics. By default, Swift will respond 503,
+      matching AWS documentation.
+
+    * Fixed a server error in bucket listings when ``s3_acl`` is enabled
+      and staticweb is configured for the container.
+
+    * Fixed a server error when a client exceeds ``client_timeout`` during an
+      upload. Now, a ``RequestTimeout`` error is correctly returned.
+
+    * Fixed a server error when downloading multipart uploads/static large
+      objects that have missing or inaccessible segments. This is a state
+      that cannot arise in AWS, so a new ``BrokenMPU`` error is returned,
+      indicating that retrying the request is unlikely to succeed.
+
+    * Fixed several issues with the prefix, marker, and delimiter
+      parameters that would be mirrored back to clients when listing
+      buckets.
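+
+    As a sketch of the ``ratelimit_as_client_error`` option mentioned above,
+    assuming the usual ``[filter:s3api]`` section of proxy-server.conf::
+
+        [filter:s3api]
+        # return 429 instead of the default 503 for rate-limited requests
+        ratelimit_as_client_error = true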
+
+  - |
+    Partition power increase fixes:
+
+    * The relinker now performs eventlet-hub selection the same way as
+      other daemons. In particular, ``epolls`` will no longer be selected,
+      as it seemed to cause occasional hangs.
+
+    * Partitions that encountered errors during relinking are no longer
+      marked as completed in the relinker state file. This ensures that
+      a subsequent relink will retry the failed partitions.
+
+    * Partition cleanup is more robust, decreasing the likelihood of
+      leaving behind mostly-empty partitions from the old partition
+      power.
+
+    * Improved relinker progress logging, and started collecting
+      progress information for swift-recon.
+
+    * Cleanup is more robust to files and directories being deleted by
+      another process.
+
+    * The relinker better handles data found from earlier partition power
+      increases.
+
+    * The relinker better handles tombstones found for the same object
+      but with different inodes.
+
+    * The reconciler now defers working on policies that have a partition
+      power increase in progress to avoid issues with concurrent writes.
+
+  - |
+    Erasure coding fixes:
+
+    * Added the ability to quarantine EC fragments that have no (or few)
+      other fragments in the cluster. A new configuration option,
+      ``quarantine_threshold``, in the reconstructor controls the point at
+      which a fragment will be quarantined; the default (0) will never
+      quarantine. Only fragments older than ``quarantine_age`` (default:
+      ``reclaim_age``) may be quarantined. Before quarantining, the
+      reconstructor will attempt to fetch fragments from handoff nodes
+      in addition to the usual primary nodes; a new ``request_node_count``
+      option (default ``2 * replicas``) limits the total number of nodes to
+      contact. An example appears after this list.
+
+    * Added a delay before deleting non-durable data. A new configuration
+      option, ``commit_window`` in the ``[DEFAULT]`` section of
+      object-server.conf, adjusts this delay; the default is 60 seconds. This
+      improves the durability of both back-dated PUTs (from the reconciler or
+      container-sync, for example) and fresh writes to handoffs by preventing
+      the reconstructor from deleting data that the object-server was still
+      writing.
+
+    * Improved proxy-server and object-reconstructor logging when data
+      cannot be reconstructed.
+
+    * Fixed an issue where metadata applied to some but not all fragments
+      could prevent reconstruction of missing fragments.
+
+    * Server-side copying of erasure-coded data to a replicated policy no
+      longer copies EC sysmeta. The previous behavior had no material
+      effect, but could confuse operators examining data on disk.
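+
+    A minimal sketch of the quarantine and ``commit_window`` options
+    described above (values are illustrative, and the reconstructor options
+    are assumed to live in an ``[object-reconstructor]`` section of
+    object-server.conf)::
+
+        [DEFAULT]
+        # delay before the reconstructor may delete non-durable data
+        commit_window = 60
+
+        [object-reconstructor]
+        # non-zero enables quarantining of isolated fragments (0 = never)
+        quarantine_threshold = 1
+        # only fragments older than this may be quarantined; defaults
+        # to reclaim_age
+        quarantine_age = 604800
+        # total number of nodes to contact while looking for other fragments
+        request_node_count = 2 * replicas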
+
+  - |
+    Python 3 fixes:
+
+    * Fixed a server error when performing a PUT authorized via
+      tempurl with some proxy pipelines.
+
+    * Fixed a server error during GET of a symlink with some proxy
+      pipelines.
+
+    * Fixed an issue with logging setup when /dev/log doesn't exist
+      or is not a UNIX socket.
+
+  - |
+    The dark-data audit watcher now skips objects younger than a new
+    configurable ``grace_age`` period. This avoids issues where data
+    could be flagged, quarantined, or deleted because of listing
+    consistency issues. The default is one week.
+
+  - |
+    The dark-data audit watcher now requires that all primary locations
+    for an object's container agree that the data does not appear in
+    listings to consider data "dark". Previously, a network partition
+    that left an object node isolated could cause it to quarantine or
+    delete all of its data.
+
+  - |
+    ``EPIPE`` errors no longer log tracebacks.
+
+  - |
+    The account and container auditors now log and update recon before
+    going to sleep.
+
+  - |
+    The object-expirer logs fewer client disconnects.
+
+  - |
+    ``swift-recon-cron`` now includes the last time it was run in the recon
+    information.
+
+  - |
+    ``EIO`` errors during read now cause object diskfiles to be quarantined.
+
+  - |
+    The formpost middleware now properly supports uploading multiple files
+    with different content-types.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_29_0_release-af71f7efd73109b0.yaml b/releasenotes/notes/2_29_0_release-af71f7efd73109b0.yaml
new file mode 100644
index 0000000000..0a14ffcece
--- /dev/null
+++ b/releasenotes/notes/2_29_0_release-af71f7efd73109b0.yaml
@@ -0,0 +1,167 @@
+---
+features:
+  - |
+    S3 API improvements
+
+    * CORS preflights are now allowed for pre-signed URLs.
+
+    * The ``storage_domain`` option now accepts a comma-separated list of
+      storage domains. This allows multiple storage domains to be configured
+      for use with virtual-host style addressing.
+
+    * Reduced the overhead of retrieving bucket and object ACLs.
+
+  - |
+    Replication, reconstruction, and diskfile improvements
+
+    * The reconstructor now uses the replication network to fetch fragments
+      for reconstruction.
+
+    * Added the ability to limit how many objects per handoff partition
+      will be reverted in a reconstructor cycle using the new
+      ``max_objects_per_revert`` option. This may be useful to reduce
+      ssync timeouts and lock contention, ensuring that progress is made
+      during rebalances.
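+
+    For example (value is illustrative; the option is assumed to live in
+    the usual ``[object-reconstructor]`` section of object-server.conf)::
+
+        [object-reconstructor]
+        # cap how many objects are reverted per handoff partition per cycle
+        max_objects_per_revert = 1000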
+
+  - |
+    Object updater improvements
+
+    * Added the ability to ratelimit updates (approximately) per-container
+      using the new ``max_objects_per_container_per_second`` option. This may
+      be used to limit requests to already-overloaded containers while still
+      making progress on updates to other containers.
+
+    * Added timing stats by response code.
+
+    * Updates are now sent over the replication network.
+
+  - |
+    Memcache improvements
+
+    * Added the ability to configure a chance to skip checking memcache when
+      querying shard ranges. This allows some fraction of traffic to go to
+      disk and refresh memcache before the key ages out. Recommended values
+      for the new ``container_updating_shard_ranges_skip_cache_pct`` and
+      ``container_listing_shard_ranges_skip_cache_pct`` options are in the
+      range of 0.0 to 0.1.
+
+    * Added stats for shard range cache hits, misses, and skips.
+
+  - |
+    Added object-reconstructor stats to recon.
+
+  - |
+    Added a new ``swift.common.registry`` module. This includes helper
+    functions ``register_sensitive_header`` and ``register_sensitive_param``
+    which third party middleware authors may use to flag headers and query
+    parameters for redaction when logging. For more information, see `the
+    documentation `__.
+
+  - |
+    Added the ability to configure project-scope read-only roles for
+    keystoneauth using the new ``project_reader_roles`` option.
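+
+    A minimal sketch (the role name is illustrative; the option is assumed
+    to live in the usual ``[filter:keystoneauth]`` section of
+    proxy-server.conf)::
+
+        [filter:keystoneauth]
+        project_reader_roles = ProjectReader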
+
+  - |
+    The ``cname_lookup`` middleware now works with dnspython 2.0 and later.
+
+  - |
+    The internal clients used by the container-reconciler, container-sharder,
+    container-sync, and object-expirer daemons now use a more-descriptive
+    ``-ic`` log name, rather than ``swift``. If you previously
+    configured the ``log_name`` option in ``internal-client.conf``, you must
+    now use the ``set log_name = `` syntax to configure it, even if
+    no value is set in the ``[DEFAULT]`` section. This may be done prior to
+    upgrading.
+
+  - |
+    Removed translations from most logging.
+
+deprecations:
+  - |
+    The ``StatsdClient.set_prefix`` method is now deprecated and
+    may be removed in a future release; by extension, so is the
+    ``LogAdapter.set_statsd_prefix`` method. Middleware developers should
+    use the ``statsd_tail_prefix`` argument to ``get_logger`` instead.
+
+fixes:
+  - |
+    S3 API fixes
+
+    * Fixed the types of configured values in ``/info`` response.
+
+    * Fixed a server error when trying to copy objects with non-ASCII names.
+
+    * Fixed a server error when uploading objects with very long names.
+      A ``KeyTooLongError`` is now returned.
+
+    * Fixed an error when multi-deleting MPUs when SLO async-deletes
+      are enabled.
+
+    * Fixed an error that allowed list-uploads and list-parts requests to
+      return incomplete or out-of-order results.
+
+    * Fixed several bugs when dealing with non-ASCII object names and
+      multipart uploads.
+
+  - |
+    Replication, reconstruction, and diskfile fixes
+
+    * Ensure that non-durable data and .meta files are purged from handoffs
+      after syncing.
+
+    * Fixed tracebacks when there's a race to mark a file durable or delete it.
+
+    * Improved cooperative multitasking during ssync.
+
+    * Upon detecting a ring change, the reconstructor now only aborts the
+      jobs for that ring and continues processing jobs for other rings.
+
+    * Fixed a traceback when logging about a lock timeout in the replicator.
+
+  - |
+    Fixed a security issue where tempurl and s3api signatures were logged in
+    full. This allowed an attacker with access to log data to perform replay
+    attacks, potentially accessing or overwriting cluster data. Now, such
+    signatures are redacted in a manner similar to auth tokens; see the
+    ``reveal_sensitive_prefix`` option in ``proxy-server.conf``.
+
+    See CVE-2017-8761 for more information.
+
+  - |
+    Fixed a race condition where swift would attempt to quarantine
+    recently-deleted object updates.
+
+  - |
+    Improved handling of timeouts and other errors when obtaining a
+    connection to memcached.
+
+  - |
+    The ``swift-recon`` tool now queries each object-server IP only once
+    when reporting disk usage. Previously, each port in the ring would be
+    queried; when using servers-per-port, this could dramatically overstate
+    the disk capacity in the cluster.
+
+  - |
+    Fixed a bug that allowed some statsd metrics to be annotated with the
+    wrong backend layer.
+
+  - |
+    Fixed a traceback in the account-server when there's no account
+    database on disk to receive a container update. The account-server
+    now correctly 404s.
+
+  - |
+    The container-updater will quarantine container databases if all
+    replicas for the account respond 404.
+
+  - |
+    Fixed a proxy-server error when the read-only middleware tried to
+    handle non-Swift paths (such as may be used by third-party middleware).
+
+  - |
+    Some client behaviors that the proxy previously logged at warning have
+    been lowered to info.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_29_1_release-a2962252523d9396.yaml b/releasenotes/notes/2_29_1_release-a2962252523d9396.yaml
new file mode 100644
index 0000000000..537c4be945
--- /dev/null
+++ b/releasenotes/notes/2_29_1_release-a2962252523d9396.yaml
@@ -0,0 +1,41 @@
+---
+deprecations:
+  - |
+    This is the final stable branch that will support Python 2.7.
+
+fixes:
+  - |
+    Fixed s3v4 signature calculation when the client sends an un-encoded
+    path in the request.
+
+  - |
+    Fixed multiple issues in s3api involving Multipart Uploads with
+    non-ASCII names.
+
+  - |
+    The object-updater now defers rate-limited updates to the end of its
+    cycle; these deferred updates will be processed (at the limited rate)
+    until the configured ``interval`` elapses. A new ``max_deferred_updates``
+    option may be used to bound the deferral queue.
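+
+    For example (value is illustrative; the option is assumed to live in
+    the usual ``[object-updater]`` section of object-server.conf)::
+
+        [object-updater]
+        # bound the number of rate-limited updates deferred each cycle
+        max_deferred_updates = 10000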
+
+  - |
+    Empty account and container partition directories are now cleaned up
+    immediately after replication, rather than needing to wait for an
+    additional replication cycle.
+
+  - |
+    The object-expirer now only cleans up empty containers. Previously, it
+    would attempt to delete all processed containers, regardless of whether
+    there were entries which were skipped or had errors.
+
+  - |
+    A new ``item_size_warning_threshold`` option may be used to monitor for
+    values that are approaching the limit of what can be stored in memcache.
+    See the memcache sample config for more information.
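+
+    As a sketch only (both the section and the value are assumptions; the
+    memcache sample config is authoritative)::
+
+        [memcache]
+        # warn when a cached value approaches this many bytes
+        item_size_warning_threshold = 1000000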
+
+  - |
+    Internal clients now correctly use their configured ``User-Agent`` in
+    backend requests, rather than only using it for logging.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_30_0_release-642778c3010848db.yaml b/releasenotes/notes/2_30_0_release-642778c3010848db.yaml
new file mode 100644
index 0000000000..e918df16cf
--- /dev/null
+++ b/releasenotes/notes/2_30_0_release-642778c3010848db.yaml
@@ -0,0 +1,167 @@
+---
+features:
+  - |
+    Sharding improvements
+
+    * The ``swift-manage-shard-ranges`` tool has a new mode to repair gaps
+      in the namespace.
+
+    * Metrics are now emitted for whether databases used for cleaving
+      were created or already existed, allowing a better understanding
+      of the reason for handoffs in the cluster.
+
+    * Misplaced-record stats are now also emitted to statsd. Previously,
+      these were only available in logs.
+
+  - |
+    Logging improvements
+
+    * The message template for proxy logging may now include a
+      ``{domain}`` field for the client-provided ``Host`` header.
+
+    * Added a ``log_rsync_transfers`` option to the object-replicator.
+      Set it to false to disable logging rsync "send" lines; during
+      large rebalances, such logging can overwhelm log aggregation
+      while providing little useful information.
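+
+    For example, assuming the usual ``[object-replicator]`` section of
+    object-server.conf::
+
+        [object-replicator]
+        # silence per-file rsync "send" log lines during large rebalances
+        log_rsync_transfers = false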
+
+  - |
+    The formpost digest algorithm is now configurable via the new
+    ``allowed_digests`` option, and support is added for both SHA-256
+    and SHA-512. Supported formpost digests are exposed to clients in
+    ``/info``. Additionally, formpost signatures can now be base64 encoded.
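+
+    A minimal sketch (the space-separated list format is an assumption;
+    the option is expected in the usual ``[filter:formpost]`` section of
+    proxy-server.conf)::
+
+        [filter:formpost]
+        # accept (and advertise in /info) only SHA-2 signatures
+        allowed_digests = sha256 sha512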
+
+  - |
+    Added metrics to the formpost and tempurl middlewares to monitor
+    digest usage in signatures.
+
+  - |
+    Improved compatibility with certain FIPS-mode-enabled systems.
+
+  - |
+    Added a ``ring_ip`` option for various object services. This may be
+    used to find a node's own devices in the ring in a containerized
+    environment where the ``bind_ip`` may not appear in the ring at all.
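+
+    A minimal sketch (the IP is illustrative; the option is assumed to sit
+    alongside ``bind_ip`` in the ``[DEFAULT]`` section of
+    object-server.conf)::
+
+        [DEFAULT]
+        bind_ip = 0.0.0.0
+        # IP to look for in the ring, instead of bind_ip
+        ring_ip = 192.0.2.10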
+
+  - |
+    Account and container replicators can now be configured with a
+    ``handoff_delete`` option, similar to object replicators and
+    reconstructors. See the sample config for more information.
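+
+    For example (value is illustrative; see the sample config for the exact
+    semantics)::
+
+        [account-replicator]
+        handoff_delete = 2
+
+        [container-replicator]
+        handoff_delete = 2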
+
+  - |
+    Developers using Swift's memcache client may now opt in to having
+    a ``MemcacheConnectionError`` be raised when no connection succeeded
+    using a new ``raise_on_error`` keyword argument to ``get``/``set``.
+
+  - |
+    Device names are now included in new database IDs. This provides more
+    context when examining incoming/outgoing sync tables or sharding
+    CleaveContexts.
+
+deprecations:
+  - |
+    SHA-1 signatures are now deprecated for the formpost and tempurl
+    middlewares. At some point in the future, SHA-1 will no longer be
+    enabled by default; eventually, support for it will be removed
+    entirely.
+
+security:
+  - |
+    Constant-time string comparisons are now used when checking S3 API signatures.
+
+  - |
+    Fixed a socket leak when clients try to delete a non-SLO as though
+    it were a Static Large Object.
+
+fixes:
+  - |
+    Sharding improvements
+
+    * Misplaced tombstone records are now properly cleaved.
+
+    * Fixed a bug where the sharder could fail to find a device to use for
+      cleaving.
+
+    * Databases marked deleted are now processed by the sharder.
+
+    * More information is now synced to the fresh database when sharding.
+      Previously, a database could lose the fact that it had been marked
+      as deleted.
+
+    * Shard ranges with no rows to cleave could previously be left in the
+      CREATED state after cleaving. Now, they are advanced to CLEAVED.
+
+  - |
+    S3 API improvements
+
+    * Fixed cross-policy object copies. Previously, copied data would
+      always be written using the source container's policy. Now, the
+      destination container's policy will be used, avoiding availability
+      issues and unnecessary container-reconciler work.
+
+    * More headers are now copied from multi-part upload markers to their
+      completed objects, including ``Content-Encoding``.
+
+    * When running with ``s3_acl`` disabled, ``bucket-owner-full-control`` and
+      ``bucket-owner-read`` canned ACLs will be translated to the same Swift
+      ACLs as ``private``.
+
+    * The S3 ACL and Delete Multiple APIs are now less case-sensitive.
+
+    * Improved the error message when deleting a bucket that's ever had
+      versioning enabled and still has versions in it.
+
+    * ``LastModified`` timestamps in listings are now rounded up to whole
+      seconds, like they are in responses from AWS.
+
+    * Proxy logging for Complete Multipart Upload requests is now more
+      consistent when requests have been retried.
+
+  - |
+    Logging improvements
+
+    * Signal handling is more consistently logged at notice level.
+      Previously, signal handling would sometimes be logged at info
+      or error levels.
+
+    * The object-replicator now logs successful rsync transfers at debug
+      instead of info.
+
+    * Transaction IDs are now only included in daemon log lines
+      in a request/response context.
+
+  - |
+    The tempurl middleware has been updated to return a 503 if storing a
+    token in memcache fails. Third party authentication middlewares are
+    encouraged to also use the new ``raise_on_error`` keyword argument
+    when storing ephemeral tokens in memcache.
+
+  - |
+    Database replication connections are now closed following an error
+    or timeout. This prevents a traceback in some cases when the replicator
+    tries to reuse the connection.
+
+  - |
+    ``ENOENT`` and ``ENODATA`` errors are better handled in the object
+    replicator and auditor.
+
+  - |
+    Improved object update throughput by shifting some shard range
+    filtering from Python to SQL.
+
+  - |
+    Include ``Vary: Origin`` header when CORS responses vary by origin.
+
+  - |
+    The staticweb middleware now allows empty listings at the root of
+    a container. Previously, this would result in a 404 response.
+
+  - |
+    Ring builder output tables better display weights over 1000.
+
+  - |
+    Various other minor bug fixes and improvements.
+
+other:
+  - |
+    Pickle support has been removed from Swift's memcache client. Support
+    had been deprecated since Swift 1.7.0.
diff --git a/releasenotes/notes/2_31_0_release-77e6b20dfba3b32c.yaml b/releasenotes/notes/2_31_0_release-77e6b20dfba3b32c.yaml
new file mode 100644
index 0000000000..ae5d96a637
--- /dev/null
+++ b/releasenotes/notes/2_31_0_release-77e6b20dfba3b32c.yaml
@@ -0,0 +1,118 @@
+---
+features:
+  - |
+    Added support for Python 3.10.
+
+  - |
+    Added an optional ``backend_ratelimit`` middleware for backend servers.
+    See the backend server sample configuration files for more information.
+
+  - |
+    Sharding improvements
+
+    * Added a ``merge`` subcommand to ``swift-manage-shard-ranges`` to merge
+      arbitrary shard ranges into a container DB. Minimal safety checks
+      are performed; it should only be used for emergency shard range
+      manipulation by expert users.
+
+    * Warnings are now emitted when sharding appears to have become stuck.
+      Use the new ``container_sharding_timeout`` option to configure the
+      "stuck" threshold; the default is 48 hours.
+
+  - |
+    Metrics improvements
+
+    * Added timing stats for memcached operations.
+
+    * Renamed and improved the granularity of shard range cache and
+      backend stats. Metrics dashboards may need to be updated.
+
+    * Emit stats when backend nodes are error-limited.
+
+  - |
+    Added the ability to configure a chance to skip checking memcache when
+    querying account and container information. This allows some fraction
+    of traffic to go to disk and refresh memcache before the key ages out.
+    Recommended values for the new ``account_existence_skip_cache_pct`` and
+    ``container_existence_skip_cache_pct`` options are in the range of
+    0.0 to 0.01.
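+
+    For example, using values in the recommended range (the options are
+    assumed to live in the usual ``[app:proxy-server]`` section of
+    proxy-server.conf)::
+
+        [app:proxy-server]
+        account_existence_skip_cache_pct = 0.01
+        container_existence_skip_cache_pct = 0.01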
+
+  - |
+    Absolute-form request targets are now accepted. This enables access for
+    certain clients and SDKs (including some older versions of rclone that
+    were using an old version of aws-sdk-go).
+
+upgrade:
+  - |
+    Static large object segments may now be deleted asynchronously by
+    default. Operators may return to the old behavior by disabling the
+    ``allow_async_delete`` option in the ``[filter:slo]`` section
+    in their proxy-server.conf.
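+
+    For example, to return to the old behavior::
+
+        [filter:slo]
+        allow_async_delete = false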
+
+security:
+  - |
+    Fixed a security issue in how ``s3api`` handles XML parsing that allowed
+    authenticated S3 clients to read arbitrary files from proxy servers.
+    Refer to `CVE-2022-47950 `__
+    for more information.
+
+fixes:
+  - |
+    S3 API improvements
+
+    * Fixed a server error when handling malformed CompleteMultipartUpload
+      requests.
+
+    * Improved error reporting when attempting to set invalid ``X-Delete-At``
+      or ``X-Delete-After`` values via the S3 API.
+
+  - |
+    Sharding improvements
+
+    * Sync more shard ranges from the root database to the shards. This
+      helps ensure shard range repairs effected at the root make their way
+      to shards that would otherwise be stuck trying to further divide
+      into sub-shards.
+
+    * Improved performance of ``delimiter`` listings for sharded containers.
+
+    * Added more safety checks to the ``repair`` subcommand of
+      ``swift-manage-shard-ranges``.
+
+    * Better handle ``EOFError`` and ``KeyboardInterrupt`` when prompting for
+      input in ``swift-manage-shard-ranges``.
+
+    * Stop warning about transient overlaps when auditing shard ranges.
+
+  - |
+    Fixed a path-rewriting bug introduced in Python 3.7.14, 3.8.14, 3.9.14,
+    and 3.10.6 that could cause some ``domain_remap`` requests to be routed to
+    the wrong object.
+
+  - |
+    Fixed a server error when attempting to access data in a deleted
+    container that had an erasure-coded storage policy.
+
+  - |
+    Improved error messages to clients that encounter errors using the
+    ``formpost`` middleware.
+
+  - |
+    Removed some inappropriate error-suppression when locking account and
+    container databases.
+
+  - |
+    Improved server start-up time when using multiple workers.
+
+  - |
+    Removed some unnecessary locking when logging.
+
+  - |
+    Added some basic object-metadata validation; invalid diskfiles will be
+    quarantined via the auditor or reconstructor.
+
+  - |
+    Enhanced logging when error-limiting a backend node.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/2_31_1_release-20ccd07e32b91c1f.yaml b/releasenotes/notes/2_31_1_release-20ccd07e32b91c1f.yaml
new file mode 100644
index 0000000000..9461721d06
--- /dev/null
+++ b/releasenotes/notes/2_31_1_release-20ccd07e32b91c1f.yaml
@@ -0,0 +1,37 @@
+---
+fixes:
+  - |
+    Sharding fixes
+
+    * Shards no longer report stats to the root database when they are in
+      the ``CREATED`` state.
+
+    * Sharding metadata is no longer cleared when databases are deleted.
+      Previously, this could cause deleted shards that still had rows to
+      become stuck, never moving those rows to the correct database.
+
+    * Fixed a performance regression in the handling of misplaced objects.
+
+    * Swift path and on-disk path are now included with all sharder logging.
+
+  - |
+    ``s3token`` no longer mangles request paths that include the Access Key ID.
+
+  - |
+    User metadata is now exposed via CORS when encryption is enabled,
+    matching the behavior when encryption is not enabled.
+
+  - |
+    Fewer backend requests are now required when account or container
+    information is missing from memcache.
+
+  - |
+    Fixed logging of IP and port in the proxy-server; in particular,
+    internal clients now correctly log about the replication IP/port.
+
+  - |
+    Fixed a bug in the object replicator that would cause an under-reporting
+    of failures.
+
+  - |
+    Various other minor bug fixes.
diff --git a/releasenotes/notes/2_32_0_release-39c8fb77a0a3e72d.yaml b/releasenotes/notes/2_32_0_release-39c8fb77a0a3e72d.yaml
new file mode 100644
index 0000000000..240daa5a17
--- /dev/null
+++ b/releasenotes/notes/2_32_0_release-39c8fb77a0a3e72d.yaml
@@ -0,0 +1,122 @@
+---
+features:
+  - |
+    Python 3.11 is now supported.
+
+  - |
+    Added the ability for reseller admins to set per-policy account quotas by
+    posting metadata of the form ``X-Account-Quota-Bytes-Policy-``.
+
+  - |
+    Added a ``keepalive_timeout`` option to the proxy server to limit how long
+    to wait for a client to initiate a request, separate from the general
+    ``client_timeout`` option. Note that this requires eventlet 0.33.4
+    (currently unreleased) or later.
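+
+    A minimal sketch (both the value and the ``[DEFAULT]`` placement are
+    assumptions; check the proxy-server sample config)::
+
+        [DEFAULT]
+        # seconds to wait for a client to start its next request
+        keepalive_timeout = 5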
+
+  - |
+    Added a ``keep_cache_slo_manifest`` option to the object server to better
+    control whether SLO manifests are dropped from the page cache.
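+
+    For example (assuming ``true`` keeps manifests cached, and that the
+    option lives in the usual ``[app:object-server]`` section of
+    object-server.conf)::
+
+        [app:object-server]
+        keep_cache_slo_manifest = true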
+
+  - |
+    WSGI servers now accept a ``--test-config`` option that may be used to
+    validate configuration changes before reloading/restarting the server.
+
+  - |
+    Metrics improvements:
+
+    * Metrics are now emitted for a variety of S3 error responses, in the
+      form ``s3api..[.]``
+
+    * Account and container info metrics now include the response status code
+      when backend requests are made.
+
+    * Added timing metrics to the container sharder for various operations.
+
+  - |
+    A variety of performance improvements have been made for sharded
+    container databases.
+
+  - |
+    Various logging and metrics improvements when talking to memcache.
+
+  - |
+    Improved formatting of meta and sysmeta for ``swift-account-info`` and
+    ``swift-container-info``.
+
+upgrade:
+  - |
+    Previously, under some circumstances, a non-standard config option such
+    as ``RECLAIM_AGE`` might get parsed as ``reclaim_age`` for some processes
+    but ignored by others. Now, all config parsing is case-sensitive;
+    non-standard names will always be ignored.
+
+  - |
+    The structure of cached shard ranges has changed, improving performance
+    when listing or writing to sharded containers. Note that immediately
+    after upgrade, the new structures will all be cache misses, which may
+    lead to a thundering herd problem. To avoid this, upgrade just a few
+    nodes first, let them service some fraction of traffic to populate the
+    cache, then upgrade the rest of the cluster.
+
+deprecations:
+  - |
+    Removed the hard dependency on netifaces; it may still be used if the
+    ``getifaddrs`` C function is not available. This fallback support may be
+    removed in a future release.
+
+fixes:
+  - |
+    Python 3 fixes:
+
+    * Python 3 object servers can now read unencrypted non-ASCII metadata
+      that was written under Python 2.
+
+    * Ssync no longer corrupts unencrypted non-ASCII metadata during
+      transfers.
+
+    * Fixed an encoding issue when writing non-ASCII object names to sharded
+      containers and shard range caching is not enabled.
+
+    * Fixed an encoding issue when handling non-ASCII account names.
+
+    * Fixed a ``generator already executing`` error on client disconnect.
+
+    * Suppressed ``RemoteDisconnected`` tracebacks.
+
+  - |
+    Fixed an issue that prevented proxy servers from emitting metrics and
+    logs for backend requests made when getting account or container info.
+
+  - |
+    Fixed ssync's handling of timestamp offsets. Previously, this could cause
+    ssync to fail with a 409 Conflict, causing the transfer to fail and
+    preventing handoffs from clearing.
+
+  - |
+    Fixed an issue where an erasure-coded PUT could prevent other requests
+    from being processed when network calls rarely or never blocked.
+
+  - |
+    Fixed an issue where downloading an SLO manifest could hit a recoverable
+    error and attempt to resume from another node. This would manifest as
+    either a pyeclib decode error or an unexpected empty response.
+
+  - |
+    The proxy server now applies error-limiting to the correct node when
+    handling a recoverable node error.
+
+  - |
+    Account, container, and object log fields are now correctly identified
+    when returning ``BadDigest`` responses to S3 requests.
+
+  - |
+    Reduced the backend load of making ``?versions`` requests to a container
+    that has never had object versioning enabled.
+
+  - |
+    The standard-library ``logging`` module is no longer monkey-patched when
+    importing ``swift.common.utils``, making it easier to re-use swift code
+    in other contexts.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/3_33_0_release-d208917f5012cedd.yaml b/releasenotes/notes/3_33_0_release-d208917f5012cedd.yaml
new file mode 100644
index 0000000000..d5a44bb980
--- /dev/null
+++ b/releasenotes/notes/3_33_0_release-d208917f5012cedd.yaml
@@ -0,0 +1,129 @@
+---
+features:
+  - |
+    Prefix-based tempurls may now be used to explore staticweb
+    listings within that prefix. Note that this opens a new ability
+    to list containers from tempurls, but only if staticweb's
+    ``X-Container-Meta-Web-Listings`` is enabled.
+
+  - |
+    When generating index pages from listings, staticweb now sends an HTML5
+    doctype. This makes them `PEP 503 `__
+    compliant, allowing their `continued
+    use `__ for simple Python
+    package repositories.
+
+  - |
+    Added basic read support for S3 object locking. This improves
+    compatibility with an Ansible S3 module. Write support is not
+    yet implemented, so get-object-lock-configuration will always 404.
+
+  - |
+    Added a ``swift-reload`` command to assist with safely reloading WSGI
+    servers.
+
+  - |
+    Daemons now send ``STOPPING`` and ``RELOADING`` systemd notifications
+    when the service is configured with ``Type=notify``.
+
+  - |
+    Added more metrics to the container-server, allowing GET and PUT timings
+    to be broken out for listings, shard range operations, and container
+    creation.
+
+  - |
+    Added a counter metric to the proxy server when caching shard ranges.
+
+  - |
+    ``swift-account-info`` and ``swift-container-info`` now accept a ``--sync``
+    flag to show information from the incoming/outgoing sync tables.
+
+  - |
+    Several fixes to prepare for Python 3.12 support. While not yet tested
+    in the gate, initial manual testing looks promising.
+
+  - |
+    Added support for recent versions of eventlet.
+
+fixes:
+  - |
+    S3 API fixes:
+
+    * When the ``+segments`` container's storage policy differs from that of
+      the primary container, completed manifests are now written with the
+      correct policy in the primary container.
+
+    * If there's a conflict deleting the in-progress-upload marker when
+      completing a multipart-upload, a 503 is now returned to the client,
+      prompting it to retry.
+
+    * Added ``Accept-Ranges: bytes`` to object responses. Range requests
+      have always been supported; now, that support is properly advertised.
+
+  - |
+    Static large object fixes:
+
+    * Fixed a server error when handling conditional GET requests.
+
+    * Return an error if the SLO manifest could not be parsed. Previously,
+      a zero-byte response was returned.
+
+  - |
+    Proxy server fixes:
+
+    * Added a new ``swift.proxy_logging_status`` request environment key that
+      middlewares may use to override the logged status for a request.
+
+    * Transaction IDs are included in more error responses.
+
+    * The ``recoverable_node_timeout`` option no longer applies to
+      ``X-Newest`` GET requests.
+
+    * Improved error-handling in multi-part range responses.
+
+  - |
+    Sharding fixes:
+
+    * Prevent resets of a shard range's epoch.
+
+    * Cleaned up ``X-Backend-*`` headers in listing responses.
+
+    * Reduced the frequency of ``Reclaimable db stuck waiting for shrinking``
+      messages when a root DB has been deleted but its shards have not been
+      shrunk away.
+
+    * The more-efficient shard range structure from the last release is now
+      used when fetching ranges from the backend.
+
+    * Include more information in shard-replication warnings.
+
+  - |
+    Object server fixes:
+
+    * Object POSTs and chunked PUTs are no longer accepted when the target
+      drive is already past its ``fallocate_reserve``. DELETEs are still
+      allowed.
+
+    * Added the ability to configure cooperative yielding when servicing
+      GET responses, via the ``cooperative_period`` option. See the example
+      config for more information.
+
+    * Invalid ``hashes.invalid`` entries are now ignored, rather than
+      causing a complete partition rehash.
+
+  - |
+    Per-service ``auto_create_account_prefix`` configuration options have
+    been removed. These options were deprecated in favor of ``swift.conf``
+    configuration in Swift 2.24.0, part of the OpenStack Ussuri release.
+
+  - |
+    Daemons send object updates via the replication network in more cases.
+
+  - |
+    The dark-data object audit watcher now works with sharded containers.
+    Previously, it would think that all data files were absent from
+    listings.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/release-2.34.0-2136ae35f56f8b5a.yaml b/releasenotes/notes/release-2.34.0-2136ae35f56f8b5a.yaml
new file mode 100644
index 0000000000..56af53eae7
--- /dev/null
+++ b/releasenotes/notes/release-2.34.0-2136ae35f56f8b5a.yaml
@@ -0,0 +1,117 @@
+---
+features:
+  - |
+    Middleware features:
+
+    * The static large object (SLO) middleware now supports including
+      a ``?part-number=`` query parameter to request just part of a
+      large object. This may be used to enable efficient parallel
+      downloads. For more information, see `the documentation
+      `__.
+
+    * The S3 API middleware now supports the ``?partNumber=`` query
+      parameter, enabling parallel downloads.
+
+    * The KMS keymaster now supports overriding the endpoint returned
+      in the Keystone catalog via the ``barbican_endpoint`` configuration
+      option. This may be useful in multi-region deployments which have
+      multiple endpoints.
+
+    * The backend ratelimiter now supports dynamic reloading of limits.
+      The new configuration options ``backend_ratelimit_conf_path`` and
+      ``config_reload_interval`` control which file is reloaded and how
+      frequently, respectively.
+
+    * The backend ratelimiter now supports per-method, per-device
+      ratelimits. See `etc/backend-ratelimit.conf-sample
+      `__
+      for more information.
+
+    * The account quota middleware now supports object-count quotas
+      in addition to byte-count quotas, similar to the container
+      quota middleware. For more information, see `the documentation
+      `__.
+
+  - |
+    Object expiration improvements:
+
+    * Added per-account and per-container reaping delays. These may be
+      used to offer some grace period in which to recover expired objects.
+
+    * Added a proxy-server configuration option: ``allow_open_expired``.
+      This defaults to false; if true, clients may interact with expired
+      objects by including an ``X-Open-Expired: true`` header in GET, HEAD,
+      or POST requests.
+
+    * Expiring object queue entries now include the size of the object to
+      be expired in the ``swift_expirer_bytes`` parameter of the queue entry's
+      content-type.
+
+    * Added metrics to count skipped, delayed, and assigned tasks as
+      they're enumerated.
+
+  - |
+    S3 API error response reasons are now logged as part of the ``log_info``
+    field. This can be especially useful when diagnosing HEAD
+    failures, which necessarily have no response body.
+
+  - |
+    Python 3.12 is now supported.
+
+  - |
+    ``swift-account-info`` now supports the ``--sync`` option to display the
+    contents of the incoming and outgoing sync tables, similar to
+    ``swift-container-info``.
+
+  - |
+    The ``swift-drive-audit`` tool now works with ISO timestamps in kernel
+    logs.
+
+upgrade:
+  - |
+    Dependency update: lxml must be at least 4.2.3.
+
+deprecations:
+  - |
+    Overall account byte quotas should now be set with the
+    ``X-Account-Quota-Bytes`` header. The ``X-Account-Meta-Quota-Bytes``
+    header is now deprecated.
+
+fixes:
+  - |
+    S3 API fixes:
+
+    * Fixed a server error when using non-ASCII access key IDs.
+
+    * Fixed several checksum-related error responses to be more AWS-like.
+
+  - |
+    Using an ``X-Remove-Account-Quota-Bytes-Policy-`` header
+    now removes the per-policy quota, rather than reducing it to zero.
+
+  - |
+    Proxy-server fixes:
+
+    * Object POSTs now 503 rather than trusting the 404s that might be
+      returned from handoffs when primaries are overloaded.
+
+    * Client disconnects should always be logged within the context of
+      the appropriate client request. Previously, there were some cases
+      where logging would occur during general garbage collection, leading
+      to incorrect or missing transaction IDs in logs.
+
+    * The proxy-logging middleware now emits timing and transfer stats for
+      more requests such as auth requests. These will be labeled ``UNKNOWN``
+      rather than ``account``, ``container``, etc.
+
+    * Fixed a server error when the Swift request path has missing account
+      or container components.
+
+  - |
+    ``EUCLEAN`` errors are better handled on the object server.
+
+  - |
+    The ``swift-recon-cron`` tool now better handles missing directories.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/release-2.35.0-bb2736e6cbc4a520.yaml b/releasenotes/notes/release-2.35.0-bb2736e6cbc4a520.yaml
new file mode 100644
index 0000000000..96ea5da750
--- /dev/null
+++ b/releasenotes/notes/release-2.35.0-bb2736e6cbc4a520.yaml
@@ -0,0 +1,109 @@
+---
+features:
+  - |
+    Account listings now include storage policy information for the
+    containers listed.
+
+  - |
+    Added a new object-expirer configuration option,
+    ``round_robin_task_cache_size``, to adjust the number of tasks to cache
+    before processing. This may be used to trade faster expirer start-up for
+    more lumpy container-server load.
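+
+    For example (value is illustrative; the option is assumed to live in
+    the usual ``[object-expirer]`` section)::
+
+        [object-expirer]
+        # number of expiration tasks to cache before processing begins
+        round_robin_task_cache_size = 100000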
+
+  - |
+    Added a ``stale_worker_timeout`` configuration option to the WSGI
+    servers. Once this time elapses following a reload, the manager
+    process will issue SIGKILLs to any remaining stale workers.
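+
+    A minimal sketch (both the value and the ``[DEFAULT]`` placement are
+    assumptions; see the WSGI server sample configs)::
+
+        [DEFAULT]
+        # SIGKILL any stale workers this many seconds after a reload
+        stale_worker_timeout = 86400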
+
+  - |
+    Object updater observability improvements:
+
+    * Added last start time to recon dumps as ``object_updater_last``.
+
+    * Added information (including target account/container) from oldest
+      failed updates to recon dumps, both per-device and aggregated for the
+      node. Use the new ``async_tracker_max_entries`` and
+      ``async_tracker_dump_count`` options to adjust how many records to
+      collect.
+
+  - |
+    Added the option to tune down ETag validation in the object-server
+    during full-object reads. By default, every full read will continue
+    to have its ETag validated as bytes are streamed to the proxy-server.
+    The ``etag_validate_pct`` option may be used to configure approximately
+    what percentage of full-object reads should be validated; reducing this
+    can improve performance when object-servers are CPU-constrained.
+
+    Partial reads continue to never have their ETag validated in the
+    object-server. The object-auditor continues to periodically validate
+    every object's ETag.
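+
+    For example, to validate roughly one in ten full-object reads (the
+    ``[app:object-server]`` placement is an assumption)::
+
+        [app:object-server]
+        etag_validate_pct = 10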
+
+deprecations:
+  - |
+    The object-expirer configuration options
+    ``expiring_objects_container_divisor`` and
+    ``expiring_objects_account_name`` are now deprecated.
+
+    If a cluster was deployed with a non-standard account name, operators
+    should remove the option from all configs so they are using a supported
+    configuration going forward, but will need to deploy stand-alone expirer
+    processes with legacy expirer config to clean up old expiration tasks
+    from the previously configured account name.
+
+fixes:
+  - |
+    Removed the use of ``eval`` in the xprofile middleware. Note that this
+    middleware is only intended for development purposes and is not
+    intended for use in production systems.
+
+  - |
+    The S3 API no longer requires a ``Content-MD5`` header for ``DeleteObjects``
+    requests when using v4 signatures.
+
+  - |
+    Uploads to containers with object versioning enabled now require a
+    ``Content-Length`` or ``Transfer-Encoding: chunked`` header, similar to
+    other containers.
+
+  - |
+    Fixed a server error when deleting a specific version via the S3 API.
+
+  - |
+    ``X-Open-Expired`` now works properly with ``?part-number`` requests.
+
+  - |
+    Fixed ``Content-Type`` and ``Content-Length`` headers in account and
+    container ``HEAD`` responses (with eventlet>=0.38.0).
+
+  - |
+    Object expiration fixes:
+
+    * The object-expirer now better handles errors during listings.
+
+    * The object-expirer now respects the ``internal_client_conf_path``
+      configuration option in legacy ``object-expirer.conf`` configs.
+
+  - |
+    Improved the reliability of ``swift-reload``.
+
+  - |
+    Separated unlinks-due-to-outdated-updates from
+    unlinks-due-to-fully-processed-updates in logged object-updater stats.
+
+  - |
+    The ``cooperative_period`` option for the object-server now affects PUT
+    requests in a way similar to GET responses.
+
+  - |
+    Fixed an issue with the object-reconstructor that would prevent
+    reconstruction of objects with non-ASCII header names.
+
+  - |
+    Fixed an issue with the container-reconciler that could cause reconciler
+    databases to have conflicting rows that could not be resolved.
+
+  - |
+    Removed use of the deprecated cgi module.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/release-2.36.0-9f4b539db40bffd7.yaml b/releasenotes/notes/release-2.36.0-9f4b539db40bffd7.yaml
new file mode 100644
index 0000000000..16eb603e2f
--- /dev/null
+++ b/releasenotes/notes/release-2.36.0-9f4b539db40bffd7.yaml
@@ -0,0 +1,88 @@
+---
+features:
+  - |
+    S3 API
+
+    * Added support for aws-chunked transfers. Recent AWS clients
+      default to this mode. See also:
+      `Transferring Payload in Multiple Chunks (Chunked Upload)
+      `__.
+
+    * Added support for verifying additional checksums during upload. All
+      algorithms currently supported by AWS are supported: CRC64NVME,
+      CRC32, CRC32C, SHA1, and SHA256. See also: `Checking object integrity
+      in Amazon S3
+      `__.
+      Note that some algorithms require the availability of additional
+      libraries: ISA-L or anycrc.
+
+    * Added support for create-without-overwrite conditional writes.
+
+  - |
+    Let clients request heartbeats during COPYs by including
+    the query parameter ``heartbeat=on``.
+
+    With heartbeating turned on, the proxy will start its response
+    immediately with 202 Accepted then send a single whitespace
+    character periodically until the request completes. At that
+    point, a final summary chunk will be sent which includes a
+    ``Response Status`` key indicating success or failure.
+
+  - |
+    Labeled metrics
+
+    * Added support for emitting labeled statsd metrics in a variety of
+      formats. Middleware authors should see `the LabeledStatsdClient documentation
+      `__
+      for more information.
+
+    * Instrumented proxy-logging with labeled metrics. See
+      ``proxy-server.conf-sample`` for more information.
+
+    * Instrumented the object-server with labeled metrics. See
+      ``object-server.conf-sample`` for more information.
+
+  - |
+    Added ``access_user_id`` logging field; out-of-tree auth middlewares should
+    use ``environ['swift.access_logging']['user_id']`` to populate this field.
+
+  - |
+    Introduced an extensible ring format. This allows both more than 65,536
+    devices and more data structures to be in a ring. For more information,
+    see `Ring File Formats -- Ring v2
+    `__.
+
+  - |
+    Python 3.13 (with the GIL enabled) is now supported. Free-threaded
+    builds remain untested.
+upgrade:
+  - |
+    Removed support for Python 3.6.
+
+  - |
+    Removed support for pickled ring files. These have not been written
+    since Swift 1.7.0.
+deprecations:
+  - |
+    Flamingo (2025.2) will be the final stable release to support Python 3.7 and 3.8.
+fixes:
+  - |
+    S3 API
+
+    * Fixed HTTP framing issues when returning errors for a request with
+      ``Expect: 100-continue``.
+
+    * Improved various error messages to better imitate AWS responses.
+
+  - |
+    SSYNC connections are now promptly terminated when subrequests timeout.
+
+  - |
+    Fixed a recursion error in the account-quota middleware.
+
+  - |
+    Fixed an error in ``invalidate_hash`` when the partition is deleted while
+    waiting for the partition lock.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/notes/release-2.37.0-7a89cc30f85f03e0.yaml b/releasenotes/notes/release-2.37.0-7a89cc30f85f03e0.yaml
new file mode 100644
index 0000000000..bb729ad43f
--- /dev/null
+++ b/releasenotes/notes/release-2.37.0-7a89cc30f85f03e0.yaml
@@ -0,0 +1,102 @@
+---
+features:
+  - |
+    The s3token middleware now passes service auth tokens to Keystone
+    if credentials are provided. This is required to enable S3 API
+    access for Keystone users when using Keystone >25.0.0, !=26.0.0,
+    !=26.0.1, !=27.0.0, !=28.0.0. See etc/proxy-server.conf-sample for
+    configuration details. For more information, see
+    `OSSA-2025-002 `__ and
+    `bug #2119646 `__.
+
+  - |
+    The s3token middleware now caches credential secrets for one minute
+    by default, if credentials are provided. Secret-caching typically
+    reduces the load on Keystone and is required for Keystone users to
+    be able to use signed aws-chunked transfers. To return to prior
+    behavior, explicitly set ``secret_cache_duration = 0`` in the
+    ``[filter:s3api]`` section of your proxy-server.conf.
+
+  - |
+    The KMS keymaster now supports selecting the endpoint returned in the
+    Keystone catalog via the ``barbican_region_name`` configuration option.
+    This may be useful in multi-region deployments which have multiple
+    endpoints.
+
+  - |
+    The request line-length limit is now configurable for all WSGI servers
+    via the ``max_request_line`` option in the ``[swift-constraints]`` section
+    of swift.conf. By default, continue to use eventlet's default of 8192
+    bytes.
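+
+    For example, to double the limit::
+
+        [swift-constraints]
+        max_request_line = 16384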
+
+  - |
+    The following new metrics were added when using labeled metrics:
+
+      * The proxy-logging middleware may now emit real-time transfer metrics.
+        See the ``statsd_emit_buffer_xfer_bytes_seconds`` option in
+        etc/proxy-server.conf-sample for more information.
+
+      * The proxy-logging middleware now includes an ``api`` label whose value
+        may be ``swift`` or ``S3`` depending on whether the client request is
+        serviced by the swift API or S3 API.
+
+      * The s3api middleware now emits a counter recording the usage of
+        various protocol-related headers.
+
+      * The container-sharder now emits a timing metric for the length of
+        time between shard range creation and cleaving.
+
+  - |
+    ``swift-manage-shard-ranges`` now defaults to committing pending
+    updates before looking for shard range boundaries. A new option,
+    ``--skip-commits``, may be used to restore previous behavior.
+
+  - |
+    Added a ``--clobber-hardlink-collisions`` option to
+    ``swift-object-relinker``. With this option enabled, during the relink
+    phase the relinker will quarantine the colliding file in the new target
+    part dir and retry the relink. During the cleanup phase, it will ignore
+    the unmatched inode "collision" and allow the cleanup of the old file
+    in the old part dir, similar to tombstones.
+
+upgrade:
+  - |
+    Removed fallback support using netifaces; ``getifaddrs`` is now always
+    used to determine available IP addresses.
+
+fixes:
+  - |
+    Improved checksum validation for S3 API DeleteObjects requests.
+
+  - |
+    POST requests are more likely to receive a 503 response in the
+    face of backend inconsistencies.
+
+  - |
+    Writes to sharded containers are less likely to have their updates
+    sent to the root container. This uses a new cooperative-token
+    mechanism to limit the number of concurrent shard range queries
+    to the root container; see the ``[app:proxy-server]`` section of
+    etc/proxy-server.conf-sample for configuration options.
+
+  - |
+    Fixed the ``swift_dir`` option for WSGI servers; the file
+    ``/etc/swift/swift.conf`` no longer needs to exist when that option
+    is set.
+
+  - |
+    Fixed an object-server error when there is a part-power increase in
+    progress and there was an issue marking the file in the new partition
+    space as durable.
+
+  - |
+    Device names are now included in sharded database IDs, similar to
+    regular databases. This provides more context when examining
+    incoming/outgoing sync tables or sharding CleaveContexts.
+
+  - |
+    Database replicators now clean up temporary files older than
+    ``reclaim_age``.
+
+  - |
+    Various other minor bug fixes and improvements.
diff --git a/releasenotes/source/2023.1.rst b/releasenotes/source/2023.1.rst
new file mode 100644
index 0000000000..2c9a36fae4
--- /dev/null
+++ b/releasenotes/source/2023.1.rst
@@ -0,0 +1,6 @@
+===========================
+2023.1 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: unmaintained/2023.1
diff --git a/releasenotes/source/2023.2.rst b/releasenotes/source/2023.2.rst
new file mode 100644
index 0000000000..a4838d7d0e
--- /dev/null
+++ b/releasenotes/source/2023.2.rst
@@ -0,0 +1,6 @@
+===========================
+2023.2 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: stable/2023.2
diff --git a/releasenotes/source/2024.1.rst b/releasenotes/source/2024.1.rst
new file mode 100644
index 0000000000..6896656be6
--- /dev/null
+++ b/releasenotes/source/2024.1.rst
@@ -0,0 +1,6 @@
+===========================
+2024.1 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: unmaintained/2024.1
diff --git a/releasenotes/source/2024.2.rst b/releasenotes/source/2024.2.rst
new file mode 100644
index 0000000000..aaebcbc8c3
--- /dev/null
+++ b/releasenotes/source/2024.2.rst
@@ -0,0 +1,6 @@
+===========================
+2024.2 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: stable/2024.2
diff --git a/releasenotes/source/2025.1.rst b/releasenotes/source/2025.1.rst
new file mode 100644
index 0000000000..3add0e53aa
--- /dev/null
+++ b/releasenotes/source/2025.1.rst
@@ -0,0 +1,6 @@
+===========================
+2025.1 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: stable/2025.1
diff --git a/releasenotes/source/2025.2.rst b/releasenotes/source/2025.2.rst
new file mode 100644
index 0000000000..4dae18d869
--- /dev/null
+++ b/releasenotes/source/2025.2.rst
@@ -0,0 +1,6 @@
+===========================
+2025.2 Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: stable/2025.2
diff --git a/releasenotes/source/conf.py b/releasenotes/source/conf.py
new file mode 100644
index 0000000000..401a53ab98
--- /dev/null
+++ b/releasenotes/source/conf.py
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# swift documentation build configuration file, created by
+# sphinx-quickstart on Mon Oct  3 17:01:55 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+import datetime
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'reno.sphinxext',
+    'openstackdocstheme',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+# templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Swift Release Notes'
+copyright = '%d, OpenStack Foundation' % datetime.datetime.now().year
+
+# Release notes do not need a version number in the title, they
+# cover multiple releases.
+# The short X.Y version.
+version = ''
+# The full version, including alpha/beta/rc tags.
+release = ''
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+# language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'native'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+# todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'openstackdocs'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = u'swift v2.10.0'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or
+# 32x32 pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'SwiftReleaseNotesdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+# latex_elements = {
+#      # The paper size ('letterpaper' or 'a4paper').
+#      #
+#      # 'papersize': 'letterpaper',
+
+#      # The font size ('10pt', '11pt' or '12pt').
+#      #
+#      # 'pointsize': '10pt',
+
+#      # Additional stuff for the LaTeX preamble.
+#      #
+#      # 'preamble': '',
+
+#      # Latex figure (float) alignment
+#      #
+#      # 'figure_align': 'htbp',
+# }
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+# latex_documents = [
+#     (master_doc, 'swift.tex', u'swift Documentation',
+#      u'swift', 'manual'),
+# ]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+# man_pages = [
+#     (master_doc, 'swift', u'swift Documentation',
+#      [author], 1)
+# ]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+# texinfo_documents = [
+#     (master_doc, 'swift', u'swift Documentation',
+#      author, 'swift', 'One line description of project.',
+#      'Miscellaneous'),
+# ]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
+
+locale_dirs = ['locale/']
+
+# -- Options for openstackdocstheme -------------------------------------------
+openstackdocs_repo_name = 'openstack/swift'
+openstackdocs_auto_name = False
+openstackdocs_bug_project = 'swift'
+openstackdocs_bug_tag = ''
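+
+# Build note (illustrative only, not part of this change; the output path is
+# an assumption): with the reno and openstackdocstheme extensions configured
+# above, these notes are typically rendered with a plain Sphinx invocation
+# along the lines of:
+#
+#   sphinx-build -W -b html releasenotes/source releasenotes/build/html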
diff --git a/releasenotes/source/current.rst b/releasenotes/source/current.rst
new file mode 100644
index 0000000000..87a748f242
--- /dev/null
+++ b/releasenotes/source/current.rst
@@ -0,0 +1,5 @@
+====================================
+ Current (Unreleased) Release Notes
+====================================
+
+.. release-notes::
diff --git a/releasenotes/source/index.rst b/releasenotes/source/index.rst
new file mode 100644
index 0000000000..823ce4a5fe
--- /dev/null
+++ b/releasenotes/source/index.rst
@@ -0,0 +1,27 @@
+=====================
+ Swift Release Notes
+=====================
+
+.. toctree::
+   :maxdepth: 1
+
+   current
+   2025.2
+   2025.1
+   2024.2
+   2024.1
+   2023.2
+   2023.1
+   zed
+   yoga
+   xena
+   wallaby
+   victoria
+   ussuri
+   train
+   stein
+   rocky
+   queens
+   pike
+   ocata
+   newton
diff --git a/releasenotes/source/locale/en_GB/LC_MESSAGES/releasenotes.po b/releasenotes/source/locale/en_GB/LC_MESSAGES/releasenotes.po
new file mode 100644
index 0000000000..1cb7513bad
--- /dev/null
+++ b/releasenotes/source/locale/en_GB/LC_MESSAGES/releasenotes.po
@@ -0,0 +1,4207 @@
+# Andi Chandler , 2017. #zanata
+# Andi Chandler , 2018. #zanata
+# Andi Chandler , 2020. #zanata
+# Andi Chandler , 2022. #zanata
+# Andi Chandler , 2023. #zanata
+# Andi Chandler , 2024. #zanata
+msgid ""
+msgstr ""
+"Project-Id-Version: Swift Release Notes\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2025-07-07 19:19+0000\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"PO-Revision-Date: 2024-10-01 01:42+0000\n"
+"Last-Translator: Andi Chandler \n"
+"Language-Team: English (United Kingdom)\n"
+"Language: en_GB\n"
+"X-Generator: Zanata 4.3.3\n"
+"Plural-Forms: nplurals=2; plural=(n != 1)\n"
+
+msgid ""
+"**Dependency update**: ``eventlet`` must be at least 0.25.0. This also "
+"dragged forward minimum-supported versions of ``dnspython`` (1.15.0), "
+"``greenlet`` (0.3.2), and ``six`` (1.10.0)."
+msgstr ""
+"**Dependency update**: ``eventlet`` must be at least 0.25.0. This also "
+"dragged forward minimum-supported versions of ``dnspython`` (1.15.0), "
+"``greenlet`` (0.3.2), and ``six`` (1.10.0)."
+
+msgid ""
+"**Dependency updates**: we've increased our minimum supported version of "
+"``cryptography`` to 2.0.2 and ``netifaces`` to 0.8. This is largely due to "
+"the difficulty of continuing to test with the old versions."
+msgstr ""
+"**Dependency updates**: we've increased our minimum supported version of "
+"``cryptography`` to 2.0.2 and ``netifaces`` to 0.8. This is largely due to "
+"the difficulty of continuing to test with the old versions."
+
+msgid ""
+"**If your cluster has encryption enabled and is still running Swift under "
+"Python 2**, we recommend upgrading Swift *before* transitioning to Python 3. "
+"Otherwise, new writes to objects with non-ASCII characters in their paths "
+"may result in corrupted downloads when read from a proxy-server still "
+"running old swift on Python 2. See `bug 1888037 `__ for more information."
+msgstr ""
+"**If your cluster has encryption enabled and is still running Swift under "
+"Python 2**, we recommend upgrading Swift *before* transitioning to Python 3. "
+"Otherwise, new writes to objects with non-ASCII characters in their paths "
+"may result in corrupted downloads when read from a proxy-server still "
+"running old swift on Python 2. See `bug 1888037 `__ for more information."
+
+msgid ""
+"**If your cluster has encryption enabled and is still running Swift under "
+"Python 2**, we recommend upgrading Swift *before* transitioning to Python 3. "
+"Otherwise, new writes to objects with non-ASCII characters in their paths "
+"may result in corrupted downloads when read from a proxy-server still "
+"running old swift on Python 2. See `bug 1888037 `__ for more information. Note that new tags including a "
+"fix for the bug are planned for all maintained stable branches; upgrading to "
+"any one of those should be sufficient to ensure a smooth upgrade to the "
+"latest Swift."
+msgstr ""
+"**If your cluster has encryption enabled and is still running Swift under "
+"Python 2**, we recommend upgrading Swift *before* transitioning to Python 3. "
+"Otherwise, new writes to objects with non-ASCII characters in their paths "
+"may result in corrupted downloads when read from a proxy-server still "
+"running old swift on Python 2. See `bug 1888037 `__ for more information. Note that new tags including a "
+"fix for the bug are planned for all maintained stable branches; upgrading to "
+"any one of those should be sufficient to ensure a smooth upgrade to the "
+"latest Swift."
+
+msgid ""
+"**If your cluster is configured with a separate replication network**, note "
+"that background daemons will switch to using this network for all traffic. "
+"If your account, container, or object replication servers are configured "
+"with ``replication_server = true``, these daemons may log a flood of ``405 "
+"Method Not Allowed`` messages during a rolling upgrade. To avoid this, "
+"comment out the option and restart replication servers before upgrading."
+msgstr ""
+"**If your cluster is configured with a separate replication network**, note "
+"that background daemons will switch to using this network for all traffic. "
+"If your account, container, or object replication servers are configured "
+"with ``replication_server = true``, these daemons may log a flood of ``405 "
+"Method Not Allowed`` messages during a rolling upgrade. To avoid this, "
+"comment out the option and restart replication servers before upgrading."
+
+msgid "0 for success"
+msgstr "0 for success"
+
+msgid "1 for an unexpected outcome"
+msgstr "1 for an unexpected outcome"
+
+msgid "2 for invalid options"
+msgstr "2 for invalid options"
+
+msgid "2.10.0"
+msgstr "2.10.0"
+
+msgid "2.10.1"
+msgstr "2.10.1"
+
+msgid "2.10.2"
+msgstr "2.10.2"
+
+msgid "2.11.0"
+msgstr "2.11.0"
+
+msgid "2.12.0"
+msgstr "2.12.0"
+
+msgid "2.13.0"
+msgstr "2.13.0"
+
+msgid "2.13.1"
+msgstr "2.13.1"
+
+msgid "2.13.1-12"
+msgstr "2.13.1-12"
+
+msgid "2.14.0"
+msgstr "2.14.0"
+
+msgid "2.15.0"
+msgstr "2.15.0"
+
+msgid "2.15.1"
+msgstr "2.15.1"
+
+msgid "2.15.2"
+msgstr "2.15.2"
+
+msgid "2.16.0"
+msgstr "2.16.0"
+
+msgid "2.17.0"
+msgstr "2.17.0"
+
+msgid "2.17.1"
+msgstr "2.17.1"
+
+msgid "2.18.0"
+msgstr "2.18.0"
+
+msgid "2.19.0"
+msgstr "2.19.0"
+
+msgid "2.19.1"
+msgstr "2.19.1"
+
+msgid "2.19.2"
+msgstr "2.19.2"
+
+msgid "2.20.0"
+msgstr "2.20.0"
+
+msgid "2.21.0"
+msgstr "2.21.0"
+
+msgid "2.21.1"
+msgstr "2.21.1"
+
+msgid "2.22.0"
+msgstr "2.22.0"
+
+msgid "2.23.0"
+msgstr "2.23.0"
+
+msgid "2.23.1"
+msgstr "2.23.1"
+
+msgid "2.23.2"
+msgstr "2.23.2"
+
+msgid "2.23.3"
+msgstr "2.23.3"
+
+msgid "2.24.0"
+msgstr "2.24.0"
+
+msgid "2.25.0"
+msgstr "2.25.0"
+
+msgid "2.25.1"
+msgstr "2.25.1"
+
+msgid "2.26.0"
+msgstr "2.26.0"
+
+msgid "2.27.0"
+msgstr "2.27.0"
+
+msgid "2.28.0"
+msgstr "2.28.0"
+
+msgid "2.28.1"
+msgstr "2.28.1"
+
+msgid "2.29.1"
+msgstr "2.29.1"
+
+msgid "2.29.2"
+msgstr "2.29.2"
+
+msgid "2.30.0"
+msgstr "2.30.0"
+
+msgid "2.30.1"
+msgstr "2.30.1"
+
+msgid "2.31.1"
+msgstr "2.31.1"
+
+msgid "2.32.0"
+msgstr "2.32.0"
+
+msgid "2.33.0"
+msgstr "2.33.0"
+
+msgid "2.34.0"
+msgstr "2.34.0"
+
+msgid "2023.1 Series Release Notes"
+msgstr "2023.1 Series Release Notes"
+
+msgid "2023.2 Series Release Notes"
+msgstr "2023.2 Series Release Notes"
+
+msgid "2024.1 Series Release Notes"
+msgstr "2024.1 Series Release Notes"
+
+msgid "3 for user exit"
+msgstr "3 for user exit"
+
+msgid ""
+"A 'compact' command has been added to ``swift-manage-shard-ranges`` that "
+"enables sequences of contiguous shards with low object counts to be "
+"compacted into another existing shard, or into the root container."
+msgstr ""
+"A 'compact' command has been added to ``swift-manage-shard-ranges`` that "
+"enables sequences of contiguous shards with low object counts to be "
+"compacted into another existing shard, or into the root container."
+
+msgid ""
+"A PUT or POST to a container will now update the container's Last-Modified "
+"time, and that value will be included in a GET/HEAD response."
+msgstr ""
+"A PUT or POST to a container will now update the container's Last-Modified "
+"time, and that value will be included in a GET/HEAD response."
+
+msgid ""
+"A ``--no-auto-shard`` option has been added to ``swift-container-sharder``."
+msgstr ""
+"A ``--no-auto-shard`` option has been added to ``swift-container-sharder``."
+
+msgid ""
+"A comparable group, ``.reseller_reader``, is now available for development "
+"purposes when authenticating using tempauth."
+msgstr ""
+"A comparable group, ``.reseller_reader``, is now available for development "
+"purposes when authenticating using tempauth."
+
+msgid ""
+"A composite ring comprises two or more component rings that are combined to "
+"form a single ring with a replica count equal to the sum of the component "
+"rings. The component rings are built independently, using distinct devices "
+"in distinct regions, which means that the dispersion of replicas between the "
+"components can be guaranteed."
+msgstr ""
+"A composite ring comprises two or more component rings that are combined to "
+"form a single ring with a replica count equal to the sum of the component "
+"rings. The component rings are built independently, using distinct devices "
+"in distinct regions, which means that the dispersion of replicas between the "
+"components can be guaranteed."
+
+msgid ""
+"A new ``item_size_warning_threshold`` option may be used to monitor for "
+"values that are approaching the limit of what can be stored in memcache. See "
+"the memcache sample config for more information."
+msgstr ""
+"A new ``item_size_warning_threshold`` option may be used to monitor for "
+"values that are approaching the limit of what can be stored in memcache. See "
+"the memcache sample config for more information."
+
+msgid ""
+"A variety of performance improvements have been made for sharded container "
+"databases."
+msgstr ""
+"A variety of performance improvements have been made for sharded container "
+"databases."
+
+msgid "ACLs now work with unicode in user/account names."
+msgstr "ACLs now work with Unicode in user/account names."
+
+msgid ""
+"Accept a trade off of dispersion for balance in the ring builder that will "
+"result in getting to balanced rings much more quickly in some cases."
+msgstr ""
+"Accept a trade off of dispersion for balance in the ring builder that will "
+"result in getting to balanced rings much more quickly in some cases."
+
+msgid ""
+"Account and container databases will now be quarantined if the database "
+"schema has been corrupted."
+msgstr ""
+"Account and container databases will now be quarantined if the database "
+"schema has been corrupted."
+
+msgid ""
+"Account and container info metrics now include the response status code when "
+"backend requests are made."
+msgstr ""
+"Account and container info metrics now include the response status code when "
+"backend requests are made."
+
+msgid ""
+"Account and container replication stats logs now include ``remote_merges``, "
+"the number of times a whole database was sent to another node."
+msgstr ""
+"Account and container replication stats logs now include ``remote_merges``, "
+"the number of times a whole database was sent to another node."
+
+msgid ""
+"Account and container replicators can now be configured with a "
+"``handoff_delete`` option, similar to object replicators and reconstructors. "
+"See the sample config for more information."
+msgstr ""
+"Account and container replicators can now be configured with a "
+"``handoff_delete`` option, similar to object replicators and reconstructors. "
+"See the sample config for more information."
+
+msgid "Account quotas are now enforced even on empty accounts."
+msgstr "Account quotas are now enforced even on empty accounts."
+
+msgid ""
+"Account, container, and object log fields are now correctly identified when "
+"returning ``BadDigest`` responses to S3 requests."
+msgstr ""
+"Account, container, and object log fields are now correctly identified when "
+"returning ``BadDigest`` responses to S3 requests."
+
+msgid "Add Composite Ring Functionality"
+msgstr "Add Composite Ring Functionality"
+
+msgid "Add Vary headers for CORS responses."
+msgstr "Add Vary headers for CORS responses."
+
+msgid ""
+"Add ``databases_per_second`` to the account-replicator, container-"
+"replicator, and container-sharder. This prevents them from using a full CPU "
+"core when they are not IO limited."
+msgstr ""
+"Add ``databases_per_second`` to the account-replicator, container-"
+"replicator, and container-sharder. This prevents them from using a full CPU "
+"core when they are not IO limited."
+
+msgid ""
+"Add a ``--drop-prefixes`` flag to swift-account-info, swift-container-info, "
+"and swift-object-info. This makes the output between the three more "
+"consistent."
+msgstr ""
+"Add a ``--drop-prefixes`` flag to swift-account-info, swift-container-info, "
+"and swift-object-info. This makes the output between the three more "
+"consistent."
+
+msgid ""
+"Add a multiprocess mode to the object replicator. Setting the "
+"``replicator_workers`` setting to a positive value N will result in the "
+"replicator using up to N worker processes to perform replication tasks. At "
+"most one worker per disk will be spawned."
+msgstr ""
+"Add a multiprocess mode to the object replicator. Setting the "
+"``replicator_workers`` setting to a positive value N will result in the "
+"replicator using up to N worker processes to perform replication tasks. At "
+"most one worker per disk will be spawned."
+
+msgid ""
+"Add a new ``concurrent_ec_extra_requests`` option to allow the proxy to make "
+"some extra backend requests immediately. The proxy will respond as soon as "
+"there are enough responses available to reconstruct."
+msgstr ""
+"Add a new ``concurrent_ec_extra_requests`` option to allow the proxy to make "
+"some extra backend requests immediately. The proxy will respond as soon as "
+"there are enough responses available to reconstruct."
+
+msgid ""
+"Add basic read support for S3 object locking. This improves compatibility "
+"with an Ansible S3 module. Write support is not yet implemented, so get-"
+"object-lock-configuration will always 404."
+msgstr ""
+"Add basic read support for S3 object locking. This improves compatibility "
+"with an Ansible S3 module. Write support is not yet implemented, so get-"
+"object-lock-configuration will always 404."
+
+msgid ""
+"Add basic read support for object tagging. This improves compatibility with "
+"AWS CLI version 2. Write support is not yet implemented, so the tag set will "
+"always be empty."
+msgstr ""
+"Add basic read support for object tagging. This improves compatibility with "
+"AWS CLI version 2. Write support is not yet implemented, so the tag set will "
+"always be empty."
+
+msgid ""
+"Add basic support for ?versions bucket listings. We still do not have "
+"support for toggling S3 bucket versioning, but we can at least support "
+"getting the latest versions of all objects."
+msgstr ""
+"Add basic support for ?versions bucket listings. We still do not have "
+"support for toggling S3 bucket versioning, but we can at least support "
+"getting the latest versions of all objects."
+
+msgid "Add checksum to object extended attributes."
+msgstr "Add checksum to object extended attributes."
+
+msgid ""
+"Add fallocate_reserve to account and container servers. This allows disks "
+"shared between account/container and object rings to avoid getting 100% "
+"full. The default value of 1% matches the existing default on object servers."
+msgstr ""
+"Add fallocate_reserve to account and container servers. This allows disks "
+"shared between account/container and object rings to avoid getting 100% "
+"full. The default value of 1% matches the existing default on object servers."
+
+msgid "Add root containers with compactible ranges to recon cache."
+msgstr "Add root containers with compatible ranges to recon cache."
+
+msgid ""
+"Add slo_manifest_hook callback to allow other middlewares to impose "
+"additional constraints on or make edits to SLO manifests before being "
+"written. For example, a middleware could enforce minimum segment size or "
+"insert data segments."
+msgstr ""
+"Add slo_manifest_hook callback to allow other middlewares to impose "
+"additional constraints on or make edits to SLO manifests before being "
+"written. For example, a middleware could enforce minimum segment size or "
+"insert data segments."
+
+msgid ""
+"Add support for PROXY protocol v1 to the proxy server. This allows the Swift "
+"proxy server to log accurate client IP addresses when there is a proxy or "
+"SSL-terminator between the client and the Swift proxy server.  Example "
+"servers supporting this PROXY protocol include stunnel, haproxy, hitch, and "
+"varnish. See the sample proxy server config file for the appropriate config "
+"setting to enable or disable this functionality."
+msgstr ""
+"Add support for PROXY protocol v1 to the proxy server. This allows the Swift "
+"proxy server to log accurate client IP addresses when there is a proxy or "
+"SSL-terminator between the client and the Swift proxy server.  Example "
+"servers supporting this PROXY protocol include stunnel, HAProxy, hitch, and "
+"Varnish. See the sample proxy server config file for the appropriate config "
+"setting to enable or disable this functionality."
+
+msgid ""
+"Add support for multiple root encryption secrets for the trivial and KMIP "
+"keymasters. This allows operators to rotate encryption keys over time "
+"without needing to re-encrypt all existing data in the cluster. Please see "
+"the included sample config files for instructions on how to multiple "
+"encryption keys."
+msgstr ""
+"Add support for multiple root encryption secrets for the trivial and KMIP "
+"keymasters. This allows operators to rotate encryption keys over time "
+"without needing to re-encrypt all existing data in the cluster. Please see "
+"the included sample config files for instructions on how to multiple "
+"encryption keys."
+
+msgid ""
+"Add support to increase object ring partition power transparently to end "
+"users and with no cluster downtime. Increasing the ring part power allows "
+"for incremental adjustment to the upper bound of the cluster size. Please "
+"review the `full docs `__ for more information."
+msgstr ""
+"Add support to increase object ring partition power transparently to end "
+"users and with no cluster downtime. Increasing the ring part power allows "
+"for incremental adjustment to the upper bound of the cluster size. Please "
+"review the `full docs `__ for more information."
+
+msgid ""
+"Added \"audit watcher\" hooks to allow operators to run arbitrary code "
+"against every diskfile in a cluster. For more information, see `the "
+"documentation `__."
+msgstr ""
+"Added \"audit watcher\" hooks to allow operators to run arbitrary code "
+"against every diskfile in a cluster. For more information, see `the "
+"documentation `__."
+
+msgid ""
+"Added \"emergency mode\" hooks in the account and container replicators. "
+"These options may be used to prioritize moving handoff partitions to primary "
+"locations more quickly. This helps when adding capacity to a ring."
+msgstr ""
+"Added \"emergency mode\" hooks in the account and container replicators. "
+"These options may be used to prioritise moving handoff partitions to primary "
+"locations more quickly. This helps when adding capacity to a ring."
+
+msgid ""
+"Added \"static symlinks\", which perform some validation as they follow "
+"redirects and include more information about their target in container "
+"listings. For more information, see the `symlink middleware `__ section of the "
+"documentation."
+msgstr ""
+"Added \"static symlinks\", which perform some validation as they follow "
+"redirects and include more information about their target in container "
+"listings. For more information, see the `symlink middleware `__ section of the "
+"documentation."
+
+msgid ""
+"Added ``--swift-versions`` to ``swift-recon`` CLI to compare installed "
+"versions in the cluster."
+msgstr ""
+"Added ``--swift-versions`` to ``swift-recon`` CLI to compare installed "
+"versions in the cluster."
+
+msgid "Added ``-d `` and ``-p `` command line options."
+msgstr "Added ``-d `` and ``-p `` command line options."
+
+msgid ""
+"Added ``Accept-Ranges: bytes`` to object responses. Range requests have "
+"always been supported; now, that support is properly advertised."
+msgstr ""
+"Added ``Accept-Ranges: bytes`` to object responses. Range requests have "
+"always been supported; now, that support is properly advertised."
+
+msgid "Added ``tasks_per_second`` option to rate-limit the object-expirer."
+msgstr "Added ``tasks_per_second`` option to rate-limit the object-expirer."
+
+msgid ""
+"Added ``ttfb`` (Time to First Byte) and ``pid`` (Process ID) to the set of "
+"available proxy-server log fields. For more information, see `the "
+"documentation `__."
+msgstr ""
+"Added ``ttfb`` (Time to First Byte) and ``pid`` (Process ID) to the set of "
+"available proxy-server log fields. For more information, see `the "
+"documentation `__."
+
+msgid ""
+"Added ``usedforsecurity`` annotations for use on FIPS-compliant systems."
+msgstr ""
+"Added ``usedforsecurity`` annotations for use on FIPS-compliant systems."
+
+msgid ""
+"Added a \"user\" option to the drive-audit config file. Its value is used to "
+"set the owner of the drive-audit recon cache."
+msgstr ""
+"Added a \"user\" option to the drive-audit config file. Its value is used to "
+"set the owner of the drive-audit recon cache."
+
+msgid "Added a ``--dry-run`` option for the 'compact' command."
+msgstr "Added a ``--dry-run`` option for the 'compact' command."
+
+msgid ""
+"Added a ``--includes`` option for the 'show' command to only output shard "
+"ranges that may include a given object name."
+msgstr ""
+"Added a ``--includes`` option for the 'show' command to only output shard "
+"ranges that may include a given object name."
+
+msgid ""
+"Added a ``keep_cache_slo_manifest`` option to the object server to better "
+"control whether SLO manifests are dropped from the page cache."
+msgstr ""
+"Added a ``keep_cache_slo_manifest`` option to the object server to better "
+"control whether SLO manifests are dropped from the page cache."
+
+msgid ""
+"Added a ``keep_idle`` config option to configure KEEPIDLE time for TCP "
+"sockets. The default value is the old constant of 600."
+msgstr ""
+"Added a ``keep_idle`` config option to configure KEEPIDLE time for TCP "
+"sockets. The default value is the old constant of 600."
+
+msgid ""
+"Added a ``keepalive_timeout`` option to the proxy server to limit how long "
+"to wait for a client to initiate a request, separate from the general "
+"``client_timeout`` option. Note that this requires eventlet 0.33.4 "
+"(currently unreleased) or later."
+msgstr ""
+"Added a ``keepalive_timeout`` option to the proxy server to limit how long "
+"to wait for a client to initiate a request, separate from the general "
+"``client_timeout`` option. Note that this requires eventlet 0.33.4 "
+"(currently unreleased) or later."
+
+msgid ""
+"Added a ``log_rsync_transfers`` option to the object-replicator. Set it to "
+"false to disable logging rsync \"send\" lines; during large rebalances, such "
+"logging can overwhelm log aggregation while providing little useful "
+"information."
+msgstr ""
+"Added a ``log_rsync_transfers`` option to the object-replicator. Set it to "
+"false to disable logging rsync \"send\" lines; during large rebalances, such "
+"logging can overwhelm log aggregation while providing little useful "
+"information."
+
+msgid ""
+"Added a ``ring_ip`` option for various object services. This may be used to "
+"find own devices in the ring in a containerized environment where the "
+"``bind_ip`` may not appear in the ring at all."
+msgstr ""
+"Added a ``ring_ip`` option for various object services. This may be used to "
+"find own devices in the ring in a containerised environment where the "
+"``bind_ip`` may not appear in the ring at all."
+
+msgid ""
+"Added a ``swift-reload`` command to assist with safely reloading WSGI "
+"servers."
+msgstr ""
+"Added a ``swift-reload`` command to assist with safely reloading WSGI "
+"servers."
+
+msgid ""
+"Added a configurable URL base to staticweb, fixing issues when the "
+"accessible endpoint isn't known to the Swift cluster (eg http vs https)."
+msgstr ""
+"Added a configurable URL base to staticweb, fixing issues when the "
+"accessible endpoint isn't known to the Swift cluster (eg http vs https)."
+
+msgid "Added a configurable URL base to staticweb."
+msgstr "Added a configurable URL base to staticweb."
+
+msgid "Added a counter metric to the proxy server when caching shard ranges."
+msgstr "Added a counter metric to the proxy server when caching shard ranges."
+
+msgid ""
+"Added a delay before deleting non-durable data. A new configuration option, "
+"``commit_window`` in the ``[DEFAULT]`` section of object-server.conf, "
+"adjusts this delay; the default is 60 seconds. This improves the durability "
+"of both back-dated PUTs (from the reconciler or container-sync, for example) "
+"and fresh writes to handoffs by preventing the reconstructor from deleting "
+"data that the object-server was still writing."
+msgstr ""
+"Added a delay before deleting non-durable data. A new configuration option, "
+"``commit_window`` in the ``[DEFAULT]`` section of object-server.conf, "
+"adjusts this delay; the default is 60 seconds. This improves the durability "
+"of both back-dated PUTs (from the reconciler or container-sync, for example) "
+"and fresh writes to handoffs by preventing the reconstructor from deleting "
+"data that the object-server was still writing."
+
+msgid "Added a handoffs-only mode."
+msgstr "Added a handoffs-only mode."
+
+msgid ""
+"Added a new 'analyze' command to automatically identify overlapping shard "
+"ranges and recommend a resolution based on a JSON listing of shard ranges "
+"such as produced by the 'show' command."
+msgstr ""
+"Added a new 'analyze' command to automatically identify overlapping shard "
+"ranges and recommend a resolution based on a JSON listing of shard ranges "
+"such as produced by the 'show' command."
+
+msgid ""
+"Added a new 'repair' command to automatically identify and optionally "
+"resolve overlapping shard ranges."
+msgstr ""
+"Added a new 'repair' command to automatically identify and optionally "
+"resolve overlapping shard ranges."
+
+msgid ""
+"Added a new ``swift.proxy_logging_status`` request environment key that "
+"middlewares may use to override the logged status for a request."
+msgstr ""
+"Added a new ``swift.proxy_logging_status`` request environment key that "
+"middlewares may use to override the logged status for a request."
+
+msgid ""
+"Added a new config option, ``minimum_shard_size``. When scanning for shard "
+"ranges, if the final shard would otherwise contain fewer than this many "
+"objects, the previous shard will instead be expanded to the end of the "
+"namespace (and so may contain up to ``rows_per_shard + minimum_shard_size`` "
+"objects). This reduces the number of small shards generated. The default "
+"value is ``rows_per_shard / 5``."
+msgstr ""
+"Added a new config option, ``minimum_shard_size``. When scanning for shard "
+"ranges, if the final shard would otherwise contain fewer than this many "
+"objects, the previous shard will instead be expanded to the end of the "
+"namespace (and so may contain up to ``rows_per_shard + minimum_shard_size`` "
+"objects). This reduces the number of small shards generated. The default "
+"value is ``rows_per_shard / 5``."
+
+msgid ""
+"Added a new config option, ``rows_per_shard``, to specify how many objects "
+"should be in each shard when scanning for ranges. The default is "
+"``shard_container_threshold / 2``, preserving existing behavior."
+msgstr ""
+"Added a new config option, ``rows_per_shard``, to specify how many objects "
+"should be in each shard when scanning for ranges. The default is "
+"``shard_container_threshold / 2``, preserving existing behaviour."
+
+msgid ""
+"Added a new config option, ``shrink_threshold``, to specify the absolute "
+"size below which a shard will be considered for shrinking. This overrides "
+"the ``shard_shrink_point`` configuration option, which expressed this as a "
+"percentage of ``shard_container_threshold``. ``shard_shrink_point`` is now "
+"deprecated."
+msgstr ""
+"Added a new config option, ``shrink_threshold``, to specify the absolute "
+"size below which a shard will be considered for shrinking. This overrides "
+"the ``shard_shrink_point`` configuration option, which expressed this as a "
+"percentage of ``shard_container_threshold``. ``shard_shrink_point`` is now "
+"deprecated."
+
+msgid ""
+"Added a new middleware that allows users and operators to configure accounts "
+"and containers to use RFC-compliant (i.e., double-quoted) ETags. This may be "
+"useful when using Swift as an origin for some content delivery networks. For "
+"more information, see `the middleware documentation `__."
+msgstr ""
+"Added a new middleware that allows users and operators to configure accounts "
+"and containers to use RFC-compliant (i.e., double-quoted) ETags. This may be "
+"useful when using Swift as an origin for some content delivery networks. For "
+"more information, see `the middleware documentation `__."
+
+msgid ""
+"Added a new middleware to allow accounts and containers to opt-in to RFC-"
+"compliant ETags. For more information, see `the documentation `__. Clients should be aware of the fact that ETags may be "
+"quoted for RFC compliance; this may become the default behavior in some "
+"future release."
+msgstr ""
+"Added a new middleware to allow accounts and containers to opt-in to RFC-"
+"compliant ETags. For more information, see `the documentation `__. Clients should be aware of the fact that ETags may be "
+"quoted for RFC compliance; this may become the default behaviour in some "
+"future release."
+
+msgid ""
+"Added a new object versioning mode, with APIs for querying and accessing old "
+"versions. For more information, see `the documentation `__."
+msgstr ""
+"Added a new object versioning mode, with APIs for querying and accessing old "
+"versions. For more information, see `the documentation `__."
+
+msgid ""
+"Added a new optional proxy-logging field ``{wire_status_int}`` for the "
+"status code returned to the client. For more information, see `the "
+"documentation `__."
+msgstr ""
+"Added a new optional proxy-logging field ``{wire_status_int}`` for the "
+"status code returned to the client. For more information, see `the "
+"documentation `__."
+
+msgid ""
+"Added a proxy-server configuration option: ``allow_open_expired``. This "
+"defaults to false; if true, clients may intereact with expired objects by "
+"including an ``X-Open-Expired: true`` header in GET, HEAD, or POST requests."
+msgstr ""
+"Added a proxy-server configuration option: ``allow_open_expired``. This "
+"defaults to false; if true, clients may interact with expired objects by "
+"including an ``X-Open-Expired: true`` header in GET, HEAD, or POST requests."
+
+msgid ""
+"Added an experimental ``swift-ring-composer`` CLI tool to build composite "
+"rings."
+msgstr ""
+"Added an experimental ``swift-ring-composer`` CLI tool to build composite "
+"rings."
+
+msgid ""
+"Added an operator tool, ``swift-container-deleter``, to asynchronously "
+"delete some or all objects in a container using the object expirers."
+msgstr ""
+"Added an operator tool, ``swift-container-deleter``, to asynchronously "
+"delete some or all objects in a container using the object expirers."
+
+msgid "Added an option to drop privileges when running the relinker as root."
+msgstr "Added an option to drop privileges when running the relinker as root."
+
+msgid ""
+"Added an option to rate-limit how quickly data files are relinked or cleaned "
+"up. This may be used to reduce I/O load during partition power increases, "
+"improving end-user performance."
+msgstr ""
+"Added an option to rate-limit how quickly data files are relinked or cleaned "
+"up. This may be used to reduce I/O load during partition power increases, "
+"improving end-user performance."
+
+msgid ""
+"Added an option to write EC fragments with legacy CRC to ensure a smooth "
+"upgrade from liberasurecode<=1.5.0 to >=1.6.2. For more information, see "
+"`bug 1886088 `__."
+msgstr ""
+"Added an option to write EC fragments with legacy CRC to ensure a smooth "
+"upgrade from liberasurecode<=1.5.0 to >=1.6.2. For more information, see "
+"`bug 1886088 `__."
+
+msgid ""
+"Added an option, ``ratelimit_as_client_error``, to return 429s for rate-"
+"limited responses. Several clients/SDKs have seem to support retries with "
+"backoffs on 429, and having it as a client error cleans up logging and "
+"metrics. By default, Swift will respond 503, matching AWS documentation."
+msgstr ""
+"Added an option, ``ratelimit_as_client_error``, to return 429s for rate-"
+"limited responses. Several clients/SDKs have seem to support retries with "
+"backoffs on 429, and having it as a client error cleans up logging and "
+"metrics. By default, Swift will respond 503, matching AWS documentation."
+
+msgid ""
+"Added an optional ``read_only`` middleware to make an entire cluster or "
+"individual accounts read only."
+msgstr ""
+"Added an optional ``read_only`` middleware to make an entire cluster or "
+"individual accounts read only."
+
+msgid ""
+"Added container sharding, an operator controlled feature that may be used to "
+"shard very large container databases into a number of smaller shard "
+"containers. This mitigates the issues with one large DB by distributing the "
+"data across multiple smaller databases throughout the cluster. Please read "
+"the full overview at https://docs.openstack.org/swift/latest/"
+"overview_container_sharding.html"
+msgstr ""
+"Added container sharding, an operator controlled feature that may be used to "
+"shard very large container databases into a number of smaller shard "
+"containers. This mitigates the issues with one large DB by distributing the "
+"data across multiple smaller databases throughout the cluster. Please read "
+"the full overview at https://docs.openstack.org/swift/latest/"
+"overview_container_sharding.html"
+
+msgid "Added container/object listing with prefix to InternalClient."
+msgstr "Added container/object listing with prefix to InternalClient."
+
+msgid ""
+"Added metrics to count skipped, delayed, and assigned tasks as they're "
+"enumerated."
+msgstr ""
+"Added metrics to count skipped, delayed, and assigned tasks as they're "
+"enumerated."
+
+msgid ""
+"Added metrics to the formpost and tempurl middlewares to monitor digest "
+"usage in signatures."
+msgstr ""
+"Added metrics to the formpost and tempurl middlewares to monitor digest "
+"usage in signatures."
+
+msgid ""
+"Added more metrics to the container-server, allowing GET and PUT timings to "
+"be broken out for listings, shard range operations, and container creation."
+msgstr ""
+"Added more metrics to the container-server, allowing GET and PUT timings to "
+"be broken out for listings, shard range operations, and container creation."
+
+msgid ""
+"Added per-account and per-container reaping delays. These may be used to "
+"offer some grace period in which to recover expired objects."
+msgstr ""
+"Added per-account and per-container reaping delays. These may be used to "
+"offer some grace period in which to recover expired objects."
+
+msgid "Added support for Python 3.8."
+msgstr "Added support for Python 3.8."
+
+msgid "Added support for Python 3.9."
+msgstr "Added support for Python 3.9."
+
+msgid "Added support for S3 versioning using the above new mode."
+msgstr "Added support for S3 versioning using the above new mode."
+
+msgid "Added support for inline data segments in SLO manifests."
+msgstr "Added support for inline data segments in SLO manifests."
+
+msgid ""
+"Added support for per-policy proxy config options. This allows per-policy "
+"affinity options to be set for use with duplicated EC policies and composite "
+"rings. Certain options found in per-policy conf sections will override their "
+"equivalents that may be set in the [app:proxy-server] section. Currently the "
+"options handled that way are ``sorting_method``, ``read_affinity``, "
+"``write_affinity``, ``write_affinity_node_count``, and "
+"``write_affinity_handoff_delete_count``."
+msgstr ""
+"Added support for per-policy proxy config options. This allows per-policy "
+"affinity options to be set for use with duplicated EC policies and composite "
+"rings. Certain options found in per-policy conf sections will override their "
+"equivalents that may be set in the [app:proxy-server] section. Currently the "
+"options handled that way are ``sorting_method``, ``read_affinity``, "
+"``write_affinity``, ``write_affinity_node_count``, and "
+"``write_affinity_handoff_delete_count``."
+
+msgid "Added support for recent versions of eventlet."
+msgstr "Added support for recent versions of eventlet."
+
+msgid ""
+"Added support for retrieving the encryption root secret from an external key "
+"management system. In practice, this is currently limited to Barbican."
+msgstr ""
+"Added support for retrieving the encryption root secret from an external key "
+"management system. In practice, this is currently limited to Barbican."
+
+msgid ""
+"Added support for system-scoped \"reader\" roles when authenticating using "
+"Keystone. Operators may configure this using the ``system_reader_roles`` "
+"option in the ``[filter:keystoneauth]`` section of their proxy-server.conf."
+msgstr ""
+"Added support for system-scoped \"reader\" roles when authenticating using "
+"Keystone. Operators may configure this using the ``system_reader_roles`` "
+"option in the ``[filter:keystoneauth]`` section of their proxy-server.conf."
+
+msgid "Added symlink objects support."
+msgstr "Added symlink objects support."
+
+msgid ""
+"Added the ability for reseller admins to set per-policy account quotas by "
+"posting metadata of the form ``X-Account-Quota-Bytes-Policy-``."
+msgstr ""
+"Added the ability for reseller admins to set per-policy account quotas by "
+"posting metadata of the form ``X-Account-Quota-Bytes-Policy-``."
+
+msgid "Added the ability to configure auth region in s3token middleware."
+msgstr "Added the ability to configure auth region in s3token middleware."
+
+msgid ""
+"Added the ability to connect to memcached over TLS. See the ``tls_*`` "
+"options in etc/memcache.conf-sample"
+msgstr ""
+"Added the ability to connect to Memcached over TLS. See the ``tls_*`` "
+"options in etc/memcache.conf-sample"
+
+msgid ""
+"Added the ability to read options from object-server.conf, similar to "
+"background daemons."
+msgstr ""
+"Added the ability to read options from object-server.conf, similar to "
+"background daemons."
+
+msgid "After upgrading, re-enable and restart the object-reconstructor."
+msgstr "After upgrading, re-enable and restart the object-reconstructor."
+
+msgid ""
+"All 416 responses will now include a Content-Range header with an "
+"unsatisfied-range value. This allows the caller to know the valid range "
+"request value for an object."
+msgstr ""
+"All 416 responses will now include a Content-Range header with an "
+"unsatisfied-range value. This allows the caller to know the valid range "
+"request value for an object."
+
+msgid ""
+"All background daemons now use the replication network. This allows better "
+"isolation between external, client-facing traffic and internal, background "
+"traffic. Note that during a rolling upgrade, replication servers may respond "
+"with ``405 Method Not Allowed``. To avoid this, operators should remove the "
+"config option ``replication_server = true`` from their replication servers; "
+"this will allow them to handle all request methods before upgrading."
+msgstr ""
+"All background daemons now use the replication network. This allows better "
+"isolation between external, client-facing traffic and internal, background "
+"traffic. Note that during a rolling upgrade, replication servers may respond "
+"with ``405 Method Not Allowed``. To avoid this, operators should remove the "
+"config option ``replication_server = true`` from their replication servers; "
+"this will allow them to handle all request methods before upgrading."
+
+msgid "Allow ``fallocate_reserve`` to be specified as a percentage."
+msgstr "Allow ``fallocate_reserve`` to be specified as a percentage."
+
+msgid "Allow direct_client users to overwrite the ``X-Timestamp`` header."
+msgstr "Allow direct_client users to overwrite the ``X-Timestamp`` header."
+
+msgid ""
+"Allow operators to pass either raw or URL-quoted paths to ``swift-get-"
+"nodes``. Notably, this allows ``swift-get-nodes`` to work with the reserved "
+"namespace used for object versioning."
+msgstr ""
+"Allow operators to pass either raw or URL-quoted paths to ``swift-get-"
+"nodes``. Notably, this allows ``swift-get-nodes`` to work with the reserved "
+"namespace used for object versioning."
+
+msgid "Allow proxy-logging middlewares to be configured more independently."
+msgstr "Allow proxy-logging middlewares to be configured more independently."
+
+msgid ""
+"Allow ratelimit to be placed multiple times in a proxy pipeline, such as "
+"both before s3api and auth (to handle swift requests without needing to make "
+"an auth decision) and after (to limit S3 requests)."
+msgstr ""
+"Allow ratelimit to be placed multiple times in a proxy pipeline, such as "
+"both before s3api and auth (to handle swift requests without needing to make "
+"an auth decision) and after (to limit S3 requests)."
+
+msgid ""
+"Allow static large object segments to be deleted asynchronously. Operators "
+"may opt into this new behavior by enabling the new ``allow_async_delete`` "
+"option in the ``[filter:slo]`` section in their proxy-server.conf. For more "
+"information, see `the documentation `__."
+msgstr ""
+"Allow static large object segments to be deleted asynchronously. Operators "
+"may opt into this new behaviour by enabling the new ``allow_async_delete`` "
+"option in the ``[filter:slo]`` section in their proxy-server.conf. For more "
+"information, see `the documentation `__."
+
+msgid "Allow the expirer to gracefully move past updating stale work items."
+msgstr "Allow the expirer to gracefully move past updating stale work items."
+
+msgid "Always set Swift processes to use UTC."
+msgstr "Always set Swift processes to use UTC."
+
+msgid ""
+"As a result, some errors that previously resulted in exit code 2 will now "
+"exit with code 1."
+msgstr ""
+"As a result, some errors that previously resulted in exit code 2 will now "
+"exit with code 1."
+
+msgid ""
+"Avoid upgrading liberasurecode until swift and liberasurecode better-support "
+"a rolling upgrade. Swift remains compatible with liberasurecode 1.5.0 and "
+"earlier."
+msgstr ""
+"Avoid upgrading liberasurecode until swift and liberasurecode better-support "
+"a rolling upgrade. Swift remains compatible with liberasurecode 1.5.0 and "
+"earlier."
+
+msgid "Background corruption-detection improvements"
+msgstr "Background corruption-detection improvements"
+
+msgid "Bug Fixes"
+msgstr "Bug Fixes"
+
+msgid "COPY now works with unicode account names."
+msgstr "COPY now works with Unicode account names."
+
+msgid "Cache all answers from nameservers in cname_lookup."
+msgstr "Cache all answers from nameservers in cname_lookup."
+
+msgid ""
+"Certain S3 API headers are now lower case as they would be coming from AWS."
+msgstr ""
+"Certain S3 API headers are now lower case as they would be coming from AWS."
+
+msgid ""
+"Change the behavior of the EC reconstructor to perform a fragment rebuild to "
+"a handoff node when a primary peer responds with 507 to the REPLICATE "
+"request. This changes EC to match the existing behavior of replication when "
+"drives fail. After a rebalance of EC rings (potentially removing unmounted/"
+"failed devices), it's most IO efficient to run in handoffs_only mode to "
+"avoid unnecessary rebuilds."
+msgstr ""
+"Change the behaviour of the EC reconstructor to perform a fragment rebuild "
+"to a handoff node when a primary peer responds with 507 to the REPLICATE "
+"request. This changes EC to match the existing behaviour of replication when "
+"drives fail. After a rebalance of EC rings (potentially removing unmounted/"
+"failed devices), it's most IO efficient to run in handoffs_only mode to "
+"avoid unnecessary rebuilds."
+
+msgid ""
+"Changed where liberasurecode-devel for CentOS 7 is referenced and installed "
+"as a dependency."
+msgstr ""
+"Changed where liberasurecode-devel for CentOS 7 is referenced and installed "
+"as a dependency."
+
+msgid "Cleaned up logged tracebacks when talking to memcached servers."
+msgstr "Cleaned up logged tracebacks when talking to memcached servers."
+
+msgid ""
+"Closed a bug where ssync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalizing the write. Also, erasure coded "
+"fragment metadata will now be validated on read requests and, if bad data is "
+"found, the fragment will be quarantined."
+msgstr ""
+"Closed a bug where ssync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalising the write. Also, erasure coded "
+"fragment metadata will now be validated on read requests and, if bad data is "
+"found, the fragment will be quarantined."
+
+msgid ""
+"Closed a bug where ssync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalizing the write. Also, erasure coded "
+"fragment metadata will now be validated when read and, if bad data is found, "
+"the fragment will be quarantined."
+msgstr ""
+"Closed a bug where sync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalising the write. Also, erasure coded "
+"fragment metadata will now be validated when read and, if bad data is found, "
+"the fragment will be quarantined."
+
+msgid "CompleteMultipartUpload requests may now be safely retried."
+msgstr "CompleteMultipartUpload requests may now be safely retried."
+
+msgid "CompleteMultipartUpload requests with a ``Content-MD5`` now work."
+msgstr "CompleteMultipartUpload requests with a ``Content-MD5`` now work."
+
+msgid ""
+"Composite rings can be used for explicit replica placement and \"replicated "
+"EC\" for global erasure codes policies."
+msgstr ""
+"Composite rings can be used for explicit replica placement and \"replicated "
+"EC\" for global erasure codes policies."
+
+msgid ""
+"Composite rings support 'cooperative' rebalance which means that during "
+"rebalance all component rings will be consulted before a partition is moved "
+"in any component ring. This avoids the same partition being simultaneously "
+"moved in multiple components."
+msgstr ""
+"Composite rings support 'cooperative' rebalance which means that during "
+"rebalance all component rings will be consulted before a partition is moved "
+"in any component ring. This avoids the same partition being simultaneously "
+"moved in multiple components."
+
+msgid ""
+"Container metadata related to sharding are now removed when no longer needed."
+msgstr ""
+"Container metadata related to sharding are now removed when no longer needed."
+
+msgid ""
+"Container read ACLs now work with object versioning. This only allows access "
+"to the most-recent version via an unversioned URL."
+msgstr ""
+"Container read ACLs now work with object versioning. This only allows access "
+"to the most-recent version via an unversioned URL."
+
+msgid ""
+"Container sync can now copy SLOs more efficiently by allowing the manifest "
+"to be synced before all of the referenced segments. This fixes a bug where "
+"container sync would not copy SLO manifests."
+msgstr ""
+"Container sync can now copy SLOs more efficiently by allowing the manifest "
+"to be synced before all of the referenced segments. This fixes a bug where "
+"container sync would not copy SLO manifests."
+
+msgid ""
+"Container sync now synchronizes static symlinks in a way similar to static "
+"large objects."
+msgstr ""
+"Container sync now synchronizes static symlinks in a way similar to static "
+"large objects."
+
+msgid "Correctly handle deleted files with if-none-match requests."
+msgstr "Correctly handle deleted files with if-none-match requests."
+
+msgid ""
+"Correctly send 412 Precondition Failed if a user sends an invalid copy "
+"destination. Previously Swift would send a 500 Internal Server Error."
+msgstr ""
+"Correctly send 412 Precondition Failed if a user sends an invalid copy "
+"destination. Previously Swift would send a 500 Internal Server Error."
+
+msgid "Correctness improvements"
+msgstr "Correctness improvements"
+
+msgid "Critical Issues"
+msgstr "Critical Issues"
+
+msgid ""
+"Cross-account symlinks now store correct account information in container "
+"listings. This was previously fixed in 2.22.0."
+msgstr ""
+"Cross-account symlinks now store correct account information in container "
+"listings. This was previously fixed in 2.22.0."
+
+msgid "Current (Unreleased) Release Notes"
+msgstr "Current (Unreleased) Release Notes"
+
+msgid ""
+"Currently the default is still only one process, and no workers. Set "
+"``reconstructor_workers`` in the ``[object-reconstructor]`` section to some "
+"whole number <= the number of devices on a node to get that many "
+"reconstructor workers."
+msgstr ""
+"Currently the default is still only one process, and no workers. Set "
+"``reconstructor_workers`` in the ``[object-reconstructor]`` section to some "
+"whole number <= the number of devices on a node to get that many "
+"reconstructor workers."
+
+msgid "Daemons using InternalClient can now be properly killed with SIGTERM."
+msgstr "Daemons using InternalClient can now be properly killed with SIGTERM."
+
+msgid "Data encryption updates"
+msgstr "Data encryption updates"
+
+msgid ""
+"Deleted shard containers are no longer considered root containers. This "
+"prevents unnecessary sharding audit failures and allows the deleted shard "
+"database to actually be unlinked."
+msgstr ""
+"Deleted shard containers are no longer considered root containers. This "
+"prevents unnecessary sharding audit failures and allows the deleted shard "
+"database to actually be unlinked."
+
+msgid ""
+"Deleting an expiring object will now cause less work in the system. The "
+"number of async pending files written has been reduced for all objects and "
+"greatly reduced for erasure-coded objects. This dramatically reduces the "
+"burden on container servers."
+msgstr ""
+"Deleting an expiring object will now cause less work in the system. The "
+"number of async pending files written has been reduced for all objects and "
+"greatly reduced for erasure-coded objects. This dramatically reduces the "
+"burden on container servers."
+
+msgid ""
+"Deployers with clusters that relied on the old implicit default location of "
+"\"US\" should explicitly set ``location = US`` in the ``[filter:s3api]`` "
+"section of proxy-server.conf before upgrading."
+msgstr ""
+"Deployers with clusters that relied on the old implicit default location of "
+"\"US\" should explicitly set ``location = US`` in the ``[filter:s3api]`` "
+"section of proxy-server.conf before upgrading."
+
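+# Illustrative sketch for the entry above, as it might appear in
+# proxy-server.conf (section name and option value are quoted from the note):
+#
+#   [filter:s3api]
+#   location = US
+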
+msgid ""
+"Deprecate swift-temp-url and call python-swiftclient's implementation "
+"instead. This adds python-swiftclient as an optional dependency of Swift."
+msgstr ""
+"Deprecate swift-temp-url and call python-swiftclient's implementation "
+"instead. This adds python-swiftclient as an optional dependency of Swift."
+
+msgid "Deprecation Notes"
+msgstr "Deprecation Notes"
+
+msgid "Detect and remove invalid entries from ``hashes.pkl``"
+msgstr "Detect and remove invalid entries from ``hashes.pkl``"
+
+msgid ""
+"Device region and zone can now be changed via ``swift-ring-builder``. Note "
+"that this may cause a lot of data movement on the next rebalance as the "
+"builder tries to reach full dispersion."
+msgstr ""
+"Device region and zone can now be changed via ``swift-ring-builder``. Note "
+"that this may cause a lot of data movement on the next rebalance as the "
+"builder tries to reach full dispersion."
+
+msgid "Disallow X-Delete-At header values equal to the X-Timestamp header."
+msgstr "Disallow X-Delete-At header values equal to the X-Timestamp header."
+
+msgid "Display crypto data/metadata details in swift-object-info."
+msgstr "Display crypto data/metadata details in swift-object-info."
+
+msgid "Display more info on empty rings."
+msgstr "Display more info on empty rings."
+
+msgid "Do not follow CNAME when host is in storage_domain."
+msgstr "Do not follow CNAME when host is in storage_domain."
+
+msgid "Don't inject shard ranges when user quits."
+msgstr "Don't inject shard ranges when user quits."
+
+msgid "Drop support for auth-server from common/manager.py and `swift-init`."
+msgstr "Drop support for auth-server from common/manager.py and `swift-init`."
+
+msgid ""
+"During rebalances, clients should no longer get 404s for data that exists "
+"but whose replicas are overloaded."
+msgstr ""
+"During rebalances, clients should no longer get 404s for data that exists "
+"but whose replicas are overloaded."
+
+msgid "EC Fragment Duplication - Foundational Global EC Cluster Support."
+msgstr "EC Fragment Duplication - Foundational Global EC Cluster Support."
+
+msgid ""
+"Empty container databases (such as might be created on handoffs) now shard "
+"much more quickly."
+msgstr ""
+"Empty container databases (such as might be created on handoffs) now shard "
+"much more quickly."
+
+msgid ""
+"Enable cluster-wide CORS Expose-Headers setting via \"cors_expose_headers\"."
+msgstr ""
+"Enable cluster-wide CORS Expose-Headers setting via \"cors_expose_headers\"."
+
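+# Illustrative sketch for the entry above (the header names shown are
+# examples; placing the option in the [DEFAULT] section of proxy-server.conf
+# is an assumption):
+#
+#   [DEFAULT]
+#   cors_expose_headers = X-Custom-Header, X-Other-Header
+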
+msgid "Enabled versioned writes on Dynamic Large Objects (DLOs)."
+msgstr "Enabled versioned writes on Dynamic Large Objects (DLOs)."
+
+msgid ""
+"Ensure update of the container by object-updater, removing a rare "
+"possibility that objects would never be added to a container listing."
+msgstr ""
+"Ensure update of the container by object-updater, removing a rare "
+"possibility that objects would never be added to a container listing."
+
+msgid ""
+"Erasure code GET performance has been significantly improved in clusters "
+"that are not completely healthy."
+msgstr ""
+"Erasure code GET performance has been significantly improved in clusters "
+"that are not completely healthy."
+
+msgid ""
+"Erasure code reconstruction handles moving data from handoff nodes better. "
+"Instead of moving the data to another handoff, it waits until it can be "
+"moved to a primary node."
+msgstr ""
+"Erasure code reconstruction handles moving data from hand-off nodes better. "
+"Instead of moving the data to another hand-off, it waits until it can be "
+"moved to a primary node."
+
+msgid ""
+"Erasure-coded storage policies using ``isa_l_rs_vand`` and ``nparity`` >= 5 "
+"must be configured as deprecated, preventing any new containers from being "
+"created with such a policy. This configuration is known to harm data "
+"durability. Any data in such policies should be migrated to a new policy. "
+"See See `Launchpad bug 1639691 `__ for more information."
+msgstr ""
+"Erasure-coded storage policies using ``isa_l_rs_vand`` and ``nparity`` >= 5 "
+"must be configured as deprecated, preventing any new containers from being "
+"created with such a policy. This configuration is known to harm data "
+"durability. Any data in such policies should be migrated to a new policy. "
+"See See `Launchpad bug 1639691 `__ for more information."
+
+msgid ""
+"Errors encountered while validating static symlink targets no longer cause "
+"BadResponseLength errors in the proxy-server."
+msgstr ""
+"Errors encountered while validating static symlink targets no longer cause "
+"BadResponseLength errors in the proxy-server."
+
+msgid ""
+"Errors encountered while validating static symlink targets no longer cause "
+"``BadResponseLength`` errors in the proxy-server."
+msgstr ""
+"Errors encountered while validating static symlink targets no longer cause "
+"``BadResponseLength`` errors in the proxy-server."
+
+msgid ""
+"Experimental support for Python 3.6 and 3.7 is now available. Note that this "
+"requires ``eventlet>=0.25.0``. All unit tests pass, and running functional "
+"tests under Python 2 will pass against services running under Python 3. "
+"Expect full support in the next minor release."
+msgstr ""
+"Experimental support for Python 3.6 and 3.7 is now available. Note that this "
+"requires ``eventlet>=0.25.0``. All unit tests pass, and running functional "
+"tests under Python 2 will pass against services running under Python 3. "
+"Expect full support in the next minor release."
+
+msgid ""
+"Extend concurrent reads to erasure coded policies. Previously, the options "
+"``concurrent_gets`` and ``concurrency_timeout`` only applied to replicated "
+"policies."
+msgstr ""
+"Extend concurrent reads to erasure coded policies. Previously, the options "
+"``concurrent_gets`` and ``concurrency_timeout`` only applied to replicated "
+"policies."
+
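+# Illustrative sketch for the entry above (option names are quoted from the
+# note; the [app:proxy-server] section and the values shown are assumptions):
+#
+#   [app:proxy-server]
+#   concurrent_gets = on
+#   concurrency_timeout = 0.5
+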
+msgid "Fix SLO delete for accounts with non-ASCII names."
+msgstr "Fix SLO delete for accounts with non-ASCII names."
+
+msgid ""
+"Fix a proxy-server error when retrieving erasure coded data when there are "
+"durable fragments but not enough to reconstruct."
+msgstr ""
+"Fix a proxy-server error when retrieving erasure coded data when there are "
+"durable fragments but not enough to reconstruct."
+
+msgid "Fix an error in the proxy server when finalizing data."
+msgstr "Fix an error in the proxy server when finalising data."
+
+msgid ""
+"Fixed 500 from cname_lookup middleware. Previously, if the looked-up domain "
+"was used by domain_remap to update the request path, the server would "
+"respond Internal Error."
+msgstr ""
+"Fixed 500 from cname_lookup middleware. Previously, if the looked-up domain "
+"was used by domain_remap to update the request path, the server would "
+"respond Internal Error."
+
+msgid ""
+"Fixed UnicodeDecodeError in the object reconstructor that would prevent "
+"objects with non-ascii names from being reconstructed and caused the "
+"reconstructor process to hang."
+msgstr ""
+"Fixed UnicodeDecodeError in the object reconstructor that would prevent "
+"objects with non-ASCII names from being reconstructed and caused the "
+"reconstructor process to hang."
+
+msgid ""
+"Fixed XML responses (eg on bulk extractions and SLO upload failures) to be "
+"more correct. The enclosing \"delete\" tag was removed where it doesn't make "
+"sense and replaced with \"extract\" or \"upload\" depending on the context."
+msgstr ""
+"Fixed XML responses (e.g. on bulk extractions and SLO upload failures) to be "
+"more correct. The enclosing \"delete\" tag was removed where it doesn't make "
+"sense and replaced with \"extract\" or \"upload\" depending on the context."
+
+msgid "Fixed ``rsync`` output parsing."
+msgstr "Fixed ``rsync`` output parsing."
+
+msgid "Fixed a bug in domain_remap when obj starts/ends with slash."
+msgstr "Fixed a bug in domain_remap when obj starts/ends with slash."
+
+msgid ""
+"Fixed a bug in how Swift uses eventlet that was exposed under high "
+"concurrency."
+msgstr ""
+"Fixed a bug in how Swift uses eventlet that was exposed under high "
+"concurrency."
+
+msgid ""
+"Fixed a bug in the EC reconstructor where an unsuccessful sync would cause "
+"extra disk I/O load on the remote server. Now the extra checking work is "
+"only requested if the sync request was successful."
+msgstr ""
+"Fixed a bug in the EC reconstructor where an unsuccessful sync would cause "
+"extra disk I/O load on the remote server. Now the extra checking work is "
+"only requested if the sync request was successful."
+
+msgid ""
+"Fixed a bug in the new object versioning API that would cause more than "
+"``limit`` results to be returned when listing."
+msgstr ""
+"Fixed a bug in the new object versioning API that would cause more than "
+"``limit`` results to be returned when listing."
+
+msgid ""
+"Fixed a bug introduced in 2.15.0 where the object reconstructor would exit "
+"with a traceback if no EC policy was configured."
+msgstr ""
+"Fixed a bug introduced in 2.15.0 where the object reconstructor would exit "
+"with a traceback if no EC policy was configured."
+
+msgid "Fixed a bug where SSYNC would fail to replicate unexpired object."
+msgstr "Fixed a bug where SSYNC would fail to replicate unexpired object."
+
+msgid ""
+"Fixed a bug where a container listing delimiter wouldn't work with "
+"encryption."
+msgstr ""
+"Fixed a bug where a container listing delimiter wouldn't work with "
+"encryption."
+
+msgid ""
+"Fixed a bug where an SLO download with a range request may have resulted in "
+"a 5xx series response."
+msgstr ""
+"Fixed a bug where an SLO download with a range request may have resulted in "
+"a 5xx series response."
+
+msgid ""
+"Fixed a bug where encryption would store the incorrect key metadata if the "
+"object name starts with a slash."
+msgstr ""
+"Fixed a bug where encryption would store the incorrect key metadata if the "
+"object name starts with a slash."
+
+msgid ""
+"Fixed a bug where some headers weren't being copied correctly in a COPY "
+"request."
+msgstr ""
+"Fixed a bug where some headers weren't being copied correctly in a COPY "
+"request."
+
+msgid "Fixed a bug where some tombstone files might never be reclaimed."
+msgstr "Fixed a bug where some tombstone files might never be reclaimed."
+
+msgid ""
+"Fixed a bug where the ring builder would not allow removal of a device when "
+"min_part_seconds_left was greater than zero."
+msgstr ""
+"Fixed a bug where the ring builder would not allow removal of a device when "
+"min_part_seconds_left was greater than zero."
+
+msgid ""
+"Fixed a bug where zero-byte PUTs would not work properly with \"If-None-"
+"Match: \\*\" conditional requests."
+msgstr ""
+"Fixed a bug where zero-byte PUTs would not work properly with \"If-None-"
+"Match: \\*\" conditional requests."
+
+msgid ""
+"Fixed a cache invalidation issue related to GET and PUT requests to "
+"containers that would occasionally cause object PUTs to a container to 404 "
+"after the container had been successfully created."
+msgstr ""
+"Fixed a cache invalidation issue related to GET and PUT requests to "
+"containers that would occasionally cause object PUTs to a container to 404 "
+"after the container had been successfully created."
+
+msgid "Fixed a few areas where the ``swiftdir`` option was not respected."
+msgstr "Fixed a few areas where the ``swiftdir`` option was not respected."
+
+msgid ""
+"Fixed a race condition in updating hashes.pkl where a partition suffix "
+"invalidation may have been skipped."
+msgstr ""
+"Fixed a race condition in updating hashes.pkl where a partition suffix "
+"invalidation may have been skipped."
+
+msgid "Fixed a rare infinite loop in `swift-ring-builder` while placing parts."
+msgstr ""
+"Fixed a rare infinite loop in `swift-ring-builder` while placing parts."
+
+msgid ""
+"Fixed a rare issue where multiple backend timeouts could result in bad data "
+"being returned to the client."
+msgstr ""
+"Fixed a rare issue where multiple backend timeouts could result in bad data "
+"being returned to the client."
+
+msgid "Fixed a socket leak in copy middleware when a large object was copied."
+msgstr "Fixed a socket leak in copy middleware when a large object was copied."
+
+msgid ""
+"Fixed an error when reading encrypted data that was written while running "
+"Python 2 for a path that includes non-ASCII characters."
+msgstr ""
+"Fixed an error when reading encrypted data that was written while running "
+"Python 2 for a path that includes non-ASCII characters."
+
+msgid ""
+"Fixed an issue in COPY where concurrent requests may have copied the wrong "
+"data."
+msgstr ""
+"Fixed an issue in COPY where concurrent requests may have copied the wrong "
+"data."
+
+msgid ""
+"Fixed an issue that caused Delete Multiple Objects requests with large "
+"bodies to 400. This was previously fixed in 2.20.0."
+msgstr ""
+"Fixed an issue that caused Delete Multiple Objects requests with large "
+"bodies to 400. This was previously fixed in 2.20.0."
+
+msgid ""
+"Fixed an issue when reading or writing objects with a content-type like "
+"``message/*``. Previously, Swift would fail to respond."
+msgstr ""
+"Fixed an issue when reading or writing objects with a content-type like "
+"``message/*``. Previously, Swift would fail to respond."
+
+msgid ""
+"Fixed an issue where S3 API v4 signatures would not be validated against the "
+"body of the request, allowing a replay attack if request headers were "
+"captured by a malicious third party."
+msgstr ""
+"Fixed an issue where S3 API v4 signatures would not be validated against the "
+"body of the request, allowing a replay attack if request headers were "
+"captured by a malicious third party."
+
+msgid ""
+"Fixed an issue where a failed drive could prevent the container sharder from "
+"making progress."
+msgstr ""
+"Fixed an issue where a failed drive could prevent the container sharder from "
+"making progress."
+
+msgid ""
+"Fixed an issue where an object server failure during a client download could "
+"leave an open socket between the proxy and client."
+msgstr ""
+"Fixed an issue where an object server failure during a client download could "
+"leave an open socket between the proxy and client."
+
+msgid ""
+"Fixed an issue where background consistency daemon child processes would "
+"deadlock waiting on the same file descriptor."
+msgstr ""
+"Fixed an issue where background consistency daemon child processes would "
+"deadlock waiting on the same file descriptor."
+
+msgid ""
+"Fixed an issue where deleted EC objects didn't have their on-disk "
+"directories cleaned up. This would cause extra resource usage on the object "
+"servers."
+msgstr ""
+"Fixed an issue where deleted EC objects didn't have their on-disk "
+"directories cleaned up. This would cause extra resource usage on the object "
+"servers."
+
+msgid ""
+"Fixed an issue where multipart uploads with the S3 API would sometimes "
+"report an error despite all segments being upload successfully."
+msgstr ""
+"Fixed an issue where multipart uploads with the S3 API would sometimes "
+"report an error despite all segments being upload successfully."
+
+msgid ""
+"Fixed an issue where non-ASCII Keystone EC2 credentials would not get mapped "
+"to the correct account. This was previously fixed in 2.20.0."
+msgstr ""
+"Fixed an issue where non-ASCII Keystone EC2 credentials would not get mapped "
+"to the correct account. This was previously fixed in 2.20.0."
+
+msgid ""
+"Fixed an issue where v4 signatures would not be validated against the body "
+"of the request, allowing a replay attack if request headers were captured by "
+"a malicious third party. Note that unsigned payloads still function normally."
+msgstr ""
+"Fixed an issue where v4 signatures would not be validated against the body "
+"of the request, allowing a replay attack if request headers were captured by "
+"a malicious third party. Note that unsigned payloads still function normally."
+
+msgid ""
+"Fixed an issue with SSYNC requests to ensure that only one request can be "
+"running on a partition at a time."
+msgstr ""
+"Fixed an issue with SSYNC requests to ensure that only one request can be "
+"running on a partition at a time."
+
+msgid ""
+"Fixed an issue with multi-region EC policies that caused the EC "
+"reconstructor to constantly attempt cross-region rebuild traffic."
+msgstr ""
+"Fixed an issue with multi-region EC policies that caused the EC "
+"reconstructor to constantly attempt cross-region rebuild traffic."
+
+msgid "Fixed deadlock when logging from a tpool thread."
+msgstr "Fixed deadlock when logging from a tpool thread."
+
+msgid ""
+"Fixed deadlock when logging from a tpool thread. The object server runs "
+"certain IO-intensive methods outside the main pthread for performance. "
+"Previously, if one of those methods tried to log, this can cause a crash "
+"that eventually leads to an object server with hundreds or thousands of "
+"greenthreads, all deadlocked. The fix is to use a mutex that works across "
+"different greenlets and different pthreads."
+msgstr ""
+"Fixed deadlock when logging from a tpool thread. The object server runs "
+"certain IO-intensive methods outside the main pthread for performance. "
+"Previously, if one of those methods tried to log, this can cause a crash "
+"that eventually leads to an object server with hundreds or thousands of "
+"greenthreads, all deadlocked. The fix is to use a mutex that works across "
+"different greenlets and different pthreads."
+
+msgid ""
+"Fixed encoding issue in ssync where a mix of ascii and non-ascii metadata "
+"values would cause an error."
+msgstr ""
+"Fixed encoding issue in ssync where a mix of ASCII and non-ASCII metadata "
+"values would cause an error."
+
+msgid ""
+"Fixed error where a container drive error resulted in double space usage on "
+"rest drives. When drive with container or account database is unmounted, the "
+"bug would create handoff replicas on all remaining drives, increasing the "
+"drive space used and filling the cluster."
+msgstr ""
+"Fixed error where a container drive error resulted in double space usage on "
+"rest drives. When drive with container or account database is unmounted, the "
+"bug would create hand-off replicas on all remaining drives, increasing the "
+"drive space used and filling the cluster."
+
+msgid ""
+"Fixed issue where bulk requests using xml and expect 100-continue would "
+"return a malformed HTTP response."
+msgstr ""
+"Fixed issue where bulk requests using XML and expect 100-continue would "
+"return a malformed HTTP response."
+
+msgid "Fixed listings for sharded containers."
+msgstr "Fixed listings for sharded containers."
+
+msgid "Fixed non-ASCII account metadata handling."
+msgstr "Fixed non-ASCII account metadata handling."
+
+msgid ""
+"Fixed non-deterministic suffix updates in hashes.pkl where a partition may "
+"be updated much less often than expected."
+msgstr ""
+"Fixed non-deterministic suffix updates in hashes.pkl where a partition may "
+"be updated much less often than expected."
+
+msgid "Fixed rare socket leak on range requests to erasure-coded objects."
+msgstr "Fixed rare socket leak on range requests to erasure-coded objects."
+
+msgid ""
+"Fixed regression in consolidate_hashes that occured when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+msgstr ""
+"Fixed regression in consolidate_hashes that occurred when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+
+msgid ""
+"Fixed regression in consolidate_hashes that occurred when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+msgstr ""
+"Fixed regression in consolidate_hashes that occurred when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+
+msgid "Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK."
+msgstr "Fixed some SignatureDoesNotMatch errors when using the AWS .NET SDK."
+
+msgid "Fixed some minor test compatibility issues."
+msgstr "Fixed some minor test compatibility issues."
+
+msgid "Fixed some title-casing of headers."
+msgstr "Fixed some title-casing of headers."
+
+msgid "Fixed the KeyError message when auditor finds an expired object."
+msgstr "Fixed the KeyError message when auditor finds an expired object."
+
+msgid "Fixed the stats calculation in the erasure code reconstructor."
+msgstr "Fixed the stats calculation in the erasure code reconstructor."
+
+msgid "Fixed time skew when using X-Delete-After."
+msgstr "Fixed time skew when using X-Delete-After."
+
+msgid ""
+"Fixed using ``swift-ring-builder set_weight`` with more than one device."
+msgstr ""
+"Fixed using ``swift-ring-builder set_weight`` with more than one device."
+
+msgid "Fixed v1 listings that end with a non-ASCII object name."
+msgstr "Fixed v1 listings that end with a non-ASCII object name."
+
+msgid ""
+"For further information see the `docs `__"
+msgstr ""
+"For further information see the `docs `__"
+
+msgid ""
+"For new multipart-uploads via the S3 API, the ETag that is stored will be "
+"calculated in the same way that AWS uses. This ETag will be used in GET/HEAD "
+"responses, bucket listings, and conditional requests via the S3 API. "
+"Accessing the same object via the Swift API will use the SLO Etag; however, "
+"in JSON container listings the multipart upload etag will be exposed in a "
+"new \"s3_etag\" key. Previously, some S3 clients would complain about "
+"download corruption when the ETag did not have a '-'."
+msgstr ""
+"For new multipart-uploads via the S3 API, the ETag that is stored will be "
+"calculated in the same way that AWS uses. This ETag will be used in GET/HEAD "
+"responses, bucket listings, and conditional requests via the S3 API. "
+"Accessing the same object via the Swift API will use the SLO Etag; however, "
+"in JSON container listings the multipart upload etag will be exposed in a "
+"new \"s3_etag\" key. Previously, some S3 clients would complain about "
+"download corruption when the ETag did not have a '-'."
+
+msgid "Fractional replicas are no longer allowed for erasure code policies."
+msgstr "Fractional replicas are no longer allowed for erasure code policies."
+
+msgid ""
+"GET and HEAD requests to a symlink will operate on the referenced object and "
+"require appropriate permission in the target container. DELETE and PUT "
+"requests will operate on the symlink object itself. POST requests are not "
+"forwarded to the referenced object. POST requests sent to a symlink will "
+"result in a 307 Temporary Redirect response."
+msgstr ""
+"GET and HEAD requests to a symlink will operate on the referenced object and "
+"require appropriate permission in the target container. DELETE and PUT "
+"requests will operate on the symlink object itself. POST requests are not "
+"forwarded to the referenced object. POST requests sent to a symlink will "
+"result in a 307 Temporary Redirect response."
+
+msgid ""
+"Getting an SLO manifest with ``?format=raw`` now responds with an ETag that "
+"matches the MD5 of the generated body rather than the MD5 of the manifest "
+"stored on disk."
+msgstr ""
+"Getting an SLO manifest with ``?format=raw`` now responds with an ETag that "
+"matches the MD5 of the generated body rather than the MD5 of the manifest "
+"stored on disk."
+
+msgid ""
+"Hashes are no longer invalidated after a successful ssync; they were already "
+"invalidated during the data transfer."
+msgstr ""
+"Hashes are no longer invalidated after a successful ssync; they were already "
+"invalidated during the data transfer."
+
+msgid "I/O priority is now supported on AArch64 architecture."
+msgstr "I/O priority is now supported on AArch64 architecture."
+
+msgid ""
+"If a proxy server is configured to autocreate accounts and the account "
+"create fails, it will now return a server error (500) instead of Not Found "
+"(404)."
+msgstr ""
+"If a proxy server is configured to autocreate accounts and the account "
+"create fails, it will now return a server error (500) instead of Not Found "
+"(404)."
+
+msgid ""
+"If proxy and object layers can be upgraded independently and proxies can be "
+"upgraded quickly:"
+msgstr ""
+"If proxy and object layers can be upgraded independently and proxies can be "
+"upgraded quickly:"
+
+msgid "If running Swift under Python 3, ``eventlet`` must be at least 0.25.0."
+msgstr "If running Swift under Python 3, ``eventlet`` must be at least 0.25.0."
+
+msgid ""
+"If the sharder encounters shard range gaps while cleaving, it will now log "
+"an error and halt sharding progress. Previously, rows may not have been "
+"moved properly, leading to data loss."
+msgstr ""
+"If the sharder encounters shard range gaps while cleaving, it will now log "
+"an error and halt sharding progress. Previously, rows may not have been "
+"moved properly, leading to data loss."
+
+msgid ""
+"If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set "
+"``meta_version_to_write = 1`` in your keymaster configuration *prior* to "
+"upgrading."
+msgstr ""
+"If upgrading from Swift 2.20.0 or Swift 2.19.1 or earlier, set "
+"``meta_version_to_write = 1`` in your keymaster configuration *prior* to "
+"upgrading."
+
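+# Illustrative sketch for the entry above (the option is quoted from the
+# note; the [filter:keymaster] section name in proxy-server.conf is an
+# assumption about where the keymaster is configured):
+#
+#   [filter:keymaster]
+#   meta_version_to_write = 1
+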
+msgid ""
+"If using erasure coding with ISA-L in rs_vand mode and 5 or more parity "
+"fragments, Swift will emit a warning. This is a configuration that is known "
+"to harm data durability. In a future release, this warning will be upgraded "
+"to an error unless the policy is marked as deprecated. All data in an "
+"erasure code storage policy using isa_l_rs_vand with 5 or more parity should "
+"be migrated as soon as possible. Please see https://bugs.launchpad.net/swift/"
+"+bug/1639691 for more information."
+msgstr ""
+"If using erasure coding with ISA-L in rs_vand mode and 5 or more parity "
+"fragments, Swift will emit a warning. This is a configuration that is known "
+"to harm data durability. In a future release, this warning will be upgraded "
+"to an error unless the policy is marked as deprecated. All data in an "
+"erasure code storage policy using isa_l_rs_vand with 5 or more parity should "
+"be migrated as soon as possible. Please see https://bugs.launchpad.net/swift/"
+"+bug/1639691 for more information."
+
+msgid "If you have a config file like this::"
+msgstr "If you have a config file like this::"
+
+msgid "If you upgrade and roll back, you must delete all `hashes.pkl` files."
+msgstr "If you upgrade and roll back, you must delete all `hashes.pkl` files."
+
+msgid "If you want updates to be processed exactly as before, do this::"
+msgstr "If you want updates to be processed exactly as before, do this::"
+
+msgid ""
+"If you've been testing Swift on Python 3, upgrade at your earliest "
+"convenience."
+msgstr ""
+"If you've been testing Swift on Python 3, upgrade at your earliest "
+"convenience."
+
+msgid ""
+"If your users can tolerate it, consider a read-only rolling upgrade. Before "
+"upgrading, enable the `read-only middleware `__ cluster-wide to prevent new "
+"writes during the upgrade. Additionally, stop and disable the object-"
+"reconstructor as above. Upgrade normally, then disable the read-only "
+"middleware and re-enable and restart the object-reconstructor."
+msgstr ""
+"If your users can tolerate it, consider a read-only rolling upgrade. Before "
+"upgrading, enable the `read-only middleware `__ cluster-wide to prevent new "
+"writes during the upgrade. Additionally, stop and disable the object-"
+"reconstructor as above. Upgrade normally, then disable the read-only "
+"middleware and re-enable and restart the object-reconstructor."
+
+msgid "Imported docs content from openstack-manuals project."
+msgstr "Imported docs content from openstack-manuals project."
+
+msgid "Improve performance when increasing partition power."
+msgstr "Improve performance when increasing partition power."
+
+msgid "Improved S3 API compatibility."
+msgstr "Improved S3 API compatibility."
+
+msgid ""
+"Improved ``object-updater`` stats logging. It now tells you all of its stats "
+"(successes, failures, quarantines due to bad pickles, unlinks, and errors), "
+"and it tells you incremental progress every five minutes. The logging at the "
+"end of a pass remains and has been expanded to also include all stats."
+msgstr ""
+"Improved ``object-updater`` stats logging. It now tells you all of its stats "
+"(successes, failures, quarantines due to bad pickles, unlinks, and errors), "
+"and it tells you incremental progress every five minutes. The logging at the "
+"end of a pass remains and has been expanded to also include all stats."
+
+msgid "Improved cache management for account and container responses."
+msgstr "Improved cache management for account and container responses."
+
+msgid "Improved compatibility with certain FIPS-mode-enabled systems."
+msgstr "Improved compatibility with certain FIPS-mode-enabled systems."
+
+msgid ""
+"Improved container-sharder stat reporting to reduce load on root container "
+"databases."
+msgstr ""
+"Improved container-sharder stat reporting to reduce load on root container "
+"databases."
+
+msgid ""
+"Improved container-sync performance when data has already been deleted or "
+"overwritten."
+msgstr ""
+"Improved container-sync performance when data has already been deleted or "
+"overwritten."
+
+msgid ""
+"Improved how containers reclaim deleted rows to reduce locking and object "
+"update throughput."
+msgstr ""
+"Improved how containers reclaim deleted rows to reduce locking and object "
+"update throughput."
+
+msgid ""
+"Improved logging and statsd metrics. Be aware that this will cause an "
+"increase in the proxy-logging statsd metrics emited for S3 responses. "
+"However, this should more accurately reflect the state of the system."
+msgstr ""
+"Improved logging and statsd metrics. Be aware that this will cause an "
+"increase in the proxy-logging statsd metrics emitted for S3 responses. "
+"However, this should more accurately reflect the state of the system."
+
+msgid ""
+"Improved performance by eliminating an unneeded directory structure hash."
+msgstr ""
+"Improved performance by eliminating an unneeded directory structure hash."
+
+msgid ""
+"Improved performance of sharded container listings when performing prefix "
+"listings."
+msgstr ""
+"Improved performance of sharded container listings when performing prefix "
+"listings."
+
+msgid ""
+"Improved proxy-server performance by reducing unnecessary locking, memory "
+"copies, and eventlet scheduling."
+msgstr ""
+"Improved proxy-server performance by reducing unnecessary locking, memory "
+"copies, and eventlet scheduling."
+
+msgid "Improved proxy-to-backend requests to be more RFC-compliant."
+msgstr "Improved proxy-to-backend requests to be more RFC-compliant."
+
+msgid "Improved quota-exceeded error messages."
+msgstr "Improved quota-exceeded error messages."
+
+msgid ""
+"Improved relinker progress logging, and started collecting progress "
+"information for swift-recon."
+msgstr ""
+"Improved relinker progress logging, and started collecting progress "
+"information for swift-recon."
+
+msgid ""
+"Improved safety during cleanup to ensure files have been relinked "
+"appropriately before unlinking."
+msgstr ""
+"Improved safety during cleanup to ensure files have been relinked "
+"appropriately before unlinking."
+
+msgid ""
+"Improved the error message when deleting a bucket that's ever had versioning "
+"enabled and still has versions in it."
+msgstr ""
+"Improved the error message when deleting a bucket that's ever had versioning "
+"enabled and still has versions in it."
+
+msgid ""
+"Improved the granularity of the ring dispersion metric so that small "
+"improvements after a rebalance can show changes in the dispersion number. "
+"Dispersion in existing and new rings can be recalculated using the new ``--"
+"recalculate`` option to ``swift-ring-builder``."
+msgstr ""
+"Improved the granularity of the ring dispersion metric so that small "
+"improvements after a rebalance can show changes in the dispersion number. "
+"Dispersion in existing and new rings can be recalculated using the new ``--"
+"recalculate`` option to ``swift-ring-builder``."
+
+msgid "Improvements in key parts of the consistency engine"
+msgstr "Improvements in key parts of the consistency engine"
+
+msgid ""
+"In SLO manifests, the `etag` and `size_bytes` keys are now fully optional "
+"and not required. Previously, the keys needed to exist but the values were "
+"optional. The only required key is `path`."
+msgstr ""
+"In SLO manifests, the `etag` and `size_bytes` keys are now fully optional "
+"and not required. Previously, the keys needed to exist but the values were "
+"optional. The only required key is `path`."
+
+msgid ""
+"In a rolling upgrade from liberasurecode 1.5.0 or earlier to 1.6.0 or later, "
+"object-servers may quarantine newly-written data, leading to availability "
+"issues or even data loss. See `bug 1886088 `__ for more information, including how to "
+"determine whether you are affected. Several mitigations are available to "
+"operators:"
+msgstr ""
+"In a rolling upgrade from liberasurecode 1.5.0 or earlier to 1.6.0 or later, "
+"object-servers may quarantine newly-written data, leading to availability "
+"issues or even data loss. See `bug 1886088 `__ for more information, including how to "
+"determine whether you are affected. Several mitigations are available to "
+"operators:"
+
+msgid ""
+"In the ratelimit middleware, account whitelist and blacklist settings have "
+"been deprecated and may be removed in a future release. When found, a "
+"deprecation message will be logged. Instead of these config file values, set "
+"X-Account-Sysmeta- Global-Write-Ratelimit:WHITELIST and X-Account-Sysmeta-"
+"Global- Write-Ratelimit:BLACKLIST on the particular accounts that need to be "
+"whitelisted or blacklisted. System metadata cannot be added or modified by "
+"standard clients. Use the internal client to set sysmeta."
+msgstr ""
+"In the ratelimit middleware, account whitelist and blacklist settings have "
+"been deprecated and may be removed in a future release. When found, a "
+"deprecation message will be logged. Instead of these config file values, set "
+"X-Account-Sysmeta- Global-Write-Ratelimit:WHITELIST and X-Account-Sysmeta-"
+"Global- Write-Ratelimit:BLACKLIST on the particular accounts that need to be "
+"whitelisted or blacklisted. System metadata cannot be added or modified by "
+"standard clients. Use the internal client to set sysmeta."
+
+msgid "Include ``Vary: Origin`` header when CORS responses vary by origin."
+msgstr "Include ``Vary: Origin`` header when CORS responses vary by origin."
+
+msgid ""
+"Include object sysmeta in POST responses. Sysmeta is still stripped from the "
+"response before being sent to the client, but this allows middleware to make "
+"use of the information."
+msgstr ""
+"Include object sysmeta in POST responses. Sysmeta is still stripped from the "
+"response before being sent to the client, but this allows middleware to make "
+"use of the information."
+
+msgid "Include received fragment index in reconstructor log warnings."
+msgstr "Include received fragment index in reconstructor log warnings."
+
+msgid ""
+"Instead of using a separate .durable file to indicate the durable status of "
+"an EC fragment archive, we rename the .data to include a durable marker in "
+"the filename. This saves one inode for every EC .data file. Existing ."
+"durable files will not be removed, and they will continue to work just fine."
+msgstr ""
+"Instead of using a separate .durable file to indicate the durable status of "
+"an EC fragment archive, we rename the .data to include a durable marker in "
+"the filename. This saves one inode for every EC .data file. Existing ."
+"durable files will not be removed, and they will continue to work just fine."
+
+msgid "Internal client no longer logs object DELETEs as status 499."
+msgstr "Internal client no longer logs object DELETEs as status 499."
+
+msgid ""
+"Internal clients now correctly use their configured ``User-Agent`` in "
+"backend requests, rather than only using it for logging."
+msgstr ""
+"Internal clients now correctly use their configured ``User-Agent`` in "
+"backend requests, rather than only using it for logging."
+
+msgid "Known Issues"
+msgstr "Known Issues"
+
+msgid "Large object reads log fewer client disconnects."
+msgstr "Large object reads log fewer client disconnects."
+
+msgid ""
+"Let clients request heartbeats during SLO PUTs by including the query "
+"parameter ``heartbeat=on``."
+msgstr ""
+"Let clients request heartbeats during SLO PUTs by including the query "
+"parameter ``heartbeat=on``."
+
+msgid ""
+"Listing containers in accounts with json or xml now includes a "
+"`last_modified` time. This does not change any on-disk data, but simply "
+"exposes the value to offer consistency with the object listings on "
+"containers."
+msgstr ""
+"Listing containers in accounts with JSON or XML now includes a "
+"`last_modified` time. This does not change any on-disk data, but simply "
+"exposes the value to offer consistency with the object listings on "
+"containers."
+
+msgid ""
+"Lock timeouts in the container updater are now logged at INFO level, not "
+"ERROR."
+msgstr ""
+"Lock timeouts in the container updater are now logged at INFO level, not "
+"ERROR."
+
+msgid "Log correct status code for conditional requests."
+msgstr "Log correct status code for conditional requests."
+
+msgid ""
+"Log deprecation warning for ``allow_versions`` in the container server "
+"config. Configure the ``versioned_writes`` middleware in the proxy server "
+"instead. This option will be ignored in a future release."
+msgstr ""
+"Log deprecation warning for ``allow_versions`` in the container server "
+"config. Configure the ``versioned_writes`` middleware in the proxy server "
+"instead. This option will be ignored in a future release."
+
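+# Illustrative sketch for the entry above (the middleware name comes from the
+# note; the exact filter section and ``allow_versioned_writes`` option in
+# proxy-server.conf are assumptions):
+#
+#   [filter:versioned_writes]
+#   use = egg:swift#versioned_writes
+#   allow_versioned_writes = true
+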
+msgid ""
+"Log deprecation warnings for ``run_pause``. This setting was deprecated in "
+"Swift 2.4.0 and is replaced by ``interval``. It may be removed in a future "
+"release."
+msgstr ""
+"Log deprecation warnings for ``run_pause``. This setting was deprecated in "
+"Swift 2.4.0 and is replaced by ``interval``. It may be removed in a future "
+"release."
+
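+# Illustrative sketch for the entry above (shown for an object-replicator
+# section as an assumption; the interval value is an example):
+#
+#   [object-replicator]
+#   # formerly: run_pause = 30
+#   interval = 30
+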
+msgid ""
+"Log formats are now more configurable and include support for anonymization. "
+"See the ``log_msg_template`` option in ``proxy-server.conf`` and `the Swift "
+"documentation `__ for more information."
+msgstr ""
+"Log formats are now more configurable and include support for anonymization. "
+"See the ``log_msg_template`` option in ``proxy-server.conf`` and `the Swift "
+"documentation `__ for more information."
+
+msgid "Log progress per partition when relinking/cleaning up."
+msgstr "Log progress per partition when relinking/cleaning up."
+
+msgid "Log the correct request type of a subrequest downstream of copy."
+msgstr "Log the correct request type of a sub-request downstream of copy."
+
+msgid "Logging improvements"
+msgstr "Logging improvements"
+
+msgid ""
+"Lower bounds of dependencies have been updated to reflect what is actually "
+"tested."
+msgstr ""
+"Lower bounds of dependencies have been updated to reflect what is actually "
+"tested."
+
+msgid ""
+"Make mount_check option usable in containerized environments by adding a "
+"check for an \".ismount\" file at the root directory of a device."
+msgstr ""
+"Make mount_check option usable in containerised environments by adding a "
+"check for an \".ismount\" file at the root directory of a device."
+
+msgid ""
+"Metrics are now emitted for a variety of S3 error responses, in the form "
+"``s3api..[.]``"
+msgstr ""
+"Metrics are now emitted for a variety of S3 error responses, in the form "
+"``s3api..[.]``"
+
+msgid ""
+"Metrics are now emitted for whether databases used for cleaving were created "
+"or already existed, allowing a better understanding of the reason for "
+"handoffs in the cluster."
+msgstr ""
+"Metrics are now emitted for whether databases used for cleaving were created "
+"or already existed, allowing a better understanding of the reason for "
+"handoffs in the cluster."
+
+msgid "Metrics improvements:"
+msgstr "Metrics improvements:"
+
+msgid "Mirror X-Trans-Id to X-Openstack-Request-Id."
+msgstr "Mirror X-Trans-Id to X-Openstack-Request-Id."
+
+msgid "Misplaced tombstone records are now properly cleaved."
+msgstr "Misplaced tombstone records are now properly cleaved."
+
+msgid ""
+"Misplaced-record stats are now also emitted to statsd. Previously, these "
+"were only available in logs."
+msgstr ""
+"Misplaced-record stats are now also emitted to statsd. Previously, these "
+"were only available in logs."
+
+msgid "More daemons now support systemd notify sockets."
+msgstr "More daemons now support systemd notify sockets."
+
+msgid ""
+"More headers are now copied from multi-part upload markers to their "
+"completed objects, including ``Content-Encoding``."
+msgstr ""
+"More headers are now copied from multi-part upload markers to their "
+"completed objects, including ``Content-Encoding``."
+
+msgid ""
+"More information is now synced to the fresh database when sharding. "
+"Previously, a database could lose the fact that it had been marked as "
+"deleted."
+msgstr ""
+"More information is now synced to the fresh database when sharding. "
+"Previously, a database could lose the fact that it had been marked as "
+"deleted."
+
+msgid ""
+"Move listing formatting out to a new proxy middleware named "
+"``listing_formats``. ``listing_formats`` should be just right of the first "
+"proxy-logging middleware, and left of most other middlewares. If it is not "
+"already present, it will be automatically inserted for you."
+msgstr ""
+"Move listing formatting out to a new proxy middleware named "
+"``listing_formats``. ``listing_formats`` should be just right of the first "
+"proxy-logging middleware, and left of most other middleware. If it is not "
+"already present, it will be automatically inserted for you."
+
+msgid "Moved Zuul v3 tox jobs into the Swift code repo."
+msgstr "Moved Zuul v3 tox jobs into the Swift code repo."
+
+msgid ""
+"Moved other-requirements.txt to bindep.txt. bindep.txt lists non-python "
+"dependencies of Swift."
+msgstr ""
+"Moved other-requirements.txt to bindep.txt. bindep.txt lists non-Python "
+"dependencies of Swift."
+
+msgid ""
+"Multi-character strings may now be used as delimiters in account and "
+"container listings."
+msgstr ""
+"Multi-character strings may now be used as delimiters in account and "
+"container listings."
+
+msgid ""
+"Multipart object segments are now actually deleted when the multipart object "
+"is deleted via the S3 API."
+msgstr ""
+"Multipart object segments are now actually deleted when the multipart object "
+"is deleted via the S3 API."
+
+msgid "Multipart upload parts may now be copied from other multipart uploads."
+msgstr "Multipart upload parts may now be copied from other multipart uploads."
+
+msgid ""
+"Multiple keymaster middlewares are now supported. This allows migration from "
+"one key provider to another."
+msgstr ""
+"Multiple keymaster middlewares are now supported. This allows migration from "
+"one key provider to another."
+
+msgid "New Features"
+msgstr "New Features"
+
+msgid ""
+"New buckets created via the S3 API will now store multi-part upload data in "
+"the same storage policy as other data rather than the cluster's default "
+"storage policy."
+msgstr ""
+"New buckets created via the S3 API will now store multi-part upload data in "
+"the same storage policy as other data rather than the cluster's default "
+"storage policy."
+
+msgid ""
+"New config variables to change the schedule priority and I/O scheduling "
+"class. Servers and daemons now understand `nice_priority`, `ionice_class`, "
+"and `ionice_priority` to schedule their relative importance. Please read "
+"http://docs.openstack.org/developer/swift/deployment_guide.html for full "
+"config details."
+msgstr ""
+"New config variables to change the schedule priority and I/O scheduling "
+"class. Servers and daemons now understand `nice_priority`, `ionice_class`, "
+"and `ionice_priority` to schedule their relative importance. Please read "
+"http://docs.openstack.org/developer/swift/deployment_guide.html for full "
+"config details."
+
+msgid "Newton Series Release Notes"
+msgstr "Newton Series Release Notes"
+
+msgid "Non-durable fragments can now be reverted from handoffs."
+msgstr "Non-durable fragments can now be reverted from handoffs."
+
+msgid ""
+"Note that ``secret_id`` values must remain unique across all keymasters in a "
+"given pipeline. If they are not unique, the right-most keymaster will take "
+"precedence."
+msgstr ""
+"Note that ``secret_id`` values must remain unique across all keymasters in a "
+"given pipeline. If they are not unique, the right-most keymaster will take "
+"precedence."
+
+msgid ""
+"Note that after writing EC data with Swift 2.11.0 or later, that data will "
+"not be accessible to earlier versions of Swift."
+msgstr ""
+"Note that after writing EC data with Swift 2.11.0 or later, that data will "
+"not be accessible to earlier versions of Swift."
+
+msgid ""
+"Note: if you have a custom middleware that makes account or container "
+"listings, it will only receive listings in JSON format."
+msgstr ""
+"Note: if you have a custom middleware that makes account or container "
+"listings, it will only receive listings in JSON format."
+
+msgid ""
+"Now Swift will use ``write_affinity_handoff_delete_count`` to define how "
+"many local handoff nodes should swift send request to get more candidates "
+"for the final response. The default value \"auto\" means Swift will "
+"calculate the number automatically based on the number of replicas and "
+"current cluster topology."
+msgstr ""
+"Now Swift will use ``write_affinity_handoff_delete_count`` to define how "
+"many local hand-off nodes should swift send request to get more candidates "
+"for the final response. The default value \"auto\" means Swift will "
+"calculate the number automatically based on the number of replicas and "
+"current cluster topology."
+
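+# Illustrative sketch for the entry above (the option name and "auto" default
+# are quoted from the note; the [app:proxy-server] section in
+# proxy-server.conf is an assumption):
+#
+#   [app:proxy-server]
+#   write_affinity_handoff_delete_count = auto
+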
+msgid "Now ``swift-recon-cron`` works with conf.d configs."
+msgstr "Now ``swift-recon-cron`` works with conf.d configs."
+
+msgid ""
+"O_TMPFILE support is now detected by attempting to use it instead of looking "
+"at the kernel version. This allows older kernels with backported patches to "
+"take advantage of the O_TMPFILE functionality."
+msgstr ""
+"O_TMPFILE support is now detected by attempting to use it instead of looking "
+"at the kernel version. This allows older kernels with backported patches to "
+"take advantage of the O_TMPFILE functionality."
+
+msgid ""
+"Object expiration respects the ``expiring_objects_container_divisor`` config "
+"option."
+msgstr ""
+"Object expiration respects the ``expiring_objects_container_divisor`` config "
+"option."
+
+msgid "Object expiry improvements"
+msgstr "Object expiry improvements"
+
+msgid ""
+"Object reconstructor logs are now prefixed with information about the "
+"specific worker process logging the message. This makes reading the logs and "
+"understanding the messages much simpler."
+msgstr ""
+"Object reconstructor logs are now prefixed with information about the "
+"specific worker process logging the message. This makes reading the logs and "
+"understanding the messages much simpler."
+
+msgid ""
+"Object versioning now supports a \"history\" mode in addition to the older "
+"\"stack\" mode. The difference is in how DELETE requests are handled. For "
+"full details, please read http://docs.openstack.org/developer/swift/"
+"overview_object_versioning.html."
+msgstr ""
+"Object versioning now supports a \"history\" mode in addition to the older "
+"\"stack\" mode. The difference is in how DELETE requests are handled. For "
+"full details, please read http://docs.openstack.org/developer/swift/"
+"overview_object_versioning.html."
+
+msgid ""
+"Object writes to a container whose existence cannot be verified now 503 "
+"instead of 404."
+msgstr ""
+"Object writes to a container whose existence cannot be verified now 503 "
+"instead of 404."
+
+msgid ""
+"Objects with an ``X-Delete-At`` value in the far future no longer cause "
+"backend server errors."
+msgstr ""
+"Objects with an ``X-Delete-At`` value in the far future no longer cause "
+"backend server errors."
+
+msgid "Ocata Series Release Notes"
+msgstr "Ocata Series Release Notes"
+
+msgid ""
+"On Python 3, certain S3 API headers are now lower case as they would be "
+"coming from AWS."
+msgstr ""
+"On Python 3, certain S3 API headers are now lower case as they would be "
+"coming from AWS."
+
+msgid ""
+"On Python 3, fixed a RecursionError in swift-dispersion-report when using "
+"TLS."
+msgstr ""
+"On Python 3, fixed a RecursionError in swift-dispersion-report when using "
+"TLS."
+
+msgid ""
+"On Python 3, fixed an issue when reading or writing objects with a content "
+"type like ``message/*``. Previously, Swift would fail to respond."
+msgstr ""
+"On Python 3, fixed an issue when reading or writing objects with a content "
+"type like ``message/*``. Previously, Swift would fail to respond."
+
+msgid ""
+"On Python 3, the KMS keymaster now works with secrets stored in Barbican "
+"with a ``text/plain`` payload-content-type."
+msgstr ""
+"On Python 3, the KMS keymaster now works with secrets stored in Barbican "
+"with a ``text/plain`` payload-content-type."
+
+msgid "On Python 3, the formpost middleware now works with unicode file names."
+msgstr ""
+"On Python 3, the formpost middleware now works with Unicode file names."
+
+msgid ""
+"On newer kernels (3.15+ when using xfs), Swift will use the O_TMPFILE flag "
+"when opening a file instead of creating a temporary file and renaming it on "
+"commit. This makes the data path simpler and allows the filesystem to more "
+"efficiently optimize the files on disk, resulting in better performance."
+msgstr ""
+"On newer kernels (3.15+ when using xfs), Swift will use the O_TMPFILE flag "
+"when opening a file instead of creating a temporary file and renaming it on "
+"commit. This makes the data path simpler and allows the filesystem to more "
+"efficiently optimise the files on disk, resulting in better performance."
+
+msgid ""
+"On upgrade, a node configured with concurrency=N will still handle async "
+"updates N-at-a-time, but will do so using only one process instead of N."
+msgstr ""
+"On upgrade, a node configured with concurrency=N will still handle async "
+"updates N-at-a-time, but will do so using only one process instead of N."
+
+msgid ""
+"Operators should verify that encryption is not enabled in their reconciler "
+"pipelines; having it enabled there may harm data durability. For more "
+"information, see `bug 1910804 `__."
+msgstr ""
+"Operators should verify that encryption is not enabled in their reconciler "
+"pipelines; having it enabled there may harm data durability. For more "
+"information, see `bug 1910804 `__."
+
+msgid ""
+"Optimize the Erasure Code reconstructor protocol to reduce IO load on "
+"servers."
+msgstr ""
+"Optimise the Erasure Code reconstructor protocol to reduce I/O load on "
+"servers."
+
+msgid ""
+"Optimized the common case for hashing filesystem trees, thus eliminating a "
+"lot of extraneous disk I/O."
+msgstr ""
+"Optimised the common case for hashing filesystem trees, thus eliminating a "
+"lot of extraneous disk I/O."
+
+msgid ""
+"Ordinary objects in S3 use the MD5 of the object as the ETag, just like "
+"Swift. Multipart Uploads follow a different format, notably including a dash "
+"followed by the number of segments. To that end (and for S3 API requests "
+"*only*), SLO responses via the S3 API have a literal '-N' added on the end "
+"of the ETag."
+msgstr ""
+"Ordinary objects in S3 use the MD5 of the object as the ETag, just like "
+"Swift. Multipart Uploads follow a different format, notably including a dash "
+"followed by the number of segments. To that end (and for S3 API requests "
+"*only*), SLO responses via the S3 API have a literal '-N' added on the end "
+"of the ETag."
+
+msgid "Other Notes"
+msgstr "Other Notes"
+
+msgid ""
+"Overlapping shrinking shards no longer generate audit warnings; these are "
+"expected to sometimes overlap."
+msgstr ""
+"Overlapping shrinking shards no longer generate audit warnings; these are "
+"expected to sometimes overlap."
+
+msgid ""
+"PUT subrequests generated from a client-side COPY will now properly log the "
+"SSC (server-side copy) Swift source field. See https://docs.openstack.org/"
+"developer/swift/logs.html#swift-source for more information."
+msgstr ""
+"PUT sub-requests generated from a client-side COPY will now properly log the "
+"SSC (server-side copy) Swift source field. See https://docs.openstack.org/"
+"developer/swift/logs.html#swift-source for more information."
+
+msgid ""
+"Partition cleanup is more robust, decreasing the likelihood of leaving "
+"behind mostly-empty partitions from the old partition power."
+msgstr ""
+"Partition cleanup is more robust, decreasing the likelihood of leaving "
+"behind mostly-empty partitions from the old partition power."
+
+msgid "Partition power increase fixes:"
+msgstr "Partition power increase fixes:"
+
+msgid "Partition power increase improvements:"
+msgstr "Partition power increase improvements:"
+
+msgid ""
+"Partitions that encountered errors during relinking are no longer marked as "
+"completed in the relinker state file. This ensures that a subsequent relink "
+"will retry the failed partitions."
+msgstr ""
+"Partitions that encountered errors during relinking are no longer marked as "
+"completed in the relinker state file. This ensures that a subsequent relink "
+"will retry the failed partitions."
+
+msgid ""
+"Per-service ``auto_create_account_prefix`` settings are now deprecated and "
+"may be ignored in a future release; if you need to use this, please set it "
+"in the ``[swift-constraints]`` section of ``/etc/swift/swift.conf``."
+msgstr ""
+"Per-service ``auto_create_account_prefix`` settings are now deprecated and "
+"may be ignored in a future release; if you need to use this, please set it "
+"in the ``[swift-constraints]`` section of ``/etc/swift/swift.conf``."
+
+msgid ""
+"Pickle support has been removed from Swift's memcache client. Support had "
+"been deprecated since Swift 1.7.0."
+msgstr ""
+"Pickle support has been removed from Swift's memcache client. Support had "
+"been deprecated since Swift 1.7.0."
+
+msgid "Pike Series Release Notes"
+msgstr "Pike Series Release Notes"
+
+msgid ""
+"Preflight requests do not contain enough information to map a bucket to an "
+"account/container pair; a new cluster-wide option "
+"``cors_preflight_allow_origin`` may be configured for such OPTIONS requests. "
+"The default (blank) rejects all S3 preflight requests."
+msgstr ""
+"Preflight requests do not contain enough information to map a bucket to an "
+"account/container pair; a new cluster-wide option "
+"``cors_preflight_allow_origin`` may be configured for such OPTIONS requests. "
+"The default (blank) rejects all S3 preflight requests."
+
+msgid ""
+"Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. Previously, "
+"some versions of PyKMIP would include all wire data when the root logger was "
+"configured to log at DEBUG; this could expose key material in logs. Only the "
+"``kmip_keymaster`` was affected."
+msgstr ""
+"Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. Previously, "
+"some versions of PyKMIP would include all wire data when the root logger was "
+"configured to log at DEBUG; this could expose key material in logs. Only the "
+"``kmip_keymaster`` was affected."
+
+msgid ""
+"Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. Previously, "
+"some versions of PyKMIP would include all wire data when the root logger was "
+"configured to log at DEBUG; this could expose key material in logs. Only the "
+"kmip_keymaster was affected."
+msgstr ""
+"Prevent PyKMIP's kmip_protocol logger from logging at DEBUG. Previously, "
+"some versions of PyKMIP would include all wire data when the root logger was "
+"configured to log at DEBUG; this could expose key material in logs. Only the "
+"kmip_keymaster was affected."
+
+msgid ""
+"Prevent logged traceback in object-server on client disconnect for chunked "
+"transfers to replicated policies."
+msgstr ""
+"Prevent logged traceback in object-server on client disconnect for chunked "
+"transfers to replicated policies."
+
+msgid ""
+"Prevent object updates from auto-creating shard containers. This ensures "
+"more consistent listings for sharded containers during rebalances."
+msgstr ""
+"Prevent object updates from auto-creating shard containers. This ensures "
+"more consistent listings for sharded containers during rebalances."
+
+msgid ""
+"Prevent shard databases from losing track of their root database when "
+"deleted."
+msgstr ""
+"Prevent shard databases from losing track of their root database when "
+"deleted."
+
+msgid ""
+"Prevent sharded root databases from being reclaimed to ensure that shards "
+"can detect that they have been deleted."
+msgstr ""
+"Prevent sharded root databases from being reclaimed to ensure that shards "
+"can detect that they have been deleted."
+
+msgid ""
+"Previously, when deleting objects in multi-region swift deployment with "
+"write affinity configured, users always get 404 when deleting object before "
+"it's replicated to appropriate nodes."
+msgstr ""
+"Previously, when deleting objects in multi-region swift deployment with "
+"write affinity configured, users always get 404 when deleting object before "
+"it's replicated to appropriate nodes."
+
+msgid ""
+"Provide an S3 API compatibility layer. The external \"swift3\" project has "
+"been imported into Swift's codebase as the \"s3api\" middleware."
+msgstr ""
+"Provide an S3 API compatibility layer. The external \"swift3\" project has "
+"been imported into Swift's codebase as the \"s3api\" middleware."
+
+msgid ""
+"Provide useful status codes in logs for some versioning and symlink "
+"subrequests that were previously logged as 499."
+msgstr ""
+"Provide useful status codes in logs for some versioning and symlink "
+"subrequests that were previously logged as 499."
+
+msgid ""
+"Proxy logging for Complete Multipart Upload requests is now more consistent "
+"when requests have been retried."
+msgstr ""
+"Proxy logging for Complete Multipart Upload requests is now more consistent "
+"when requests have been retried."
+
+msgid ""
+"Proxy, account, container, and object servers now support \"seamless "
+"reloads\" via ``SIGUSR1``. This is similar to the existing graceful restarts "
+"but keeps the server socket open the whole time, reducing service downtime."
+msgstr ""
+"Proxy, account, container, and object servers now support \"seamless "
+"reloads\" via ``SIGUSR1``. This is similar to the existing graceful restarts "
+"but keeps the server socket open the whole time, reducing service downtime."
+
+msgid "Python 3 bug fixes:"
+msgstr "Python 3 bug fixes:"
+
+msgid "Python 3 fixes:"
+msgstr "Python 3 fixes:"
+
+msgid "Python 3.11 is now supported."
+msgstr "Python 3.11 is now supported."
+
+msgid ""
+"Python 3.6 and 3.7 are now fully supported. If you've been testing Swift on "
+"Python 3, upgrade at your earliest convenience."
+msgstr ""
+"Python 3.6 and 3.7 are now fully supported. If you've been testing Swift on "
+"Python 3, upgrade at your earliest convenience."
+
+msgid "Queens Series Release Notes"
+msgstr "Queens Series Release Notes"
+
+msgid "Reduced log noise for common ssync errors."
+msgstr "Reduced log noise for common ssync errors."
+
+msgid ""
+"Reduced object-replicator and object-reconstructor CPU usage by only "
+"checking that the device list is current when rings change."
+msgstr ""
+"Reduced object-replicator and object-reconstructor CPU usage by only "
+"checking that the device list is current when rings change."
+
+msgid ""
+"Reduced the backend load of making ``?versions`` requests to a container "
+"that has never had object versioning enabled."
+msgstr ""
+"Reduced the backend load of making ``?versions`` requests to a container "
+"that has never had object versioning enabled."
+
+msgid ""
+"Reduced the frequency of ``Reclaimable db stuck waiting for shrinking`` "
+"messages when a root DB has been deleted but its shards have not been shrunk "
+"away."
+msgstr ""
+"Reduced the frequency of ``Reclaimable db stuck waiting for shrinking`` "
+"messages when a root DB has been deleted but its shards have not been shrunk "
+"away."
+
+msgid ""
+"Region name config option is now respected when configuring S3 credential "
+"caching."
+msgstr ""
+"Region name config option is now respected when configuring S3 credential "
+"caching."
+
+msgid ""
+"Rehash partitions during the partition power increase. Previously, we relied "
+"on the replication engine to perform the rehash, which could cause an "
+"unexpected I/O spike after a partition power increase."
+msgstr ""
+"Rehash partitions during the partition power increase. Previously, we relied "
+"on the replication engine to perform the rehash, which could cause an "
+"unexpected I/O spike after a partition power increase."
+
+msgid ""
+"Remove ``swift-temp-url`` script. The functionality has been in swiftclient "
+"for a long time and this script has been deprecated since 2.10.0."
+msgstr ""
+"Remove ``swift-temp-url`` script. The functionality has been in swiftclient "
+"for a long time and this script has been deprecated since 2.10.0."
+
+msgid "Remove deprecated ``vm_test_mode`` option."
+msgstr "Remove deprecated ``vm_test_mode`` option."
+
+msgid "Remove empty db hash and suffix directories if a db gets quarantined."
+msgstr "Remove empty DB hash and suffix directories if a DB gets quarantined."
+
+msgid ""
+"Removed \"in-process-\" from func env tox name to work with upstream CI."
+msgstr ""
+"Removed \"in-process-\" from func env tox name to work with upstream CI."
+
+msgid ""
+"Removed a race condition that could cause newly-written data to not be "
+"linked into the new partition for the new partition power."
+msgstr ""
+"Removed a race condition that could cause newly-written data to not be "
+"linked into the new partition for the new partition power."
+
+msgid ""
+"Removed a race condition where a POST to an SLO could modify the X-Static-"
+"Large-Object metadata."
+msgstr ""
+"Removed a race condition where a POST to an SLO could modify the X-Static-"
+"Large-Object metadata."
+
+msgid ""
+"Removed a request-smuggling vector when running a mixed py2/py3 cluster."
+msgstr ""
+"Removed a request-smuggling vector when running a mixed py2/py3 cluster."
+
+msgid ""
+"Removed all ``post_as_copy`` related code and configs. The option has been "
+"deprecated since 2.13.0."
+msgstr ""
+"Removed all ``post_as_copy`` related code and configs. The option has been "
+"deprecated since 2.13.0."
+
+msgid ""
+"Removed per-device reconstruction stats. Now that the reconstructor is "
+"shuffling parts before going through them, those stats no longer make sense."
+msgstr ""
+"Removed per-device reconstruction stats. Now that the reconstructor is "
+"shuffling parts before going through them, those stats no longer make sense."
+
+msgid ""
+"Removed the hard dependency on netifaces; it may still be used if the "
+"``getifaddrs`` C function is not available. This fallback support may be "
+"removed in a future release."
+msgstr ""
+"Removed the hard dependency on netifaces; it may still be used if the "
+"``getifaddrs`` C function is not available. This fallback support may be "
+"removed in a future release."
+
+msgid ""
+"Replaced ``replication_one_per_device`` by custom count defined by "
+"``replication_concurrency_per_device``. The original config value is "
+"deprecated, but continues to function for now. If both values are defined, "
+"the old ``replication_one_per_device`` is ignored."
+msgstr ""
+"Replaced ``replication_one_per_device`` by custom count defined by "
+"``replication_concurrency_per_device``. The original config value is "
+"deprecated, but continues to function for now. If both values are defined, "
+"the old ``replication_one_per_device`` is ignored."
+
+msgid "Replication fixes:"
+msgstr "Replication fixes:"
+
+msgid "Replication improvements:"
+msgstr "Replication improvements:"
+
+msgid ""
+"Replication servers can now handle all request methods. This allows ssync to "
+"work with a separate replication network."
+msgstr ""
+"Replication servers can now handle all request methods. This allows ssync to "
+"work with a separate replication network."
+
+msgid ""
+"Requesting multiple ranges from a Dynamic Large Object now returns the "
+"entire object instead of incorrect data. This was previously fixed in 2.23.0."
+msgstr ""
+"Requesting multiple ranges from a Dynamic Large Object now returns the "
+"entire object instead of incorrect data. This was previously fixed in 2.23.0."
+
+msgid "Require that known-bad EC schemes be deprecated"
+msgstr "Require that known-bad EC schemes be deprecated"
+
+msgid "Respect server type for --md5 check in swift-recon."
+msgstr "Respect server type for --md5 check in swift-recon."
+
+msgid ""
+"Respond 400 Bad Request when Accept headers fail to parse instead of "
+"returning 406 Not Acceptable."
+msgstr ""
+"Respond 400 Bad Request when Accept headers fail to parse instead of "
+"returning 406 Not Acceptable."
+
+msgid ""
+"Return an error if the SLO manifest could not be parsed. Previously, a zero-"
+"byte response was returned."
+msgstr ""
+"Return an error if the SLO manifest could not be parsed. Previously, a zero-"
+"byte response was returned."
+
+msgid "Ring builder output tables better display weights over 1000."
+msgstr "Ring builder output tables better display weights over 1000."
+
+msgid ""
+"Ring files now include byteorder information about the endian of the machine "
+"used to generate the file, and the values are appropriately byteswapped if "
+"deserialized on a machine with a different endianness. Newly created ring "
+"files will be byteorder agnostic, but previously generated ring files will "
+"still fail on different endian architectures. Regenerating older ring files "
+"will cause them to become byteorder agnostic. The regeneration of the ring "
+"files will not cause any new data movement. Newer ring files will still be "
+"usable by older versions of Swift (on machines with the same endianness--"
+"this maintains existing behavior)."
+msgstr ""
+"Ring files now include byteorder information about the endian of the machine "
+"used to generate the file, and the values are appropriately byteswapped if "
+"deserialised on a machine with a different endianness. Newly created ring "
+"files will be byteorder agnostic, but previously generated ring files will "
+"still fail on different endian architectures. Regenerating older ring files "
+"will cause them to become byteorder agnostic. The regeneration of the ring "
+"files will not cause any new data movement. Newer ring files will still be "
+"usable by older versions of Swift (on machines with the same endianness--"
+"this maintains existing behaviour)."
+
+msgid ""
+"Rings with min_part_hours set to zero will now only move one partition "
+"replica per rebalance, thus matching behavior when min_part_hours is greater "
+"than zero."
+msgstr ""
+"Rings with min_part_hours set to zero will now only move one partition "
+"replica per rebalance, thus matching behaviour when min_part_hours is "
+"greater than zero."
+
+msgid "Rocky Series Release Notes"
+msgstr "Rocky Series Release Notes"
+
+msgid "S3 API compatibility updates"
+msgstr "S3 API compatibility updates"
+
+msgid "S3 API fixes:"
+msgstr "S3 API fixes:"
+
+msgid "S3 API improvements"
+msgstr "S3 API improvements"
+
+msgid "S3 API improvements:"
+msgstr "S3 API improvements:"
+
+msgid ""
+"S3 API now allows multipart uploads with non-ASCII characters in the object "
+"name."
+msgstr ""
+"S3 API now allows multipart uploads with non-ASCII characters in the object "
+"name."
+
+msgid ""
+"S3 API now translates ``503 Service Unavailable`` responses to a more S3-"
+"like response instead of raising an error."
+msgstr ""
+"S3 API now translates ``503 Service Unavailable`` responses to a more S3-"
+"like response instead of raising an error."
+
+msgid "S3 ETag for SLOs now include a '-'."
+msgstr "S3 ETag for SLOs now include a '-'."
+
+msgid "S3 requests are now less demanding on the container layer."
+msgstr "S3 requests are now less demanding on the container layer."
+
+msgid ""
+"SHA-1 signatures are now deprecated for the formpost and tempurl "
+"middlewares. At some point in the future, SHA-1 will no longer be enabled by "
+"default; eventually, support for it will be removed entirely."
+msgstr ""
+"SHA-1 signatures are now deprecated for the formpost and tempurl "
+"middlewares. At some point in the future, SHA-1 will no longer be enabled by "
+"default; eventually, support for it will be removed entirely."
+
+msgid ""
+"SLO manifest PUT requests can now be properly validated by sending an ETag "
+"header of the md5 sum of the concatenated md5 sums of the referenced "
+"segments."
+msgstr ""
+"SLO manifest PUT requests can now be properly validated by sending an ETag "
+"header of the MD5 sum of the concatenated MD5 sums of the referenced "
+"segments."
+
+msgid ""
+"SLO will now concurrently HEAD segments, resulting in much faster manifest "
+"validation and object creation. By default, two HEAD requests will be done "
+"at a time, but this can be changed by the operator via the new `concurrency` "
+"setting in the \"[filter:slo]\" section of the proxy server config."
+msgstr ""
+"SLO will now concurrently HEAD segments, resulting in much faster manifest "
+"validation and object creation. By default, two HEAD requests will be done "
+"at a time, but this can be changed by the operator via the new `concurrency` "
+"setting in the \"[filter:slo]\" section of the proxy server config."
+
+msgid ""
+"SSYNC replication mode now removes as much of the directory structure as "
+"possible as soon at it observes that the directory is empty. This reduces "
+"the work needed for subsequent replication passes."
+msgstr ""
+"SSYNC replication mode now removes as much of the directory structure as "
+"possible as soon at it observes that the directory is empty. This reduces "
+"the work needed for subsequent replication passes."
+
+msgid ""
+"Save the ring when dispersion improves, even if balance doesn't improve."
+msgstr ""
+"Save the ring when dispersion improves, even if balance doesn't improve."
+
+msgid "Security Issues"
+msgstr "Security Issues"
+
+msgid ""
+"See the provided ``keymaster.conf-sample`` for more information about this "
+"setting."
+msgstr ""
+"See the provided ``keymaster.conf-sample`` for more information about this "
+"setting."
+
+msgid "Send ETag header in 206 Partial Content responses to SLO reads."
+msgstr "Send ETag header in 206 Partial Content responses to SLO reads."
+
+msgid ""
+"Server errors encountered when validating the first segment of a Static or "
+"Dynamic Large Object now return a 503 to the client, rather than a 409."
+msgstr ""
+"Server errors encountered when validating the first segment of a Static or "
+"Dynamic Large Object now return a 503 to the client, rather than a 409."
+
+msgid ""
+"Server workers may now be gracefully terminated via ``SIGHUP`` or "
+"``SIGUSR1``. The parent process will then spawn a fresh worker."
+msgstr ""
+"Server workers may now be gracefully terminated via ``SIGHUP`` or "
+"``SIGUSR1``. The parent process will then spawn a fresh worker."
+
+msgid ""
+"Server-side copying of erasure-coded data to a replicated policy no longer "
+"copies EC sysmeta. The previous behavior had no material effect, but could "
+"confuse operators examining data on disk."
+msgstr ""
+"Server-side copying of erasure-coded data to a replicated policy no longer "
+"copies EC sysmeta. The previous behaviour had no material effect but could "
+"confuse operators examining data on disk."
+
+msgid ""
+"Servers now open one listen socket per worker, ensuring each worker serves "
+"roughly the same number of concurrent connections."
+msgstr ""
+"Servers now open one listen socket per worker, ensuring each worker serves "
+"roughly the same number of concurrent connections."
+
+msgid ""
+"Several fixes to prepare for Python 3.12 support. While not yet tested in "
+"the gate, initial manual testing looks promising."
+msgstr ""
+"Several fixes to prepare for Python 3.12 support. While not yet tested in "
+"the gate, initial manual testing looks promising."
+
+msgid "Several utility scripts now work better on Python 3:"
+msgstr "Several utility scripts now work better on Python 3:"
+
+msgid ""
+"Shard ranges with no rows to cleave could previously be left in the CREATED "
+"state after cleaving. Now, they are advanced to CLEAVED."
+msgstr ""
+"Shard ranges with no rows to cleave could previously be left in the CREATED "
+"state after cleaving. Now, they are advanced to CLEAVED."
+
+msgid ""
+"Sharding cycle time and last-completion time are now available via swift-"
+"recon."
+msgstr ""
+"Sharding cycle time and last-completion time are now available via swift-"
+"recon."
+
+msgid "Sharding fixes"
+msgstr "Sharding fixes"
+
+msgid "Sharding fixes:"
+msgstr "Sharding fixes:"
+
+msgid "Sharding improvements"
+msgstr "Sharding improvements"
+
+msgid "Sharding improvements:"
+msgstr "Sharding improvements:"
+
+msgid ""
+"Sharding metadata is no longer cleared when databases are deleted. This "
+"could previously cause deleted shards that still had rows to become stuck "
+"and never move them to the correct database."
+msgstr ""
+"Sharding metadata is no longer cleared when databases are deleted. This "
+"could previously cause deleted shards that still had rows to become stuck "
+"and never move them to the correct database."
+
+msgid ""
+"Shards no longer report stats to the root database when they are in the "
+"``CREATED`` state."
+msgstr ""
+"Shards no longer report stats to the root database when they are in the "
+"``CREATED`` state."
+
+msgid ""
+"Shuffle object-updater work. This somewhat reduces the impact a single "
+"overloaded database has on other containers' listings."
+msgstr ""
+"Shuffle object-updater work. This somewhat reduces the impact a single "
+"overloaded database has on other containers' listings."
+
+msgid ""
+"Signal handling is more consistently logged at notice level. Previously, "
+"signal handling would sometimes be logged at info or error levels."
+msgstr ""
+"Signal handling is more consistently logged at notice level. Previously, "
+"signal handling would sometimes be logged at info or error levels."
+
+msgid ""
+"Significant improvements to the api-ref doc available at http://developer."
+"openstack.org/api-ref/object-storage/."
+msgstr ""
+"Significant improvements to the api-ref doc available at http://developer."
+"openstack.org/api-ref/object-storage/."
+
+msgid ""
+"Similar to above, ``expansion_limit`` was added as an absolute-size "
+"replacement for the now-deprecated ``shard_shrink_merge_point`` "
+"configuration option."
+msgstr ""
+"Similar to above, ``expansion_limit`` was added as an absolute-size "
+"replacement for the now-deprecated ``shard_shrink_merge_point`` "
+"configuration option."
+
+msgid ""
+"Ssync no longer corrupts unencrypted non-ASCII metadata during transfers."
+msgstr ""
+"Ssync no longer corrupts unencrypted non-ASCII metadata during transfers."
+
+msgid ""
+"Static Large Object (SLO) manifest may now (again) have zero-byte last "
+"segments."
+msgstr ""
+"Static Large Object (SLO) manifest may now (again) have zero-byte last "
+"segments."
+
+msgid ""
+"Static Large Object sizes in listings for versioned containers are now more "
+"accurate."
+msgstr ""
+"Static Large Object sizes in listings for versioned containers are now more "
+"accurate."
+
+msgid "Static large object fixes:"
+msgstr "Static large object fixes:"
+
+msgid ""
+"Staticweb correctly handles listings when paths include non-ASCII characters."
+msgstr ""
+"Staticweb correctly handles listings when paths include non-ASCII characters."
+
+msgid "Stein Series Release Notes"
+msgstr "Stein Series Release Notes"
+
+msgid ""
+"Stop and disable the object-reconstructor before upgrading. This ensures no "
+"upgraded object server starts writing new fragments that old object servers "
+"would quarantine."
+msgstr ""
+"Stop and disable the object-reconstructor before upgrading. This ensures no "
+"upgraded object server starts writing new fragments that old object servers "
+"would quarantine."
+
+msgid ""
+"Stop logging tracebacks in the ``object-replicator`` when it runs out of "
+"handoff locations."
+msgstr ""
+"Stop logging tracebacks in the ``object-replicator`` when it runs out of "
+"handoff locations."
+
+msgid "Stopped logging tracebacks when receiving an unexpected response."
+msgstr "Stopped logging tracebacks when receiving an unexpected response."
+
+msgid ""
+"Storage policy definitions in swift.conf can now define the diskfile to use "
+"to access objects. See the included swift.conf-sample file for a description "
+"of usage."
+msgstr ""
+"Storage policy definitions in swift.conf can now define the diskfile to use "
+"to access objects. See the included swift.conf-sample file for a description "
+"of usage."
+
+msgid "Support multi-range GETs for static large objects."
+msgstr "Support multi-range GETs for static large objects."
+
+msgid "Suppress unexpected-file warnings for rsync temp files."
+msgstr "Suppress unexpected-file warnings for rsync temp files."
+
+msgid "Suppressed ``RemoteDisconnected`` tracebacks."
+msgstr "Suppressed ``RemoteDisconnected`` tracebacks."
+
+msgid "Suppressed the KeyError message when auditor finds an expired object."
+msgstr "Suppressed the KeyError message when auditor finds an expired object."
+
+msgid "Swift Release Notes"
+msgstr "Swift Release Notes"
+
+msgid ""
+"Swift can now cache the S3 secret from Keystone to use for subsequent "
+"requests. This functionality is disabled by default but can be enabled by "
+"setting the ``secret_cache_duration`` in the ``[filter:s3token]`` section of "
+"the proxy server config to a number greater than 0."
+msgstr ""
+"Swift can now cache the S3 secret from Keystone to use for subsequent "
+"requests. This functionality is disabled by default but can be enabled by "
+"setting the ``secret_cache_duration`` in the ``[filter:s3token]`` section of "
+"the proxy server config to a number greater than 0."
+
+msgid ""
+"Swift now returns a 503 (instead of a 500) when an account auto-create fails."
+msgstr ""
+"Swift now returns a 503 (instead of a 500) when an account auto-create fails."
+
+msgid "Swift path and on-disk path are now included with all sharder logging."
+msgstr "Swift path and on-disk path are now included with all sharder logging."
+
+msgid ""
+"Swift-all-in-one Docker images are now built and published to https://hub."
+"docker.com/r/openstackswift/saio. These are intended for use as development "
+"targets, but will hopefully be useful as a starting point for other work "
+"involving containerizing Swift."
+msgstr ""
+"Swift-all-in-one Docker images are now built and published to https://hub."
+"docker.com/r/openstackswift/saio. These are intended for use as development "
+"targets, but will hopefully be useful as a starting point for other work "
+"involving containerizing Swift."
+
+msgid ""
+"Symlink objects reference one other object. They are created by creating an "
+"empty object with an X-Symlink-Target header. The value of the header is of "
+"the format /, and the target does not need to exist at "
+"the time of symlink creation. Cross-account symlinks can be created by "
+"including the X-Symlink-Target-Account header."
+msgstr ""
+"Symlink objects reference one other object. They are created by creating an "
+"empty object with an X-Symlink-Target header. The value of the header is of "
+"the format /, and the target does not need to exist at "
+"the time of symlink creation. Cross-account symlinks can be created by "
+"including the X-Symlink-Target-Account header."
+
+msgid ""
+"TempURLs now support IP range restrictions. Please see https://docs."
+"openstack.org/swift/latest/middleware.html#client-usage for more information "
+"on how to use this additional restriction."
+msgstr ""
+"TempURLs now support IP range restrictions. Please see https://docs."
+"openstack.org/swift/latest/middleware.html#client-usage for more information "
+"on how to use this additional restriction."
+
+msgid ""
+"TempURLs now support a validation against a common prefix. A prefix-based "
+"signature grants access to all objects which share the same prefix. This "
+"avoids the creation of a large amount of signatures, when a whole container "
+"or pseudofolder is shared."
+msgstr ""
+"TempURLs now support a validation against a common prefix. A prefix-based "
+"signature grants access to all objects which share the same prefix. This "
+"avoids the creation of a large amount of signatures, when a whole container "
+"or pseudofolder is shared."
+
+msgid ""
+"TempURLs using the \"inline\" parameter can now also set the \"filename\" "
+"parameter. Both are used in the Content-Disposition response header."
+msgstr ""
+"TempURLs using the \"inline\" parameter can now also set the \"filename\" "
+"parameter. Both are used in the Content-Disposition response header."
+
+msgid ""
+"Temporary URLs now support one common form of ISO 8601 timestamps in "
+"addition to Unix seconds-since-epoch timestamps. The ISO 8601 format "
+"accepted is '%Y-%m-%dT%H:%M:%SZ'. This makes TempURLs more user-friendly to "
+"produce and consume."
+msgstr ""
+"Temporary URLs now support one common form of ISO 8601 timestamps in "
+"addition to Unix seconds-since-epoch timestamps. The ISO 8601 format "
+"accepted is '%Y-%m-%dT%H:%M:%SZ'. This makes TempURLs more user-friendly to "
+"produce and consume."
+
+msgid ""
+"The 'compact' command now outputs the total number of compactible sequences."
+msgstr ""
+"The 'compact' command now outputs the total number of compactible sequences."
+
+msgid ""
+"The EC reconstructor process has been dramatically improved by adding "
+"support for multiple concurrent workers. Multiple processes are required to "
+"get high concurrency, and this change results in much faster rebalance times "
+"on servers with many drives."
+msgstr ""
+"The EC reconstructor process has been dramatically improved by adding "
+"support for multiple concurrent workers. Multiple processes are required to "
+"get high concurrency, and this change results in much faster rebalance times "
+"on servers with many drives."
+
+msgid ""
+"The EC reconstructor will now attempt to remove empty directories "
+"immediately, while the inodes are still cached, rather than waiting until "
+"the next run."
+msgstr ""
+"The EC reconstructor will now attempt to remove empty directories "
+"immediately, while the inodes are still cached, rather than waiting until "
+"the next run."
+
+msgid "The ETag-quoting middleware no longer raises TypeErrors."
+msgstr "The ETag-quoting middleware no longer raises TypeErrors."
+
+msgid "The S3 ACL and Delete Multiple APIs are now less case-sensitive."
+msgstr "The S3 ACL and Delete Multiple APIs are now less case-sensitive."
+
+msgid ""
+"The ``container-replicator`` now correctly enqueues ``container-reconciler`` "
+"work for sharded containers."
+msgstr ""
+"The ``container-replicator`` now correctly enqueues ``container-reconciler`` "
+"work for sharded containers."
+
+msgid ""
+"The ``container-replicator`` now only attempts to fetch shard ranges if the "
+"remote indicates that it has shard ranges. Further, it does so with a "
+"timeout to prevent the process from hanging in certain cases."
+msgstr ""
+"The ``container-replicator`` now only attempts to fetch shard ranges if the "
+"remote indicates that it has shard ranges. Further, it does so with a "
+"timeout to prevent the process from hanging in certain cases."
+
+msgid ""
+"The ``domain_remap`` middleware now supports the ``mangle_client_paths`` "
+"option. Its default \"false\" value changes ``domain_remap`` parsing to stop "
+"stripping the ``path_root`` value from URL paths. If users depend on this "
+"path mangling, operators should set ``mangle_client_paths`` to \"True\" "
+"before upgrading."
+msgstr ""
+"The ``domain_remap`` middleware now supports the ``mangle_client_paths`` "
+"option. Its default \"false\" value changes ``domain_remap`` parsing to stop "
+"stripping the ``path_root`` value from URL paths. If users depend on this "
+"path mangling, operators should set ``mangle_client_paths`` to \"True\" "
+"before upgrading."
+
+msgid ""
+"The ``kmip_keymaster`` middleware can now be configured directly in the "
+"proxy-server config file. The existing behavior of using an external config "
+"file is still supported."
+msgstr ""
+"The ``kmip_keymaster`` middleware can now be configured directly in the "
+"proxy-server config file. The existing behaviour of using an external config "
+"file is still supported."
+
+msgid ""
+"The ``object-expirer`` may now be configured in ``object-server.conf``. This "
+"is in anticipation of a future change to allow the ``object-expirer`` to be "
+"deployed on all nodes that run the ``object-server``."
+msgstr ""
+"The ``object-expirer`` may now be configured in ``object-server.conf``. This "
+"is in anticipation of a future change to allow the ``object-expirer`` to be "
+"deployed on all nodes that run the ``object-server``."
+
+msgid ""
+"The ``proxy-server`` now caches 'updating' shards, improving write "
+"performance for sharded containers. A new config option, "
+"``recheck_updating_shard_ranges``, controls the cache time; set it to 0 to "
+"disable caching."
+msgstr ""
+"The ``proxy-server`` now caches 'updating' shards, improving write "
+"performance for sharded containers. A new config option, "
+"``recheck_updating_shard_ranges``, controls the cache time; set it to 0 to "
+"disable caching."
+
+msgid ""
+"The ``proxy-server`` now ignores 404 responses from handoffs that have no "
+"data when deciding on the correct response for object requests, similar to "
+"what it already does for account and container requests."
+msgstr ""
+"The ``proxy-server`` now ignores 404 responses from handoffs that have no "
+"data when deciding on the correct response for object requests, similar to "
+"what it already does for account and container requests."
+
+msgid ""
+"The ``proxy-server`` now ignores 404 responses from handoffs without "
+"databases when deciding on the correct response for account and container "
+"requests."
+msgstr ""
+"The ``proxy-server`` now ignores 404 responses from handoffs without "
+"databases when deciding on the correct response for account and container "
+"requests."
+
+msgid ""
+"The ``recoverable_node_timeout`` option no longer applies to ``X-Newest`` "
+"GET requests."
+msgstr ""
+"The ``recoverable_node_timeout`` option no longer applies to ``X-Newest`` "
+"GET requests."
+
+msgid ""
+"The ``swift-drive-audit`` tool now works with ISO timestamps in kernel logs."
+msgstr ""
+"The ``swift-drive-audit`` tool now works with ISO timestamps in kernel logs."
+
+msgid ""
+"The ``swift-manage-shard-ranges`` tool has a new mode to repair gaps in the "
+"namespace."
+msgstr ""
+"The ``swift-manage-shard-ranges`` tool has a new mode to repair gaps in the "
+"namespace."
+
+msgid "The ``swift-recon-cron`` tool now better handles missing directories."
+msgstr "The ``swift-recon-cron`` tool now better handles missing directories."
+
+msgid ""
+"The above bug was caused by a difference in string types that resulted in "
+"ambiguity when decrypting. To prevent the ambiguity for new data, set "
+"``meta_version_to_write = 3`` in your keymaster configuration *after* "
+"upgrading all proxy servers."
+msgstr ""
+"The above bug was caused by a difference in string types that resulted in "
+"ambiguity when decrypting. To prevent the ambiguity for new data, set "
+"``meta_version_to_write = 3`` in your keymaster configuration *after* "
+"upgrading all proxy servers."
+
+msgid ""
+"The account and container auditors now log and update recon before going to "
+"sleep."
+msgstr ""
+"The account and container auditors now log and update recon before going to "
+"sleep."
+
+msgid ""
+"The bulk extract middleware once again allows clients to specify metadata "
+"(including expiration timestamps) for all objects in the archive."
+msgstr ""
+"The bulk extract middleware once again allows clients to specify metadata "
+"(including expiration timestamps) for all objects in the archive."
+
+msgid ""
+"The concurrent read options (``concurrent_gets``, ``concurrency_timeout``, "
+"and ``concurrent_ec_extra_requests``) may now be configured per storage-"
+"policy."
+msgstr ""
+"The concurrent read options (``concurrent_gets``, ``concurrency_timeout``, "
+"and ``concurrent_ec_extra_requests``) may now be configured per storage-"
+"policy."
+
+msgid ""
+"The container sharder can now handle containers with special characters in "
+"their names."
+msgstr ""
+"The container sharder can now handle containers with special characters in "
+"their names."
+
+msgid ""
+"The container-reconciler now scales out better with new ``processes``, "
+"``process``, and ``concurrency`` options, similar to the object-expirer."
+msgstr ""
+"The container-reconciler now scales out better with new ``processes``, "
+"``process``, and ``concurrency`` options, similar to the object-expirer."
+
+msgid ""
+"The container-updater now reports zero objects and bytes used for child DBs "
+"in sharded containers. This prevents double-counting in utilization reports."
+msgstr ""
+"The container-updater now reports zero objects and bytes used for child DBs "
+"in sharded containers. This prevents double-counting in utilisation reports."
+
+msgid "The correct storage policy is now logged for S3 requests."
+msgstr "The correct storage policy is now logged for S3 requests."
+
+msgid ""
+"The dark-data audit watcher now requires that all primary locations for an "
+"object's container agree that the data does not appear in listings to "
+"consider data \"dark\". Previously, a network partition that left an object "
+"node isolated could cause it to quarantine or delete all of its data."
+msgstr ""
+"The dark-data audit watcher now requires that all primary locations for an "
+"object's container agree that the data does not appear in listings to "
+"consider data \"dark\". Previously, a network partition that left an object "
+"node isolated could cause it to quarantine or delete all of its data."
+
+msgid ""
+"The dark-data audit watcher now skips objects younger than a new "
+"configurable ``grace_age`` period. This avoids issues where data could be "
+"flagged, quarantined, or deleted because of listing consistency issues. The "
+"default is one week."
+msgstr ""
+"The dark-data audit watcher now skips objects younger than a new "
+"configurable ``grace_age`` period. This avoids issues where data could be "
+"flagged, quarantined, or deleted because of listing consistency issues. The "
+"default is one week."
+
+msgid ""
+"The dark-data object audit watcher now works with sharded containers. "
+"Previously, it would think that all data files were absent from listings."
+msgstr ""
+"The dark-data object audit watcher now works with sharded containers. "
+"Previously, it would think that all data files were absent from listings."
+
+msgid ""
+"The default for `object_post_as_copy` has been changed to False. The option "
+"is now deprecated and will be removed in a future release. If your cluster "
+"is still running with post-as-copy enabled, please update it to use the "
+"\"fast-post\" method. Future versions of Swift will not support post-as-"
+"copy, and future features will not be supported under post-as-copy. (\"Fast-"
+"post\" is where `object_post_as_copy` is false)."
+msgstr ""
+"The default for `object_post_as_copy` has been changed to False. The option "
+"is now deprecated and will be removed in a future release. If your cluster "
+"is still running with post-as-copy enabled, please update it to use the "
+"\"fast-post\" method. Future versions of Swift will not support post-as-"
+"copy, and future features will not be supported under post-as-copy. (\"Fast-"
+"post\" is where `object_post_as_copy` is false)."
+
+msgid ""
+"The default location is now set to \"us-east-1\". This is more likely to be "
+"the default region that a client will try when using v4 signatures."
+msgstr ""
+"The default location is now set to \"us-east-1\". This is more likely to be "
+"the default region that a client will try when using v4 signatures."
+
+msgid ""
+"The erasure code reconstructor `handoffs_first` option has been deprecated "
+"in favor of `handoffs_only`. `handoffs_only` is far more useful, and just "
+"like `handoffs_first` mode in the replicator, it gives the operator the "
+"option of forcing the consistency engine to focus solely on revert (handoff) "
+"jobs, thus improving the speed of rebalances.  The `handoffs_only` behavior "
+"is somewhat consistent with the replicator's `handoffs_first` option (any "
+"error on any handoff in the replicator will make it essentially handoff only "
+"forever) but the `handoff_only` option does what you want and is named "
+"correctly in the reconstructor."
+msgstr ""
+"The erasure code reconstructor `handoffs_first` option has been deprecated "
+"in favour of `handoffs_only`. `handoffs_only` is far more useful, and just "
+"like `handoffs_first` mode in the replicator, it gives the operator the "
+"option of forcing the consistency engine to focus solely on revert (handoff) "
+"jobs, thus improving the speed of rebalances.  The `handoffs_only` behaviour "
+"is somewhat consistent with the replicator's `handoffs_first` option (any "
+"error on any hand-off in the replicator will make it essentially hand-off "
+"only forever) but the `handoff_only` option does what you want and is named "
+"correctly in the reconstructor."
+
+msgid ""
+"The erasure code reconstructor will now shuffle work jobs across all disks "
+"instead of going disk-by-disk. This eliminates single-disk I/O contention "
+"and allows continued scaling as concurrency is increased."
+msgstr ""
+"The erasure code reconstructor will now shuffle work jobs across all disks "
+"instead of going disk-by-disk. This eliminates single-disk I/O contention "
+"and allows continued scaling as concurrency is increased."
+
+msgid ""
+"The formpost digest algorithm is now configurable via the new "
+"``allowed_digests`` option, and support is added for both SHA-256 and "
+"SHA-512. Supported formpost digests are exposed to clients in ``/info``. "
+"Additionally, formpost signatures can now be base64 encoded."
+msgstr ""
+"The formpost digest algorithm is now configurable via the new "
+"``allowed_digests`` option, and support is added for both SHA-256 and "
+"SHA-512. Supported formpost digests are exposed to clients in ``/info``. "
+"Additionally, formpost signatures can now be base64 encoded."
+
+msgid ""
+"The formpost middleware now properly supports uploading multiple files with "
+"different content-types."
+msgstr ""
+"The formpost middleware now properly supports uploading multiple files with "
+"different content-types."
+
+msgid "The formpost middleware now works with unicode file names."
+msgstr "The formpost middleware now works with Unicode file names."
+
+msgid ""
+"The improvements to EC reads made in Swift 2.10.0 have also been applied to "
+"the reconstructor. This allows fragments to be rebuilt in more "
+"circumstances, resulting in faster recovery from failures."
+msgstr ""
+"The improvements to EC reads made in Swift 2.10.0 have also been applied to "
+"the reconstructor. This allows fragments to be rebuilt in more "
+"circumstances, resulting in faster recovery from failures."
+
+msgid ""
+"The message template for proxy logging may now include a ``{domain}`` field "
+"for the client-provided ``Host`` header."
+msgstr ""
+"The message template for proxy logging may now include a ``{domain}`` field "
+"for the client-provided ``Host`` header."
+
+msgid ""
+"The more-efficient shard range structure from the last release is now used "
+"when fetching ranges from the backend."
+msgstr ""
+"The more-efficient shard range structure from the last release is now used "
+"when fetching ranges from the backend."
+
+msgid ""
+"The number of container updates on object PUTs (ie to update listings) has "
+"been recomputed to be far more efficient  while maintaining durability "
+"guarantees. Specifically, object PUTs to erasure-coded policies will now "
+"normally result in far fewer container updates."
+msgstr ""
+"The number of container updates on object PUTs (ie to update listings) has "
+"been recomputed to be far more efficient  while maintaining durability "
+"guarantees. Specifically, object PUTs to erasure-coded policies will now "
+"normally result in far fewer container updates."
+
+msgid ""
+"The object and container server config option ``slowdown`` has been "
+"deprecated in favor of the new ``objects_per_second`` and "
+"``containers_per_second`` options."
+msgstr ""
+"The object and container server config option ``slowdown`` has been "
+"deprecated in favour of the new ``objects_per_second`` and "
+"``containers_per_second`` options."
+
+msgid ""
+"The object reconstructor can now rebuild an EC fragment for an expired "
+"object."
+msgstr ""
+"The object reconstructor can now rebuild an EC fragment for an expired "
+"object."
+
+msgid ""
+"The object reconstructor will now fork all available worker processes when "
+"operating on a subset of local devices."
+msgstr ""
+"The object reconstructor will now fork all available worker processes when "
+"operating on a subset of local devices."
+
+msgid ""
+"The object server runs certain IO-intensive methods outside the main pthread "
+"for performance. Previously, if one of those methods tried to log, this can "
+"cause a crash that eventually leads to an object server with hundreds or "
+"thousands of greenthreads, all deadlocked. The fix is to use a mutex that "
+"works across different greenlets and different pthreads."
+msgstr ""
+"The object server runs certain IO-intensive methods outside the main pthread "
+"for performance. Previously, if one of those methods tried to log, this can "
+"cause a crash that eventually leads to an object server with hundreds or "
+"thousands of greenthreads, all deadlocked. The fix is to use a mutex that "
+"works across different greenlets and different pthreads."
+
+msgid ""
+"The object updater now supports two configuration settings: \"concurrency\" "
+"and \"updater_workers\". The latter controls how many worker processes are "
+"spawned, while the former controls how many concurrent container updates are "
+"performed by each worker process. This should speed the processing of "
+"async_pendings."
+msgstr ""
+"The object updater now supports two configuration settings: \"concurrency\" "
+"and \"updater_workers\". The latter controls how many worker processes are "
+"spawned, while the former controls how many concurrent container updates are "
+"performed by each worker process. This should speed the processing of "
+"async_pendings."
+
+msgid "The object-expirer logs fewer client disconnects."
+msgstr "The object-expirer logs fewer client disconnects."
+
+msgid ""
+"The object-expirer now only cleans up empty containers. Previously, it would "
+"attempt to delete all processed containers, regardless of whether there were "
+"entries which were skipped or had errors."
+msgstr ""
+"The object-expirer now only cleans up empty containers. Previously, it would "
+"attempt to delete all processed containers, regardless of whether there were "
+"entries which were skipped or had errors."
+
+msgid ""
+"The object-replicator now logs successful rsync transfers at debug instead "
+"of info."
+msgstr ""
+"The object-replicator now logs successful rsync transfers at debug instead "
+"of info."
+
+msgid ""
+"The object-updater now defers rate-limited updates to the end of its cycle; "
+"these deferred updates will be processed (at the limited rate) until the "
+"configured ``interval`` elapses. A new ``max_deferred_updates`` option may "
+"be used to bound the deferral queue."
+msgstr ""
+"The object-updater now defers rate-limited updates to the end of its cycle; "
+"these deferred updates will be processed (at the limited rate) until the "
+"configured ``interval`` elapses. A new ``max_deferred_updates`` option may "
+"be used to bound the deferral queue."
+
+msgid ""
+"The output of devices from ``swift-ring-builder`` has been reordered by "
+"region, zone, ip, and device."
+msgstr ""
+"The output of devices from ``swift-ring-builder`` has been reordered by "
+"region, zone, ip, and device."
+
+msgid ""
+"The post-rsync REPLICATE call no longer recalculates hashes immediately."
+msgstr ""
+"The post-rsync REPLICATE call no longer recalculates hashes immediately."
+
+msgid ""
+"The proxy server now applies error-limiting to the correct node when "
+"handling a recoverable node error."
+msgstr ""
+"The proxy server now applies error-limiting to the correct node when "
+"handling a recoverable node error."
+
+msgid ""
+"The proxy-server now caches 'listing' shards, improving listing performance "
+"for sharded containers. A new config option, "
+"``recheck_listing_shard_ranges``, controls the cache time and defaults to 10 "
+"minutes; set it to 0 to disable caching (the previous behavior)."
+msgstr ""
+"The proxy-server now caches 'listing' shards, improving listing performance "
+"for sharded containers. A new config option, "
+"``recheck_listing_shard_ranges``, controls the cache time and defaults to 10 "
+"minutes; set it to 0 to disable caching (the previous behaviour)."
+
+msgid ""
+"The reconciler now defers working on policies that have a partition power "
+"increase in progress to avoid issues with concurrent writes."
+msgstr ""
+"The reconciler now defers working on policies that have a partition power "
+"increase in progress to avoid issues with concurrent writes."
+
+msgid ""
+"The relinker better handles data found from earlier partition power "
+"increases."
+msgstr ""
+"The relinker better handles data found from earlier partition power "
+"increases."
+
+msgid ""
+"The relinker better handles tombstones found for the same object but with "
+"different inodes."
+msgstr ""
+"The relinker better handles tombstones found for the same object but with "
+"different inodes."
+
+msgid ""
+"The relinker can now target specific storage policies or partitions by using "
+"the new ``--policy`` and ``--partition`` options."
+msgstr ""
+"The relinker can now target specific storage policies or partitions by using "
+"the new ``--policy`` and ``--partition`` options."
+
+msgid ""
+"The relinker now performs eventlet-hub selection the same way as other "
+"daemons. In particular, ``epolls`` will no longer be selected, as it seemed "
+"to cause occassional hangs."
+msgstr ""
+"The relinker now performs eventlet-hub selection the same way as other "
+"daemons. In particular, ``epolls`` will no longer be selected, as it seemed "
+"to cause occasional hangs."
+
+msgid ""
+"The relinker now spawns multiple subprocesses to process disks in parallel. "
+"By default, one worker is spawned per disk; use the new ``--workers`` option "
+"to control how many subprocesses are used. Use ``--workers=0`` to maintain "
+"the previous behavior."
+msgstr ""
+"The relinker now spawns multiple subprocesses to process disks in parallel. "
+"By default, one worker is spawned per disk; use the new ``--workers`` option "
+"to control how many subprocesses are used. Use ``--workers=0`` to maintain "
+"the previous behaviour."
+
+msgid ""
+"The sharder and swift-manage-shard-ranges now consider total row count "
+"(instead of just object count) when deciding whether a shard is a candidate "
+"for shrinking."
+msgstr ""
+"The sharder and swift-manage-shard-ranges now consider total row count "
+"(instead of just object count) when deciding whether a shard is a candidate "
+"for shrinking."
+
+msgid ""
+"The sharder daemon has been enhanced to better support the shrinking of "
+"shards that are no longer required. Shard containers will now discover from "
+"their root container if they should be shrinking. They will also discover "
+"the shards into which they should shrink, which may include the root "
+"container itself."
+msgstr ""
+"The sharder daemon has been enhanced to better support the shrinking of "
+"shards that are no longer required. Shard containers will now discover from "
+"their root container if they should be shrinking. They will also discover "
+"the shards into which they should shrink, which may include the root "
+"container itself."
+
+msgid ""
+"The sharder now correctly identifies and fails audits for shard ranges that "
+"overlap exactly."
+msgstr ""
+"The sharder now correctly identifies and fails audits for shard ranges that "
+"overlap exactly."
+
+msgid ""
+"The sharding progress reports in recon cache now continue to be included for "
+"a period of time after sharding has completed. The time period may be "
+"configured using the ``recon_sharded_timeout`` option in the ``[container-"
+"sharder]`` section of container-server.conf, and defaults to 12 hours."
+msgstr ""
+"The sharding progress reports in recon cache now continue to be included for "
+"a period of time after sharding has completed. The time period may be "
+"configured using the ``recon_sharded_timeout`` option in the ``[container-"
+"sharder]`` section of container-server.conf, and defaults to 12 hours."
+
+msgid ""
+"The standard-library ``logging`` module is no longer monkey-patched when "
+"importing ``swift.common.utils``, making it easier to re-use swift code in "
+"other contexts."
+msgstr ""
+"The standard-library ``logging`` module is no longer monkey-patched when "
+"importing ``swift.common.utils``, making it easier to re-use Swift code in "
+"other contexts."
+
+msgid ""
+"The staticweb middleware now allows empty listings at the root of a "
+"container. Previously, this would result in a 404 response."
+msgstr ""
+"The staticweb middleware now allows empty listings at the root of a "
+"container. Previously, this would result in a 404 response."
+
+msgid ""
+"The structure of cached shard ranges has changed, improving performance when "
+"listing or writing to sharded containers. Note that immediately after "
+"upgrade, the new structures will all be cache misses, which may lead to a "
+"thundering herd problem. To avoid this, upgrade just a few nodes first, let "
+"them service some fraction of traffic to populate the cache, then upgrade "
+"the rest of the cluster."
+msgstr ""
+"The structure of cached shard ranges has changed, improving performance when "
+"listing or writing to sharded containers. Note that immediately after "
+"upgrade, the new structures will all be cache misses, which may lead to a "
+"thundering herd problem. To avoid this, upgrade just a few nodes first, let "
+"them service some fraction of traffic to populate the cache, then upgrade "
+"the rest of the cluster."
+
+msgid ""
+"The tempurl digest algorithm is now configurable, and Swift added support "
+"for both SHA-256 and SHA-512. Supported tempurl digests are exposed to "
+"clients in ``/info``. Additionally, tempurl signatures can now be base64 "
+"encoded."
+msgstr ""
+"The tempurl digest algorithm is now configurable, and Swift added support "
+"for both SHA-256 and SHA-512. Supported tempurl digests are exposed to "
+"clients in ``/info``. Additionally, tempurl signatures can now be base64 "
+"encoded."
+
+msgid ""
+"The tempurl middleware has been updated to return a 503 if storing a token "
+"in memcache fails. Third party authentication middlewares are encouraged to "
+"also use the new ``raise_on_error`` keyword argument when storing ephemeral "
+"tokens in memcache."
+msgstr ""
+"The tempurl middleware has been updated to return a 503 if storing a token "
+"in memcache fails. Third party authentication middlewares are encouraged to "
+"also use the new ``raise_on_error`` keyword argument when storing ephemeral "
+"tokens in memcache."
+
+msgid "This is the final stable branch that will support Python 2.7."
+msgstr "This is the final stable branch that will support Python 2.7."
+
+msgid ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute."
+msgstr ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute."
+
+msgid ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute. This prevents excessive IO on a new cluster."
+msgstr ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute. This prevents excessive IO on a new cluster."
+
+msgid "Train Series Release Notes"
+msgstr "Train Series Release Notes"
+
+msgid "Transaction IDs are included in more error responses."
+msgstr "Transaction IDs are included in more error responses."
+
+msgid ""
+"Transaction IDs are now only included in daemon log lines in a request/"
+"response context."
+msgstr ""
+"Transaction IDs are now only included in daemon log lines in a request/"
+"response context."
+
+msgid "Truncate error logs to prevent log handler from running out of buffer."
+msgstr "Truncate error logs to prevent log handler from running out of buffer."
+
+msgid ""
+"Turned off thread-logging when monkey-patching with eventlet. This addresses "
+"a potential hang in the proxy-server while logging client disconnects."
+msgstr ""
+"Turned off thread-logging when monkey-patching with eventlet. This addresses "
+"a potential hang in the proxy-server while logging client disconnects."
+
+msgid ""
+"Ubuntu 18.04 and RDO's CentOS 7 repos package liberasurecode 1.5.0, while "
+"Ubuntu 20.04 and RDO's CentOS 8 repos currently package liberasurecode 1.6.0 "
+"or 1.6.1. Take care when upgrading major distro versions!"
+msgstr ""
+"Ubuntu 18.04 and RDO's CentOS 7 repos package liberasurecode 1.5.0, while "
+"Ubuntu 20.04 and RDO's CentOS 8 repos currently package liberasurecode 1.6.0 "
+"or 1.6.1. Take care when upgrading major distro versions!"
+
+msgid "Unsigned payloads work with v4 signatures once more."
+msgstr "Unsigned payloads work with v4 signatures once more."
+
+msgid ""
+"Update dnspython dependency to 1.14, removing the need to have separate "
+"dnspython dependencies for Py2 and Py3."
+msgstr ""
+"Update dnspython dependency to 1.14, removing the need to have separate "
+"dnspython dependencies for Py2 and Py3."
+
+msgid "Updated docs to reference appropriate ports."
+msgstr "Updated docs to reference appropriate ports."
+
+msgid "Updated requirements.txt to match global exclusions and formatting."
+msgstr "Updated requirements.txt to match global exclusions and formatting."
+
+msgid "Updated the PyECLib dependency to 1.3.1."
+msgstr "Updated the PyECLib dependency to 1.3.1."
+
+msgid ""
+"Updated the `hashes.pkl` file format to include timestamp information for "
+"race detection. Also simplified hashing logic to prevent race conditions and "
+"optimize for the common case."
+msgstr ""
+"Updated the `hashes.pkl` file format to include timestamp information for "
+"race detection. Also simplified hashing logic to prevent race conditions and "
+"optimise for the common case."
+
+msgid ""
+"Upgrade Impact: If you upgrade and roll back, you must delete all `hashes."
+"pkl` files."
+msgstr ""
+"Upgrade Impact: If you upgrade and roll back, you must delete all `hashes."
+"pkl` files."
+
+msgid "Upgrade Notes"
+msgstr "Upgrade Notes"
+
+msgid ""
+"Upgrade impact -- during a rolling upgrade, an updated proxy server may "
+"write a manifest that an out-of-date proxy server will not be able to read. "
+"This will resolve itself once the upgrade completes on all nodes."
+msgstr ""
+"Upgrade impact -- during a rolling upgrade, an updated proxy server may "
+"write a manifest that an out-of-date proxy server will not be able to read. "
+"This will resolve itself once the upgrade completes on all nodes."
+
+msgid ""
+"Upgrade liberasurecode on all object servers. Object servers can now read "
+"both old and new fragments."
+msgstr ""
+"Upgrade liberasurecode on all object servers. Object servers can now read "
+"both old and new fragments."
+
+msgid ""
+"Upgrade liberasurecode on all proxy servers. Newly-written data will now use "
+"new fragments. Note that not-yet-upgraded proxies will not be able to read "
+"these newly-written fragments but will instead respond ``500 Internal Server "
+"Error``."
+msgstr ""
+"Upgrade liberasurecode on all proxy servers. Newly-written data will now use "
+"new fragments. Note that not-yet-upgraded proxies will not be able to read "
+"these newly-written fragments but will instead respond ``500 Internal Server "
+"Error``."
+
+msgid ""
+"User metadata is now exposed via CORS when encryption is enabled, matching "
+"the behavior when encryption is not enabled."
+msgstr ""
+"User metadata is now exposed via CORS when encryption is enabled, matching "
+"the behaviour when encryption is not enabled."
+
+msgid "Ussuri Series Release Notes"
+msgstr "Ussuri Series Release Notes"
+
+msgid "Various logging and metrics improvements when talking to memcache."
+msgstr "Various logging and metrics improvements when talking to memcache."
+
+msgid "Various other minor bug fixes and improvements."
+msgstr "Various other minor bug fixes and improvements."
+
+msgid "Various other minor bug fixes."
+msgstr "Various other minor bug fixes."
+
+msgid "Victoria Series Release Notes"
+msgstr "Victoria Series Release Notes"
+
+msgid ""
+"WARNING: If you are using the ISA-L library for erasure codes, please "
+"upgrade to liberasurecode 1.3.1 (or later) as soon as possible. If you are "
+"using isa_l_rs_vand with more than 4 parity, please read https://bugs."
+"launchpad.net/swift/+bug/1639691 and take necessary action."
+msgstr ""
+"WARNING: If you are using the ISA-L library for erasure codes, please "
+"upgrade to liberasurecode 1.3.1 (or later) as soon as possible. If you are "
+"using isa_l_rs_vand with more than 4 parity, please read https://bugs."
+"launchpad.net/swift/+bug/1639691 and take necessary action."
+
+msgid "WSGI server processes can now notify systemd when they are ready."
+msgstr "WSGI server processes can now notify systemd when they are ready."
+
+msgid "Wallaby Series Release Notes"
+msgstr "Wallaby Series Release Notes"
+
+msgid "Warn when relinking/cleaning up and any disks are unmounted."
+msgstr "Warn when relinking/cleaning up and any disks are unmounted."
+
+msgid ""
+"We do not yet have CLI tools for creating composite rings, but the "
+"functionality has been enabled in the ring modules to support this advanced "
+"functionality. CLI tools will be delivered in a subsequent release."
+msgstr ""
+"We do not yet have CLI tools for creating composite rings, but the "
+"functionality has been enabled in the ring modules to support this advanced "
+"functionality. CLI tools will be delivered in a subsequent release."
+
+msgid ""
+"When building a listing from shards, any failure to retrieve listings will "
+"result in a 503 response. Previously, failures fetching a partiucular shard "
+"would result in a gap in listings."
+msgstr ""
+"When building a listing from shards, any failure to retrieve listings will "
+"result in a 503 response. Previously, failures fetching a particular shard "
+"would result in a gap in listings."
+
+msgid ""
+"When listing objects in a container in json format, static large objects "
+"(SLOs) will now include an additional new \"slo_etag\" key that matches the "
+"etag returned when requesting the SLO. The existing \"hash\" key remains "
+"unchanged as the MD5 of the SLO manifest. Text and XML listings are "
+"unaffected by this change."
+msgstr ""
+"When listing objects in a container in json format, static large objects "
+"(SLOs) will now include an additional new \"slo_etag\" key that matches the "
+"etag returned when requesting the SLO. The existing \"hash\" key remains "
+"unchanged as the MD5 of the SLO manifest. Text and XML listings are "
+"unaffected by this change."
+
+msgid ""
+"When looking for the active root secret, only the right-most keymaster is "
+"used."
+msgstr ""
+"When looking for the active root secret, only the right-most keymaster is "
+"used."
+
+msgid ""
+"When making backend requests, the ``proxy-server`` now ensures query "
+"parameters are always properly quoted. Previously, the proxy would encounter "
+"an error on Python 2.7.17 if the client included non-ASCII query parameters "
+"in object requests. This was previously fixed in 2.23.0."
+msgstr ""
+"When making backend requests, the ``proxy-server`` now ensures query "
+"parameters are always properly quoted. Previously, the proxy would encounter "
+"an error on Python 2.7.17 if the client included non-ASCII query parameters "
+"in object requests. This was previously fixed in 2.23.0."
+
+msgid ""
+"When object path is not a directory, just quarantine it, rather than the "
+"whole suffix."
+msgstr ""
+"When object path is not a directory, just quarantine it, rather than the "
+"whole suffix."
+
+msgid ""
+"When refetching Static Large Object manifests, non-manifest responses are "
+"now handled better."
+msgstr ""
+"When refetching Static Large Object manifests, non-manifest responses are "
+"now handled better."
+
+msgid ""
+"When requesting objects, return 404 if a tombstone is found and is newer "
+"than any data found. Previous behavior was to return stale data."
+msgstr ""
+"When requesting objects, return 404 if a tombstone is found and is newer "
+"than any data found. Previous behaviour was to return stale data."
+
+msgid ""
+"When running with ``s3_acl`` disabled, ``bucket-owner-full-control`` and "
+"``bucket-owner-read`` canned ACLs will be translated to the same Swift ACLs "
+"as ``private``."
+msgstr ""
+"When running with ``s3_acl`` disabled, ``bucket-owner-full-control`` and "
+"``bucket-owner-read`` canned ACLs will be translated to the same Swift ACLs "
+"as ``private``."
+
+msgid ""
+"When the object auditor examines an object, it will now add any missing "
+"metadata checksums."
+msgstr ""
+"When the object auditor examines an object, it will now add any missing "
+"metadata checksums."
+
+msgid ""
+"With heartbeating turned on, the proxy will start its response immediately "
+"with 202 Accepted then send a single whitespace character periodically until "
+"the request completes. At that point, a final summary chunk will be sent "
+"which includes a \"Response Status\" key indicating success or failure and "
+"(if successful) an \"Etag\" key indicating the Etag of the resulting SLO."
+msgstr ""
+"With heartbeating turned on, the proxy will start its response immediately "
+"with 202 Accepted then send a single whitespace character periodically until "
+"the request completes. At that point, a final summary chunk will be sent "
+"which includes a \"Response Status\" key indicating success or failure and "
+"(if successful) an \"Etag\" key indicating the Etag of the resulting SLO."
+
+msgid ""
+"Worker process logs will have a bit of information prepended so operators "
+"can tell which messages came from which worker. The prefix is \"[worker M/N "
+"pid=P] \", where M is the worker's index, N is the total number of workers, "
+"and P is the process ID. Every message from the replicator's logger will "
+"have the prefix"
+msgstr ""
+"Worker process logs will have a bit of information prepended so operators "
+"can tell which messages came from which worker. The prefix is \"[worker M/N "
+"pid=P] \", where M is the worker's index, N is the total number of workers, "
+"and P is the process ID. Every message from the replicator's logger will "
+"have the prefix"
+
+msgid "Write-affinity aware object deletion"
+msgstr "Write-affinity aware object deletion"
+
+msgid ""
+"X-Delete-At computation now uses X-Timestamp instead of system time. This "
+"prevents clock skew causing inconsistent expiry data."
+msgstr ""
+"X-Delete-At computation now uses X-Timestamp instead of system time. This "
+"prevents clock skew causing inconsistent expiry data."
+
+msgid "Xena Series Release Notes"
+msgstr "Xena Series Release Notes"
+
+msgid "Yoga Series Release Notes"
+msgstr "Yoga Series Release Notes"
+
+msgid "Zed Series Release Notes"
+msgstr "Zed Series Release Notes"
+
+msgid "``Content-Type`` can now be updated when copying an object."
+msgstr "``Content-Type`` can now be updated when copying an object."
+
+msgid ""
+"``EIO`` errors during read now cause object diskfiles to be quarantined."
+msgstr ""
+"``EIO`` errors during read now cause object diskfiles to be quarantined."
+
+msgid ""
+"``ENOENT`` and ``ENODATA`` errors are better handled in the object "
+"replicator and auditor."
+msgstr ""
+"``ENOENT`` and ``ENODATA`` errors are better handled in the object "
+"replicator and auditor."
+
+msgid "``EPIPE`` errors no longer log tracebacks."
+msgstr "``EPIPE`` errors no longer log tracebacks."
+
+msgid ""
+"``LastModified`` timestamps in listings are now rounded up to whole seconds, "
+"like they are in responses from AWS."
+msgstr ""
+"``LastModified`` timestamps in listings are now rounded up to whole seconds "
+"like they are in responses from AWS."
+
+msgid "``fallocate_reserve`` may be specified as a percentage in more places."
+msgstr "``fallocate_reserve`` may be specified as a percentage in more places."
+
+msgid ""
+"``s3token`` no longer mangles request paths that include the Access Key ID."
+msgstr ""
+"``s3token`` no longer mangles request paths that include the Access Key ID."
+
+msgid "``swift-account-audit``"
+msgstr "``swift-account-audit``"
+
+msgid ""
+"``swift-account-info`` and ``swift-container-info`` now accept a ``--sync`` "
+"flag to show information from the incoming/outgoing sync tables."
+msgstr ""
+"``swift-account-info`` and ``swift-container-info`` now accept a ``--sync`` "
+"flag to show information from the incoming/outgoing sync tables."
+
+msgid ""
+"``swift-container-info`` now summarizes shard range information. Pass ``-v``/"
+"``--verbose`` if you want to see all of them."
+msgstr ""
+"``swift-container-info`` now summarizes shard range information. Pass ``-v``/"
+"``--verbose`` if you want to see all of them."
+
+msgid "``swift-dispersion-populate``"
+msgstr "``swift-dispersion-populate``"
+
+msgid "``swift-drive-recon``"
+msgstr "``swift-drive-recon``"
+
+msgid ""
+"``swift-manage-shard-ranges`` can now accept a config file; this may be used "
+"to ensure consistency of threshold values with the container-sharder config."
+msgstr ""
+"``swift-manage-shard-ranges`` can now accept a config file; this may be used "
+"to ensure consistency of threshold values with the container-sharder config."
+
+msgid "``swift-manage-shard-ranges`` improvements:"
+msgstr "``swift-manage-shard-ranges`` improvements:"
+
+msgid ""
+"``swift-recon-cron`` now includes the last time it was run in the recon "
+"information."
+msgstr ""
+"``swift-recon-cron`` now includes the last time it was run in the recon "
+"information."
+
+msgid "``swift-recon``"
+msgstr "``swift-recon``"
+
+msgid "``swift-ring-builder`` improvements"
+msgstr "``swift-ring-builder`` improvements"
+
+msgid ""
+"``swift_source`` is set for more sub-requests in the proxy-server. See `the "
+"documentation `__."
+msgstr ""
+"``swift_source`` is set for more sub-requests in the proxy-server. See `the "
+"documentation `__."
+
+msgid "and you want to take advantage of faster updates, then do this::"
+msgstr "and you want to take advantage of faster updates, then do this::"
+
+msgid ""
+"cname_lookup middleware now accepts a ``nameservers`` config variable that, "
+"if defined, will be used for DNS lookups instead of the system default."
+msgstr ""
+"cname_lookup middleware now accepts a ``nameservers`` config variable that, "
+"if defined, will be used for DNS lookups instead of the system default."
+
+msgid "domain_remap now accepts a list of domains in \"storage_domain\"."
+msgstr "domain_remap now accepts a list of domains in \"storage_domain\"."
+
+msgid "formpost can now accept a content-encoding parameter."
+msgstr "formpost can now accept a content-encoding parameter."
+
+msgid "name_check and cname_lookup keys have been added to `/info`."
+msgstr "name_check and cname_lookup keys have been added to `/info`."
+
+msgid ""
+"s3api now mimics some forms of AWS server-side encryption based on whether "
+"Swift's at-rest encryption functionality is enabled. Note that S3 API users "
+"are now able to know more about how the cluster is configured than they were "
+"previously, ie knowledge of encryption at-rest functionality being enabled "
+"or not."
+msgstr ""
+"s3api now mimics some forms of AWS server-side encryption based on whether "
+"Swift's at-rest encryption functionality is enabled. Note that S3 API users "
+"are now able to know more about how the cluster is configured than they were "
+"previously, i.e. knowledge of encryption at-rest functionality being enabled "
+"or not."
+
+msgid ""
+"s3api now mimics the AWS S3 behavior of periodically sending whitespace "
+"characters on a Complete Multipart Upload request to keep the connection "
+"from timing out. Note that since a request could fail after the initial 200 "
+"OK response has been sent, it is important to check the response body to "
+"determine if the request succeeded."
+msgstr ""
+"s3api now mimics the AWS S3 behaviour of periodically sending whitespace "
+"characters on a Complete Multipart Upload request to keep the connection "
+"from timing out. Note that since a request could fail after the initial 200 "
+"OK response has been sent, it is important to check the response body to "
+"determine if the request succeeded."
+
+msgid ""
+"s3api now properly handles ``x-amz-metadata-directive`` headers on COPY "
+"operations."
+msgstr ""
+"s3api now properly handles ``x-amz-metadata-directive`` headers on COPY "
+"operations."
+
+msgid ""
+"s3api now uses concurrency (default 2) to handle multi-delete requests. This "
+"allows multi-delete requests to be processed much more quickly."
+msgstr ""
+"s3api now uses concurrency (default 2) to handle multi-delete requests. This "
+"allows multi-delete requests to be processed much more quickly."
+
+msgid "s3api responses now include a '-' in multipart ETags."
+msgstr "s3api responses now include a '-' in multipart ETags."
+
+msgid ""
+"statsd error messages correspond to 5xx responses only. This makes "
+"monitoring more useful because actual errors (5xx) will not be hidden by "
+"common user requests (4xx). Previously, some 4xx responses would be included "
+"in timing information in the statsd error messages."
+msgstr ""
+"statsd error messages correspond to 5xx responses only. This makes "
+"monitoring more useful because actual errors (5xx) will not be hidden by "
+"common user requests (4xx). Previously, some 4xx responses would be included "
+"in timing information in the statsd error messages."
+
+msgid "swift-recon now respects storage policy aliases."
+msgstr "swift-recon now respects storage policy aliases."
+
+msgid "tempauth user names now support unicode characters."
+msgstr "tempauth user names now support Unicode characters."
diff --git a/releasenotes/source/locale/ja/LC_MESSAGES/releasenotes.po b/releasenotes/source/locale/ja/LC_MESSAGES/releasenotes.po
new file mode 100644
index 0000000000..2a0836d53e
--- /dev/null
+++ b/releasenotes/source/locale/ja/LC_MESSAGES/releasenotes.po
@@ -0,0 +1,1351 @@
+# Shu Muto , 2017. #zanata
+# Shu Muto , 2018. #zanata
+msgid ""
+msgstr ""
+"Project-Id-Version: Swift Release Notes\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2018-02-28 19:39+0000\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"PO-Revision-Date: 2018-02-08 07:28+0000\n"
+"Last-Translator: Shu Muto \n"
+"Language-Team: Japanese\n"
+"Language: ja\n"
+"X-Generator: Zanata 4.3.3\n"
+"Plural-Forms: nplurals=1; plural=0\n"
+
+msgid "2.10.0"
+msgstr "2.10.0"
+
+msgid "2.10.1"
+msgstr "2.10.1"
+
+msgid "2.10.2"
+msgstr "2.10.2"
+
+msgid "2.11.0"
+msgstr "2.11.0"
+
+msgid "2.12.0"
+msgstr "2.12.0"
+
+msgid "2.13.0"
+msgstr "2.13.0"
+
+msgid "2.13.1"
+msgstr "2.13.1"
+
+msgid "2.14.0"
+msgstr "2.14.0"
+
+msgid "2.15.0"
+msgstr "2.15.0"
+
+msgid "2.15.1"
+msgstr "2.15.1"
+
+msgid "2.16.0"
+msgstr "2.16.0"
+
+msgid "2.17.0"
+msgstr "2.17.0"
+
+msgid ""
+"A PUT or POST to a container will now update the container's Last-Modified "
+"time, and that value will be included in a GET/HEAD response."
+msgstr ""
+"コンテナーへの PUT または POST は、コンテナーの最終更新時刻を更新し、その値"
+"は GET/HEAD レスポンスに含まれます。"
+
+msgid ""
+"A composite ring comprises two or more component rings that are combined to "
+"form a single ring with a replica count equal to the sum of the component "
+"rings. The component rings are built independently, using distinct devices "
+"in distinct regions, which means that the dispersion of replicas between the "
+"components can be guaranteed."
+msgstr ""
+"複合リングは、コンポーネントリングの合計に等しい複製数を有して形成する単一リ"
+"ングが結合されたコンポーネントリングを2つ以上含みます。 コンポーネントリング"
+"は、別個の領域に別個のデバイスを使用して独立して構築されているため、コンポー"
+"ネント間のレプリカの分散を保証できます。"
+
+msgid ""
+"Accept a trade off of dispersion for balance in the ring builder that will "
+"result in getting to balanced rings much more quickly in some cases."
+msgstr ""
+"リングビルダーのバランスのために、分散のトレードオフを受け入れ、場合によって"
+"はバランスされたリングにより早く到達します。"
+
+msgid ""
+"Account and container databases will now be quarantined if the database "
+"schema has been corrupted."
+msgstr ""
+"データベーススキーマが壊れていると、アカウントとコンテナーのデータベースが隔"
+"離されるようになりました。"
+
+msgid ""
+"Account and container replication stats logs now include ``remote_merges``, "
+"the number of times a whole database was sent to another node."
+msgstr ""
+"アカウントとコンテナー複製の統計ログに、データベース全体が別のノードに送信さ"
+"れた回数、``remote_merges`` が追加されました。"
+
+msgid "Add Composite Ring Functionality"
+msgstr "複合リング機能を追加しました。"
+
+msgid "Add Vary headers for CORS responses."
+msgstr "CORS 応答用の Vary ヘッダーを追加しました。"
+
+msgid "Add checksum to object extended attributes."
+msgstr "オブジェクトの拡張属性にチェックサムを追加します。"
+
+msgid ""
+"Add support to increase object ring partition power transparently to end "
+"users and with no cluster downtime. Increasing the ring part power allows "
+"for incremental adjustment to the upper bound of the cluster size. Please "
+"review the `full docs `__ for more information."
+msgstr ""
+"エンドユーザーにオブジェクトのリング・パーティション・パワーを透過的に増加さ"
+"せるためのクラスタのダウンタイムが発生しないサポートを追加しました。リングの"
+"部分力を増加させることにより、クラスタサイズの上限に増分調整が可能になりま"
+"す。詳細は `フルドキュメント\n"
+" `__ を参照して"
+"ください。"
+
+msgid ""
+"Added ``--swift-versions`` to ``swift-recon`` CLI to compare installed "
+"versions in the cluster."
+msgstr ""
+"クラスターにインストールされているバージョンを比較するために、``swift-"
+"recon`` CLI に ``--swift-versions`` を追加しました。"
+
+msgid ""
+"Added a \"user\" option to the drive-audit config file. Its value is used to "
+"set the owner of the drive-audit recon cache."
+msgstr ""
+"ドライブ監査設定ファイルに \"user\" オプションを追加しました。その値は、ドラ"
+"イブ監査の調整キャッシュの所有者を設定するために使用されます。"
+
+msgid ""
+"Added a configurable URL base to staticweb, fixing issues when the "
+"accessible endpoint isn't known to the Swift cluster (eg http vs https)."
+msgstr ""
+"静的ウェブに対する設定可能な URL ベースを追加し、アクセス可能なエンドポイント"
+"が Swiftクラスタに知らされていない場合の問題を修正しました(例えば、httpと"
+"https)。"
+
+msgid "Added a configurable URL base to staticweb."
+msgstr "静的ウェブに対する設定可能な URL ベースを追加しました。"
+
+msgid "Added container/object listing with prefix to InternalClient."
+msgstr ""
+"InternalClient のコンテナー/オブジェクトの一覧作成で接頭辞を指定できるように"
+"なりました。"
+
+msgid "Added support for inline data segments in SLO manifests."
+msgstr "SLO マニフェストにおけるインラインデータセグメントをサポートしました。"
+
+msgid ""
+"Added support for per-policy proxy config options. This allows per-policy "
+"affinity options to be set for use with duplicated EC policies and composite "
+"rings. Certain options found in per-policy conf sections will override their "
+"equivalents that may be set in the [app:proxy-server] section. Currently the "
+"options handled that way are ``sorting_method``, ``read_affinity``, "
+"``write_affinity``, ``write_affinity_node_count``, and "
+"``write_affinity_handoff_delete_count``."
+msgstr ""
+"ポリシーごとのプロキシー設定オプションのサポートが追加されました。これによ"
+"り、ポリシーごとのアフィニティオプションを、複製された EC ポリシーおよび複合"
+"リングで使用するように設定できます。ポリシーごとの conf セクションにある特定"
+"のオプションは、 [app:proxy-server] セクションで設定できる同等のものよりも優"
+"先されます。現在、このように処理されるオプションは ``sorting_method``、 "
+"``read_affinity``、 ``write_affinity``、 ``write_affinity_node_count``、 "
+"``write_affinity_handoff_delete_count`` です。"
+
+msgid ""
+"Added support for retrieving the encryption root secret from an external key "
+"management system. In practice, this is currently limited to Barbican."
+msgstr ""
+"外部鍵管理システムからの暗号化ルートシークレットの取得をサポートしました。現"
+"在 Barbican に限定されています。"
+
+msgid "Added symlink objects support."
+msgstr "シンボリックリンクオブジェクトをサポートしました。"
+
+msgid ""
+"All 416 responses will now include a Content-Range header with an "
+"unsatisfied-range value. This allows the caller to know the valid range "
+"request value for an object."
+msgstr ""
+"416 のすべてのレスポンスには、範囲の値を持つ Content-Range ヘッダーが含まれる"
+"ようになりました。 これにより、呼び出し元はオブジェクトの有効範囲要求値を知る"
+"ことができます。"
+
+msgid "Allow the expirer to gracefully move past updating stale work items."
+msgstr "expirer が安全に古い作業項目を移動できるようになりました。"
+
+msgid "Always set Swift processes to use UTC."
+msgstr "Swift プロセスがいつも UTC を使うように設定しました。"
+
+msgid "Bug Fixes"
+msgstr "バグ修正"
+
+msgid "Cache all answers from nameservers in cname_lookup."
+msgstr "cname_lookup でネームサーバーからのすべての応答をキャッシュします。"
+
+msgid ""
+"Changed where liberasurecode-devel for CentOS 7 is referenced and installed "
+"as a dependency."
+msgstr ""
+"CentOS 7 での、liberasurecode-devel が参照、インストールされる場所を変更しま"
+"した。"
+
+msgid "Cleaned up logged tracebacks when talking to memcached servers."
+msgstr ""
+"memcached サーバーと通信するときのトレースバックログをクリーンアップしまし"
+"た。"
+
+msgid ""
+"Closed a bug where ssync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalizing the write. Also, erasure coded "
+"fragment metadata will now be validated on read requests and, if bad data is "
+"found, the fragment will be quarantined."
+msgstr ""
+"いくつかの状況で ssync が不正なフラグメントデータを書き込むバグをクローズしま"
+"した。書き込みを終了する前に、正しいバイト数がフラグメントに書き込まれている"
+"ことを確認するためのチェックが追加されました。また、消去コード化されたフラグ"
+"メントメタデータが読み取り要求で検証され、不良データが見つかると、そのフラグ"
+"メントが隔離されます。"
+
+msgid ""
+"Closed a bug where ssync may have written bad fragment data in some "
+"circumstances. A check was added to ensure the correct number of bytes is "
+"written for a fragment before finalizing the write. Also, erasure coded "
+"fragment metadata will now be validated when read and, if bad data is found, "
+"the fragment will be quarantined."
+msgstr ""
+"いくつかの状況で ssync が不正なフラグメントデータを書き込むバグをクローズしま"
+"した。書き込みを終了する前に、正しいバイト数がフラグメントに書き込まれている"
+"ことを確認するためのチェックが追加されました。また、消去コード化されたフラグ"
+"メントメタデータが読み取り要求で検証され、不良データが見つかると、そのフラグ"
+"メントが隔離されます。"
+
+msgid ""
+"Composite rings can be used for explicit replica placement and \"replicated "
+"EC\" for global erasure codes policies."
+msgstr ""
+"複合リングは、明示的なレプリカの配置と、グローバル消去コードポリシーのための"
+"「複製された EC」に使用できます。"
+
+msgid ""
+"Composite rings support 'cooperative' rebalance which means that during "
+"rebalance all component rings will be consulted before a partition is moved "
+"in any component ring. This avoids the same partition being simultaneously "
+"moved in multiple components."
+msgstr ""
+"複合リングは「協調的」リバランスをサポートしています。つまり、リバランス時"
+"に、コンポーネントリング内でパーティションを移動する前に、すべてのコンポーネ"
+"ントリングに諮られます。 これにより、複数のコンポーネントで同じパーティション"
+"を同時に移動されることがなくなります。"
+
+msgid ""
+"Container sync can now copy SLOs more efficiently by allowing the manifest "
+"to be synced before all of the referenced segments. This fixes a bug where "
+"container sync would not copy SLO manifests."
+msgstr ""
+"コンテナーシンクでは、マニフェストをすべての参照されるセグメントの前に同期さ"
+"せることで、SLO をより効率的にコピーできます。 これにより、コンテナーの同期"
+"が SLO マニフェストをコピーしないバグが修正されました。"
+
+msgid "Correctly handle deleted files with if-none-match requests."
+msgstr "if-none-match 要求で削除されたファイルを正しく処理します。"
+
+msgid ""
+"Correctly send 412 Precondition Failed if a user sends an invalid copy "
+"destination. Previously Swift would send a 500 Internal Server Error."
+msgstr ""
+"ユーザーが無効なコピー先を送信した場合は、 412 Precondition Failed を正しく送"
+"信します。以前は、Swift は 500 の内部サーバーエラーを送信しました。"
+
+msgid "Critical Issues"
+msgstr "致命的な問題"
+
+msgid "Current (Unreleased) Release Notes"
+msgstr "開発中バージョンのリリースノート"
+
+msgid ""
+"Currently the default is still only one process, and no workers. Set "
+"``reconstructor_workers`` in the ``[object-reconstructor]`` section to some "
+"whole number <= the number of devices on a node to get that many "
+"reconstructor workers."
+msgstr ""
+"現在のところ、デフォルトはまだ1つのプロセスしかなく、ワーカーはいません。多"
+"くの再構成ワーカーを得るためには、 ``[object-reconstructor]`` セクションの "
+"``reconstructor_workers`` をいくつかの合計数( <= ノード上にあるデバイスの"
+"数)を設定してください。"
+
+msgid "Daemons using InternalClient can now be properly killed with SIGTERM."
+msgstr ""
+"InternalClient を使用するデーモンは、 SIGTERM を使用して適切に停止できます。"
+
+msgid ""
+"Deleting an expiring object will now cause less work in the system. The "
+"number of async pending files written has been reduced for all objects and "
+"greatly reduced for erasure-coded objects. This dramatically reduces the "
+"burden on container servers."
+msgstr ""
+"期限切れオブジェクトの削除は、システムでの作業を削減します。非同期で保留され"
+"ているファイルの数は、すべてのオブジェクトで削減され、消去コード付きオブジェ"
+"クトでは大幅に削減されます。これにより、コンテナーサーバーの負担が劇的に軽減"
+"しました。"
+
+msgid ""
+"Deprecate swift-temp-url and call python-swiftclient's implementation "
+"instead. This adds python-swiftclient as an optional dependency of Swift."
+msgstr ""
+"swift-temp-url を非推奨にし、代わりに python-swiftclient の実装を呼び出してく"
+"ださい。これにより、python-swiftclient が Swift のオプションの依存関係として"
+"追加されます。"
+
+msgid "Deprecation Notes"
+msgstr "廃止予定の機能"
+
+msgid "Disallow X-Delete-At header values equal to the X-Timestamp header."
+msgstr ""
+"X-Delete-At ヘッダーの値が X-Timestamp ヘッダーと等しいことを禁止します。"
+
+msgid "Display more info on empty rings."
+msgstr "空のリングに詳細情報を表示します。"
+
+msgid "Do not follow CNAME when host is in storage_domain."
+msgstr "ホストが storage_domain にある場合、CNAME に従わないようにしました。"
+
+msgid "Drop support for auth-server from common/manager.py and `swift-init`."
+msgstr ""
+"common/manager.pyと `swift-init` から auth-server のサポートを削除しました。"
+
+msgid "EC Fragment Duplication - Foundational Global EC Cluster Support."
+msgstr ""
+"EC フラグメント複製 - 基盤的なグローバル EC クラスタをサポートしました。"
+
+msgid ""
+"Enable cluster-wide CORS Expose-Headers setting via \"cors_expose_headers\"."
+msgstr ""
+"\"cors_expose_headers\" でクラスタ全体の CORS Expose-Headers 設定を有効にしま"
+"す。"
+
+msgid "Enabled versioned writes on Dynamic Large Objects (DLOs)."
+msgstr ""
+"ダイナミックラージオブジェクト(DLO)でのバージョン管理された書き込みを有効に"
+"しました。"
+
+msgid ""
+"Ensure update of the container by object-updater, removing a rare "
+"possibility that objects would never be added to a container listing."
+msgstr ""
+"オブジェクトがコンテナーリスティングに追加されるない、まれな可能性を排除し、"
+"オブジェクトアップデータによるコンテナーの更新を確実にしました。"
+
+msgid ""
+"Erasure code GET performance has been significantly improved in clusters "
+"that are not completely healthy."
+msgstr ""
+"完全に健全でないクラスターにおける、消去コードの GET 性能が大幅に向上しまし"
+"た。"
+
+msgid ""
+"Erasure code reconstruction handles moving data from handoff nodes better. "
+"Instead of moving the data to another handoff, it waits until it can be "
+"moved to a primary node."
+msgstr ""
+"消失コード再構成は、ハンドオフノードからの移動データをより良く処理します。 "
+"データを別のハンドオフに移動する代わりに、プライマリーノードに移動できるよう"
+"になるまで待機します。"
+
+msgid ""
+"Erasure-coded storage policies using ``isa_l_rs_vand`` and ``nparity`` >= 5 "
+"must be configured as deprecated, preventing any new containers from being "
+"created with such a policy. This configuration is known to harm data "
+"durability. Any data in such policies should be migrated to a new policy. "
+"See See `Launchpad bug 1639691 `__ for more information."
+msgstr ""
+"``isa_l_rs_vand`` と ``nparity`` >= 5 を使った消去コード化ストレージポリシー"
+"は廃止予定にする必要があり、このようなポリシーで新しいコンテナーが作成されな"
+"いようにする必要があります。この設定は、データ耐久性に害を与えることが知られ"
+"ています。そのようなポリシー内のデータは、新しいポリシーに移行する必要があり"
+"ます。詳細は、 `Launchpad bug 1639691 `__ を参照してください。"
+
+msgid ""
+"Fixed UnicodeDecodeError in the object reconstructor that would prevent "
+"objects with non-ascii names from being reconstructed and caused the "
+"reconstructor process to hang."
+msgstr ""
+"非 ASCII 名のオブジェクトが再構築されず、再構築プロセスがハングアップする原因"
+"となるオブジェクト再構成の UnicodeDecodeError が修正されました。"
+
+msgid ""
+"Fixed XML responses (eg on bulk extractions and SLO upload failures) to be "
+"more correct. The enclosing \"delete\" tag was removed where it doesn't make "
+"sense and replaced with \"extract\" or \"upload\" depending on the context."
+msgstr ""
+"XML レスポンス(一括抽出や SLO アップロードの失敗など)がより正確になりまし"
+"た。意味のない \"delete\" の閉じタグは削除され、コンテキストに応じた "
+"\"extract\" あるいは \"upload\" に置き換えられました。"
+
+msgid "Fixed a bug in domain_remap when obj starts/ends with slash."
+msgstr ""
+"オブジェクトがスラッシュで開始/終了するときの domain_remap のバグを修正しまし"
+"た。"
+
+msgid ""
+"Fixed a bug in the EC reconstructor where an unsuccessful sync would cause "
+"extra disk I/O load on the remote server. Now the extra checking work is "
+"only requested if the sync request was successful."
+msgstr ""
+"失敗した同期がリモートサーバー上で余分なディスク I/O 負荷を引き起こす EC 再構"
+"成のバグを修正しました。同期要求が成功した場合にのみ、追加のチェック作業が要"
+"求されるようになりました。"
+
+msgid ""
+"Fixed a bug introduced in 2.15.0 where the object reconstructor would exit "
+"with a traceback if no EC policy was configured."
+msgstr ""
+"2.15.0 で導入されたバグを修正しました。 EC ポリシーが設定されていない場合は、"
+"オブジェクト再構成ツールがトレースバックで終了します。"
+
+msgid "Fixed a bug where SSYNC would fail to replicate unexpired object."
+msgstr "SSYNC が期限切れのオブジェクトを複製できないバグを修正しました。"
+
+msgid ""
+"Fixed a bug where a container listing delimiter wouldn't work with "
+"encryption."
+msgstr "コンテナーのリスト区切り文字が暗号化で機能しないバグを修正しました。"
+
+msgid ""
+"Fixed a bug where an SLO download with a range request may have resulted in "
+"a 5xx series response."
+msgstr ""
+"範囲リクエストで SLO をダウンロードした結果、 5xx シリーズの応答が発生する可"
+"能性があるバグを修正しました。"
+
+msgid ""
+"Fixed a bug where some headers weren't being copied correctly in a COPY "
+"request."
+msgstr ""
+"一部のヘッダーが COPY リクエストで正しくコピーされていなかったバグを修正しま"
+"した。"
+
+msgid "Fixed a bug where some tombstone files might never be reclaimed."
+msgstr ""
+"いくつかの廃棄済みオブジェクト (tombstone) ファイルが再利用されないかもしれな"
+"いバグを修正しました。"
+
+msgid ""
+"Fixed a bug where the ring builder would not allow removal of a device when "
+"min_part_seconds_left was greater than zero."
+msgstr ""
+"min_part_seconds_left が 0 より大きい場合、リングビルダーがデバイスの削除を許"
+"可しないバグを修正しました。"
+
+msgid "Fixed a few areas where the ``swiftdir`` option was not respected."
+msgstr ""
+"``swiftdir`` オプションが尊重されなかったいくつかの領域を修正しました。"
+
+msgid ""
+"Fixed a race condition in updating hashes.pkl where a partition suffix "
+"invalidation may have been skipped."
+msgstr ""
+"パーティションサフィックスの無効化がスキップされた可能性のある hashes.pkl の"
+"更新時の競合状態を修正しました。"
+
+msgid "Fixed a rare infinite loop in `swift-ring-builder` while placing parts."
+msgstr ""
+"パーツを置いている間の`swift-ring-builder` のまれな無限ループを修正しました。"
+
+msgid ""
+"Fixed a rare issue where multiple backend timeouts could result in bad data "
+"being returned to the client."
+msgstr ""
+"複数のバックエンドのタイムアウトが原因で、クライアントに不正なデータが返され"
+"るという稀な問題を修正しました。"
+
+msgid "Fixed a socket leak in copy middleware when a large object was copied."
+msgstr ""
+"ラージオブジェクトをコピーしたときの copy ミドルウェアのソケットリークを修正"
+"しました。"
+
+msgid ""
+"Fixed an issue where background consistency daemon child processes would "
+"deadlock waiting on the same file descriptor."
+msgstr ""
+"バックグラウンド一貫性デーモンの子プロセスが同じファイル記述子を待ってデッド"
+"ロックする問題を修正しました。"
+
+msgid "Fixed deadlock when logging from a tpool thread."
+msgstr "tpool スレッドからのロギング時のデッドロックを修正しました。"
+
+msgid ""
+"Fixed encoding issue in ssync where a mix of ascii and non-ascii metadata "
+"values would cause an error."
+msgstr ""
+"ASCII メタデータ値と非 ASCII メタデータ値が混在するとエラーが発生する、 "
+"ssync のエンコードの問題を修正しました。"
+
+msgid ""
+"Fixed error where a container drive error resulted in double space usage on "
+"rest drives. When drive with container or account database is unmounted, the "
+"bug would create handoff replicas on all remaining drives, increasing the "
+"drive space used and filling the cluster."
+msgstr ""
+"コンテナードライブのエラーにより、残りのドライブに二重のスペースが使用される"
+"というエラーを修正しました。コンテナーまたはアカウントデータベースを使用した"
+"ドライブのマウントが解除されたときに、このバグは残りのすべてのドライブにハン"
+"ドオフレプリカを作成し、ドライブの使用容量を増やし、クラスターを満たしていま"
+"した。。"
+
+msgid ""
+"Fixed non-deterministic suffix updates in hashes.pkl where a partition may "
+"be updated much less often than expected."
+msgstr ""
+"パーティションが予想よりもずっと少なく更新される可能性がある hashes.pkl の固"
+"定の非確定的なサフィックスの更新を修正しました。"
+
+msgid "Fixed rare socket leak on range requests to erasure-coded objects."
+msgstr ""
+"消去コード付きオブジェクトへの範囲リクエストでの稀なソケットリークを修正しま"
+"した。"
+
+msgid ""
+"Fixed regression in consolidate_hashes that occured when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+msgstr ""
+"新しいファイルが空でないパーティションに新しいサフィックスで格納されたときに"
+"発生した consolidate_hash の退行バグを修正しました。 このバグは2.7.0で導入さ"
+"れ、パーティションサフィックスの一貫性のないハッシュのために、アップグレード"
+"中およびアップグレード後に rsync のレプリケーション統計を増加する可能性があり"
+"ます。"
+
+msgid ""
+"Fixed regression in consolidate_hashes that occurred when a new file was "
+"stored to new suffix to a non-empty partition. This bug was introduced in "
+"2.7.0 and could cause an increase in rsync replication stats during and "
+"after upgrade, due to inconsistent hashing of partition suffixes."
+msgstr ""
+"新しいファイルが空でないパーティションに新しいサフィックスで格納されたときに"
+"発生した consolidate_hash の退行バグを修正しました。 このバグは2.7.0で導入さ"
+"れ、パーティションサフィックスの一貫性のないハッシュのために、アップグレード"
+"中およびアップグレード後に rsync のレプリケーション統計を増加する可能性があり"
+"ます。"
+
+msgid "Fixed some minor test compatibility issues."
+msgstr "いくつかのテストの互換性の問題を修正しました。"
+
+msgid "Fixed the KeyError message when auditor finds an expired object."
+msgstr ""
+"監査が期限切れのオブジェクトを見つけたときの KeyError メッセージを修正しまし"
+"た。"
+
+msgid "Fixed the stats calculation in the erasure code reconstructor."
+msgstr "消去コード再構成の統計計算を修正しました。"
+
+msgid ""
+"Fixed using ``swift-ring-builder set_weight`` with more than one device."
+msgstr ""
+"複数のデバイスでの``swift-ring-builder set_weight`` の使用を修正しました。"
+
+msgid ""
+"For further information see the `docs `__"
+msgstr ""
+"詳細は `docs `__ を参照してください。"
+
+msgid "Fractional replicas are no longer allowed for erasure code policies."
+msgstr "断片的な複製は、消去コードポリシーには使用できなくなりました。"
+
+msgid ""
+"GET and HEAD requests to a symlink will operate on the referenced object and "
+"require appropriate permission in the target container. DELETE and PUT "
+"requests will operate on the symlink object itself. POST requests are not "
+"forwarded to the referenced object. POST requests sent to a symlink will "
+"result in a 307 Temporary Redirect response."
+msgstr ""
+"シンボリックリンクに対する GET と HEAD リクエストは、参照されたオブジェクトに"
+"対して操作が行われ、対象となるコンテナーへの適切な権限を必要とします。DELETE "
+"と PUT リクエストは、シンボリックリンクオブジェクト自身に操作が行われます。"
+"POST リクエストは参照されているオブジェクトに転送されません。シンボリックリン"
+"クに対する POST リクエストの送信は、307 Temporary Redirect レスポンスになりま"
+"す。"
+
+msgid "I/O priority is now supported on AArch64 architecture."
+msgstr ""
+"AArch64 アーキテクチャーで I/O 優先順位がサポートされるようになりました。"
+
+msgid ""
+"If a proxy server is configured to autocreate accounts and the account "
+"create fails, it will now return a server error (500) instead of Not Found "
+"(404)."
+msgstr ""
+"プロキシサーバーにアカウント自動作成が設定されていて、アカウント作成に失敗す"
+"ると、Not Found (404) ではなく、サーバーエラー (500) が返されます。"
+
+msgid ""
+"If using erasure coding with ISA-L in rs_vand mode and 5 or more parity "
+"fragments, Swift will emit a warning. This is a configuration that is known "
+"to harm data durability. In a future release, this warning will be upgraded "
+"to an error unless the policy is marked as deprecated. All data in an "
+"erasure code storage policy using isa_l_rs_vand with 5 or more parity should "
+"be migrated as soon as possible. Please see https://bugs.launchpad.net/swift/"
+"+bug/1639691 for more information."
+msgstr ""
+"rs_vand モードで消去コードに ISA-L を使用し、パリティフラグメントが5つ以上あ"
+"る場合、 Swift は警告を発します。これは、データの耐久性を損なうことが知られて"
+"いる設定です。将来のリリースでは、ポリシーが廃止予定とマークされていない限"
+"り、この警告はエラーにアップグレードされる予定です。 isa_l_rs_vand を 5 以上"
+"のパリティで使用する消去コード格納ポリシーのすべてのデータは、できるだけ早く"
+"移行する必要があります。詳細については、 https://bugs.launchpad.net/swift/"
+"+bug/1639691\n"
+" を参照してください。"
+
+msgid "If you upgrade and roll back, you must delete all `hashes.pkl` files."
+msgstr ""
+"アップグレードしてロールバックする場合は、すべての `hashes.pkl` ファイルを削"
+"除する必要があります。"
+
+msgid "Imported docs content from openstack-manuals project."
+msgstr ""
+"openstack-manuals プロジェクトからドキュメントコンテンツをインポートしまし"
+"た。"
+
+msgid ""
+"Improved ``object-updater`` stats logging. It now tells you all of its stats "
+"(successes, failures, quarantines due to bad pickles, unlinks, and errors), "
+"and it tells you incremental progress every five minutes. The logging at the "
+"end of a pass remains and has been expanded to also include all stats."
+msgstr ""
+"``object-updater`` 統計ログを改善しました。すべての統計(成功、失敗、悪いピク"
+"ルスによる検疫、リンク解除、エラー)を出力し、また、5分毎に進捗状況を出力し"
+"ます。成功の最後のログは残り、すべての統計情報も含むように拡張されました。"
+
+msgid ""
+"Improved performance by eliminating an unneeded directory structure hash."
+msgstr ""
+"不要なディレクトリ構造ハッシュを排除してパフォーマンスを向上させました。"
+
+msgid ""
+"Improved the granularity of the ring dispersion metric so that small "
+"improvements after a rebalance can show changes in the dispersion number. "
+"Dispersion in existing and new rings can be recalculated using the new ``--"
+"recalculate`` option to ``swift-ring-builder``."
+msgstr ""
+"再分散後の小さな改善により分散数の変化を示すことができるように、リング分散メ"
+"トリックの粒度を改善しました。既存、および新しいリングの分散は、``swift-ring-"
+"builder`` の新しい ``--recalculate`` オプションを使うことで再計算されます。"
+
+msgid "Improvements in key parts of the consistency engine"
+msgstr "整合性エンジンの重要な部分を改善しました。"
+
+msgid ""
+"In SLO manifests, the `etag` and `size_bytes` keys are now fully optional "
+"and not required. Previously, the keys needed to exist but the values were "
+"optional. The only required key is `path`."
+msgstr ""
+"SLO マニフェストでは、 `etag` と `size_bytes` キーは完全にオプションであり、"
+"必須ではありません。 以前は、キーが必要でしたが、値はオプションでした。唯一必"
+"要なキーは `path` です。"
+
+msgid ""
+"Include object sysmeta in POST responses. Sysmeta is still stripped from the "
+"response before being sent to the client, but this allows middleware to make "
+"use of the information."
+msgstr ""
+"POST 応答にオブジェクト sysmeta を含めます。 Sysmeta は依然としてクライアント"
+"に送信される前に応答から取り除かれますが、ミドルウェアはその情報を利用できま"
+"す。"
+
+msgid "Include received fragment index in reconstructor log warnings."
+msgstr "受信したフラグメントインデックスを再構築ログの警告に含めました。"
+
+msgid ""
+"Instead of using a separate .durable file to indicate the durable status of "
+"an EC fragment archive, we rename the .data to include a durable marker in "
+"the filename. This saves one inode for every EC .data file. Existing ."
+"durable files will not be removed, and they will continue to work just fine."
+msgstr ""
+"別の .durable ファイルを使用して EC フラグメントアーカイブの耐久性ステータス"
+"を示す代わりに、ファイル名に耐久マーカーを含めるように .data の名前を変更しま"
+"す。 これにより、すべてのEC .data ファイルに対して1つの inode が節約されま"
+"す。 既存の .durable ファイルは削除されず、正常に動作し続けます。"
+
+msgid ""
+"Let clients request heartbeats during SLO PUTs by including the query "
+"parameter ``heartbeat=on``."
+msgstr ""
+"SLO PUT の間、クエリーパラメーター ``heartbeat=on`` を含めることで、クライア"
+"ントがハートビートを要求できるようにしました。"
+
+msgid ""
+"Listing containers in accounts with json or xml now includes a "
+"`last_modified` time. This does not change any on-disk data, but simply "
+"exposes the value to offer consistency with the object listings on "
+"containers."
+msgstr ""
+"json または xml を使用してアカウントのコンテナーを表示するときに、 "
+"`last_modified` 時刻が追加されました。これにより、ディスク上のデータは変更さ"
+"れませんが、値を公開してコンテナーのオブジェクトリストとの一貫性を提供しま"
+"す。"
+
+msgid "Log correct status code for conditional requests."
+msgstr "条件付きリクエストの正しいステータスコードを記録します。"
+
+msgid ""
+"Log deprecation warning for ``allow_versions`` in the container server "
+"config. Configure the ``versioned_writes`` middleware in the proxy server "
+"instead. This option will be ignored in a future release."
+msgstr ""
+"コンテナーサーバーの設定の ``allow_versions`` のために、非推奨警告ログを出力"
+"します。代わりに ``versioned_writes`` ミドルウェアをプロキシサーバーに設定し"
+"ます。このオプションは将来のリリースでは無視されます。"
+
+msgid "Log the correct request type of a subrequest downstream of copy."
+msgstr "サブリクエストの正しいリクエストタイプをコピーの後ろに記録します。"
+
+msgid ""
+"Make mount_check option usable in containerized environments by adding a "
+"check for an \".ismount\" file at the root directory of a device."
+msgstr ""
+"デバイスのルートディレクトリの \".ismount\" ファイルのチェックを追加すること"
+"により、コンテナー化された環境で mount_check オプションを使用可能にします。"
+
+msgid "Mirror X-Trans-Id to X-Openstack-Request-Id."
+msgstr "X-Trans-Id を X-Openstack-Request-Id に写します。"
+
+msgid ""
+"Move listing formatting out to a new proxy middleware named "
+"``listing_formats``. ``listing_formats`` should be just right of the first "
+"proxy-logging middleware, and left of most other middlewares. If it is not "
+"already present, it will be automatically inserted for you."
+msgstr ""
+"リストの成型を ``listing_formats`` という新しいプロキシミドルウェアに移動しま"
+"した。``listing_formats`` は、最初の proxy-logging ミドルウェアの直ぐ右にあ"
+"り、他のミドルウェアの左になければなりません。まだ存在しない場合は、自動的に"
+"挿入されます。"
+
+msgid "Moved Zuul v3 tox jobs into the Swift code repo."
+msgstr "Zuul v3 の tox ジョブを Swift のリポジトリに移動しました。"
+
+msgid ""
+"Moved other-requirements.txt to bindep.txt. bindep.txt lists non-python "
+"dependencies of Swift."
+msgstr ""
+"other-requirements.txt を bindep.txt に移動しました。 bindep.txt は、 Swift "
+"の非 Python 依存関係をリストします。"
+
+msgid "New Features"
+msgstr "新機能"
+
+msgid ""
+"New config variables to change the schedule priority and I/O scheduling "
+"class. Servers and daemons now understand `nice_priority`, `ionice_class`, "
+"and `ionice_priority` to schedule their relative importance. Please read "
+"http://docs.openstack.org/developer/swift/deployment_guide.html for full "
+"config details."
+msgstr ""
+"スケジュール優先度と I/O スケジューリングクラスを変更する新しい設定変数を追加"
+"しました。サーバーとデーモンは `nice_priority`、`ionice_class`、"
+"`ionice_priority` を理解し、相対的な重要性をスケジューリングするようになりま"
+"した。 設定の詳細については、http://docs.openstack.org/developer/swift/"
+"deployment_guide.html を参照してください。"
+
+msgid "Newton Series Release Notes"
+msgstr "Newton バージョンのリリースノート"
+
+msgid ""
+"Note that after writing EC data with Swift 2.11.0 or later, that data will "
+"not be accessible to earlier versions of Swift."
+msgstr ""
+"Swift 2.11.0 以降で EC データを書き込んだ後は、以前のバージョンの Swift では"
+"そのデータにアクセスできないことに注意してください。"
+
+msgid ""
+"Note: if you have a custom middleware that makes account or container "
+"listings, it will only receive listings in JSON format."
+msgstr ""
+"注意: アカウントやコンテナー一覧を作るカスタムミドルウェアがある場合、受け取"
+"る一覧は JSON 形式のみです。"
+
+msgid ""
+"Now Swift will use ``write_affinity_handoff_delete_count`` to define how "
+"many local handoff nodes should swift send request to get more candidates "
+"for the final response. The default value \"auto\" means Swift will "
+"calculate the number automatically based on the number of replicas and "
+"current cluster topology."
+msgstr ""
+"Swiftは、 ``write_affinity_handoff_delete_count`` を使って、最終応答の候補を"
+"もっと多く得るために、どのくらいのローカルハンドオフノードが要求を送信するべ"
+"きかを定義します。デフォルト値 \"auto\" は、 Swift がレプリカの数と現在のクラ"
+"スタートポロジーに基づいて自動的に数を計算することを意味します。"
+
+msgid "Now ``swift-recon-cron`` works with conf.d configs."
+msgstr "``swift-recon-cron`` は conf.d の設定で動作するようになりました。"
+
+msgid "Object expiry improvements"
+msgstr "オブジェクトの有効期限の改善"
+
+msgid ""
+"Object versioning now supports a \"history\" mode in addition to the older "
+"\"stack\" mode. The difference is in how DELETE requests are handled. For "
+"full details, please read http://docs.openstack.org/developer/swift/"
+"overview_object_versioning.html."
+msgstr ""
+"オブジェクトのバージョン管理は、古い \"stack\" モードに加えて、 \"history\" "
+"モードをサポートするようになりました。 違いは、 DELETE 要求の処理方法にありま"
+"す。 詳細については、 http://docs.openstack.org/developer/swift/"
+"overview_object_versioning.html を参照してください。"
+
+msgid "Ocata Series Release Notes"
+msgstr "Ocata バージョンのリリースノート"
+
+msgid ""
+"On newer kernels (3.15+ when using xfs), Swift will use the O_TMPFILE flag "
+"when opening a file instead of creating a temporary file and renaming it on "
+"commit. This makes the data path simpler and allows the filesystem to more "
+"efficiently optimize the files on disk, resulting in better performance."
+msgstr ""
+"新しいカーネル(xfsを使用する場合 3.15+ )では、一時ファイルを作成してコミッ"
+"ト時に名前を変更する代わりに、ファイルを開くときに Swift が O_TMPFILE フラグ"
+"を使用します。これにより、データパスが簡単になり、ファイルシステムがディスク"
+"上のファイルをより効率的に最適化できるようになり、パフォーマンスが向上しま"
+"す。"
+
+msgid ""
+"Optimize the Erasure Code reconstructor protocol to reduce IO load on "
+"servers."
+msgstr ""
+"消去コード再構成プロトコルを最適化して、サーバーの IO 負荷を軽減します。"
+
+msgid ""
+"Optimized the common case for hashing filesystem trees, thus eliminating a "
+"lot of extraneous disk I/O."
+msgstr ""
+"ファイルシステムツリーをハッシュするための一般的なケースを最適化し、多くの余"
+"分なディスク I/O を無くしました。"
+
+msgid "Other Notes"
+msgstr "その他の注意点"
+
+msgid ""
+"PUT subrequests generated from a client-side COPY will now properly log the "
+"SSC (server-side copy) Swift source field. See https://docs.openstack.org/"
+"developer/swift/logs.html#swift-source for more information."
+msgstr ""
+"クライアント側の COPY から生成された PUT サブリクエストは、 SSC (サーバー側"
+"のコピー) Swift ソースフィールドを適切に記録するようになりました。詳細につい"
+"ては、\n"
+"https://docs.openstack.org/developer/swift/logs.html#swift-source を参照して"
+"ください。"
+
+msgid "Pike Series Release Notes"
+msgstr "Pike バージョンのリリースノート"
+
+msgid ""
+"Prevent logged traceback in object-server on client disconnect for chunked "
+"transfers to replicated policies."
+msgstr ""
+"複製されたポリシーへのチャンクされた転送時のクライアント切断で、オブジェクト"
+"サーバーにログされたトレースバックを防止します。"
+
+msgid ""
+"Previously, when deleting objects in multi-region swift deployment with "
+"write affinity configured, users always get 404 when deleting object before "
+"it's replicated to appropriate nodes."
+msgstr ""
+"以前は、書き込みアフィニティを設定したマルチリージョンの Swift 構成でオブジェ"
+"クトを削除すると、オブジェクトが適切なノードにレプリケートされる前にオブジェ"
+"クトを削除すると常に 404 となりました。"
+
+msgid ""
+"Remove ``swift-temp-url`` script. The functionality has been in swiftclient "
+"for a long time and this script has been deprecated since 2.10.0."
+msgstr ""
+"``swift-temp-url`` スクリプトを削除しました。この機能は、長い間 swiftclient "
+"にありましたが、2.10.0 から非推奨でした。"
+
+msgid "Remove deprecated ``vm_test_mode`` option."
+msgstr "非推奨の ``vm_test_mode`` オプションを削除しました。"
+
+msgid "Remove empty db hash and suffix directories if a db gets quarantined."
+msgstr ""
+"DB が隔離された場合に、空の DB ハッシュとサフィックスディレクトリを削除しま"
+"す。"
+
+msgid ""
+"Removed \"in-process-\" from func env tox name to work with upstream CI."
+msgstr ""
+"上流の CI で動作するように、func env tox 名から \"in-process-\" を削除しまし"
+"た。"
+
+msgid ""
+"Removed a race condition where a POST to an SLO could modify the X-Static-"
+"Large-Object metadata."
+msgstr ""
+"SLO クラウドへの POST が X-Static-Large-Object メタデータを変更できる、競合状"
+"態を削除しました。"
+
+msgid ""
+"Removed all ``post_as_copy`` related code and configs. The option has been "
+"deprecated since 2.13.0."
+msgstr ""
+"``post_as_copy`` に関連するすべてのコードと設定を削除しました。このオプション"
+"は、2.13.0 から非推奨でした。"
+
+msgid ""
+"Removed per-device reconstruction stats. Now that the reconstructor is "
+"shuffling parts before going through them, those stats no longer make sense."
+msgstr ""
+"デバイスごとの再構成の統計を削除しました。再構成は、それらを通過する前にパー"
+"ツをシャッフルするので、それらの統計はもはや意味をなしません。"
+
+msgid ""
+"Replaced ``replication_one_per_device`` by custom count defined by "
+"``replication_concurrency_per_device``. The original config value is "
+"deprecated, but continues to function for now. If both values are defined, "
+"the old ``replication_one_per_device`` is ignored."
+msgstr ""
+"``replication_one_per_device`` を ``replication_concurrency_per_device`` に"
+"よって定義されるカスタムカウントに置き換えました。元の設定値は非推奨となりま"
+"したが、引き続き機能します。両方の値が定義された場合、古い "
+"``replication_one_per_device`` は無視されます。"
+
+msgid "Require that known-bad EC schemes be deprecated"
+msgstr "既知の悪い EC スキームの要件を非推奨にしました。"
+
+msgid "Respect server type for --md5 check in swift-recon."
+msgstr "swift-recon での --md5 チェックのサーバー種別を尊重します。"
+
+msgid ""
+"Respond 400 Bad Request when Accept headers fail to parse instead of "
+"returning 406 Not Acceptable."
+msgstr ""
+"Accept ヘッダーの解析に失敗した時、406 Not Acceptable の代わりに 400 Bad "
+"Request が返されます。"
+
+msgid ""
+"Ring files now include byteorder information about the endian of the machine "
+"used to generate the file, and the values are appropriately byteswapped if "
+"deserialized on a machine with a different endianness. Newly created ring "
+"files will be byteorder agnostic, but previously generated ring files will "
+"still fail on different endian architectures. Regenerating older ring files "
+"will cause them to become byteorder agnostic. The regeneration of the ring "
+"files will not cause any new data movement. Newer ring files will still be "
+"usable by older versions of Swift (on machines with the same endianness--"
+"this maintains existing behavior)."
+msgstr ""
+"リングファイルには、ファイルを生成するために使用されたマシンのエンディアンに"
+"関するバイトオーダー情報が含まれるようになりました。エンディアンが異なるマシ"
+"ンでデシリアライズされた場合、値は適切にバイトスワップされます。新しく作成さ"
+"れたリングファイルはバイトオーダーには依存しませんが、以前に生成されたリング"
+"ファイルは引き続き異なるエンディアンアーキテクチャで失敗します。古いリング"
+"ファイルを再生成すると、それらはバイトオーダーに無関係になります。リングファ"
+"イルを再生成しても、新しいデータの移動は発生しません。最新のリングファイルは "
+"Swift の古いバージョンでも使用できます(同じエンディアンのマシンでは、これは"
+"既存の動作を維持します)。"
+
+msgid ""
+"Rings with min_part_hours set to zero will now only move one partition "
+"replica per rebalance, thus matching behavior when min_part_hours is greater "
+"than zero."
+msgstr ""
+"min_part_hours が 0 に設定されたリングは、リバランスのたびに1つのパーティ"
+"ションレプリカのみを移動するため、 min_part_hours が 0 より大きい場合の動作が"
+"一致します。"
+
+msgid ""
+"SLO manifest PUT requests can now be properly validated by sending an ETag "
+"header of the md5 sum of the concatenated md5 sums of the referenced "
+"segments."
+msgstr ""
+"参照されたセグメントの md5 合計が連結されたものの md5 合計を ETag ヘッダーで"
+"送信することによって、SLO マニフェストの PUT 要求を適切に検証することができま"
+"す。"
+
+msgid ""
+"SLO will now concurrently HEAD segments, resulting in much faster manifest "
+"validation and object creation. By default, two HEAD requests will be done "
+"at a time, but this can be changed by the operator via the new `concurrency` "
+"setting in the \"[filter:slo]\" section of the proxy server config."
+msgstr ""
+"SLO は現在、 HEAD セグメントを同時に処理するため、マニフェストの検証とオブ"
+"ジェクト作成が大幅に高速化されます。 デフォルトでは、一度に2つの HEAD リクエ"
+"ストが実行されますが、これはプロキシーサーバーの設定の \"[filter:slo]\" セク"
+"ションの新しい `concurrency` 設定によってオペレーターが変更できます。"
+
+msgid ""
+"Save the ring when dispersion improves, even if balance doesn't improve."
+msgstr ""
+"バランスが改善されない場合でも、分散が改善されたときにリングを保存します。"
+
+msgid "Send ETag header in 206 Partial Content responses to SLO reads."
+msgstr ""
+"SLO 読み込みへの 206 Partial Content 応答で ETag ヘッダーを送信します。"
+
+msgid ""
+"Significant improvements to the api-ref doc available at http://developer."
+"openstack.org/api-ref/object-storage/."
+msgstr ""
+"http://developer.openstack.org/api-ref/object-storage/ の api-ref ドキュメン"
+"トに対する重要な改善が行われました。"
+
+msgid ""
+"Static Large Object (SLO) manifest may now (again) have zero-byte last "
+"segments."
+msgstr ""
+"Static Large Object (SLO) マニフェストは、0 バイトの最終セグメントを再度持つ"
+"ようになりました。"
+
+msgid ""
+"Stop logging tracebacks in the ``object-replicator`` when it runs out of "
+"handoff locations."
+msgstr ""
+"``object-replicator`` を実行する場所を使い果たした時のトレースバックのログを"
+"停止しました。"
+
+msgid "Stopped logging tracebacks when receiving an unexpected response."
+msgstr "想定外の応答を受信した時のトレースバックのログを停止しました。"
+
+msgid "Support multi-range GETs for static large objects."
+msgstr "静的ラージオブジェクトの multi-range GET をサポートしました。"
+
+msgid "Suppress unexpected-file warnings for rsync temp files."
+msgstr "rsync の一時ファイルに対する unexpected-file 警告を抑制しました。"
+
+msgid "Suppressed the KeyError message when auditor finds an expired object."
+msgstr ""
+"監査が期限切れのオブジェクトを見つけたときの KeyError メッセージを抑制しまし"
+"た。"
+
+msgid "Swift Release Notes"
+msgstr "Swift リリースノート"
+
+msgid ""
+"Symlink objects reference one other object. They are created by creating an "
+"empty object with an X-Symlink-Target header. The value of the header is of "
+"the format /, and the target does not need to exist at "
+"the time of symlink creation. Cross-account symlinks can be created by "
+"including the X-Symlink-Target-Account header."
+msgstr ""
+"Symlink オブジェクトは他のオブジェクトを参照します。これらは、X-Symlink-"
+"Target ヘッダーを持つ空のオブジェクトの作成によって作られます。ヘッダーの値"
+"は / 形式であり、シンボリックリンク作成時にターゲットが存"
+"在する必要はありません。クロスアカウントのシンボリックリンクは、X-Symlink-"
+"Target-Account ヘッダーを含むことによって作成できます。"
+
+msgid ""
+"TempURLs now support a validation against a common prefix. A prefix-based "
+"signature grants access to all objects which share the same prefix. This "
+"avoids the creation of a large amount of signatures, when a whole container "
+"or pseudofolder is shared."
+msgstr ""
+"TempURL は、共通プレフィックスに対する検証をサポートするようになりました。接"
+"頭辞ベースの署名は、同じ接頭辞を共有するすべてのオブジェクトへのアクセスを許"
+"可します。これにより、コンテナーまたは擬似フォルダーの全体を共有するときに、"
+"大量の署名を作成することがなくなります。"
+
+msgid ""
+"TempURLs using the \"inline\" parameter can now also set the \"filename\" "
+"parameter. Both are used in the Content-Disposition response header."
+msgstr ""
+"「インライン」パラメータを使用する TempURL では、「ファイル名」パラメータも設"
+"定できるようになりました。どちらも Content-Disposition レスポンスヘッダーで使"
+"用されます。"
+
+msgid ""
+"Temporary URLs now support one common form of ISO 8601 timestamps in "
+"addition to Unix seconds-since-epoch timestamps. The ISO 8601 format "
+"accepted is '%Y-%m-%dT%H:%M:%SZ'. This makes TempURLs more user-friendly to "
+"produce and consume."
+msgstr ""
+"現在、 TempURL は、Unix エポック秒のタイムスタンプに加えて、 ISO 8601 タイム"
+"スタンプの一般的な形式をサポートするようになりました。受け入れられる ISO "
+"8601 形式は、 '%Y-%m-%dT%H:%M:%SZ' です。これにより、一時 URL の作成と使用が"
+"ユーザーフレンドリーになります。"
+
+msgid ""
+"The EC reconstructor process has been dramatically improved by adding "
+"support for multiple concurrent workers. Multiple processes are required to "
+"get high concurrency, and this change results in much faster rebalance times "
+"on servers with many drives."
+msgstr ""
+"EC 再構成プロセスは、複数の並列ワーカーのサポートを追加することによって劇的に"
+"改善されました。 高い並列性を得るためには複数のプロセスが必要です。この変更に"
+"より、多くのドライブを搭載したサーバーでは大幅に高速なリバランスが行われま"
+"す。"
+
+msgid ""
+"The ``domain_remap`` middleware now supports the ``mangle_client_paths`` "
+"option. Its default \"false\" value changes ``domain_remap`` parsing to stop "
+"stripping the ``path_root`` value from URL paths. If users depend on this "
+"path mangling, operators should set ``mangle_client_paths`` to \"True\" "
+"before upgrading."
+msgstr ""
+"``domain_remap`` ミドルウェアは、``mangle_client_paths`` オプションをサポート"
+"しました。デフォルト値 \"false\" では、``domain_remap`` の解析で URL のパスか"
+"ら ``path_root`` 値を取り除かなくなります。このパスの切り取りに依存している場"
+"合は、アップグレードする前に、オペレーターは ``mangle_client_paths`` を "
+"\"True\" に設定する必要があります。"
+
+msgid ""
+"The default for `object_post_as_copy` has been changed to False. The option "
+"is now deprecated and will be removed in a future release. If your cluster "
+"is still running with post-as-copy enabled, please update it to use the "
+"\"fast-post\" method. Future versions of Swift will not support post-as-"
+"copy, and future features will not be supported under post-as-copy. (\"Fast-"
+"post\" is where `object_post_as_copy` is false)."
+msgstr ""
+"`object_post_as_copy` のデフォルトは False に変更されました。このオプションは"
+"廃止され、将来のリリースで削除される予定です。あなたのクラスターが post-as-"
+"copy を有効にして実行している場合は、 \"fast-post\" 方式を使用するように更新"
+"してください。 Swift の将来のバージョンは post-as-copyをサポートしませんし、"
+"将来の機能は post-as-copyの下ではサポートされません。(「Fast-post」は "
+"`object_post_as_copy` が false のところです)。"
+
+msgid ""
+"The erasure code reconstructor `handoffs_first` option has been deprecated "
+"in favor of `handoffs_only`. `handoffs_only` is far more useful, and just "
+"like `handoffs_first` mode in the replicator, it gives the operator the "
+"option of forcing the consistency engine to focus solely on revert (handoff) "
+"jobs, thus improving the speed of rebalances.  The `handoffs_only` behavior "
+"is somewhat consistent with the replicator's `handoffs_first` option (any "
+"error on any handoff in the replicator will make it essentially handoff only "
+"forever) but the `handoff_only` option does what you want and is named "
+"correctly in the reconstructor."
+msgstr ""
+"消去コード再構成の `handoffs_first` オプションは `handoffs_only` のために廃止"
+"されました。 `handoffs_only` ははるかに便利で、レプリケーターの "
+"`handoffs_first` モードと同様に、一貫性エンジンに復帰(ハンドオフ)ジョブだけ"
+"に注力させるオプションをオペレーターに与え、リバランスのスピードを向上させま"
+"す。 `handoffs_only` の振る舞いは、レプリケーターの `handoffs_first` オプショ"
+"ンと一貫しています(レプリケーターのハンドオフ時にエラーが発生すると永久にハ"
+"ンドオフのみになります)が、`handoff_only` オプションは必要な処理を行い、再構"
+"成で正しく命名されます。"
+
+msgid ""
+"The erasure code reconstructor will now shuffle work jobs across all disks "
+"instead of going disk-by-disk. This eliminates single-disk I/O contention "
+"and allows continued scaling as concurrency is increased."
+msgstr ""
+"消去コード再構成は、ディスク単位で作業するのではなく、すべてのディスクで作業"
+"ジョブをシャッフルします。これにより、シングルディスクの I/O 競合がなくなり、"
+"並行性が高まるにつれて継続的なスケーリングが可能になります。"
+
+msgid ""
+"The improvements to EC reads made in Swift 2.10.0 have also been applied to "
+"the reconstructor. This allows fragments to be rebuilt in more "
+"circumstances, resulting in faster recovery from failures."
+msgstr ""
+"Swift 2.10.0 で作成された EC 読み取りの改善も、再構成に適用されています。これ"
+"により、より多くの状況でフラグメントを再構築することができ、障害からの迅速な"
+"回復が可能になります。"
+
+msgid ""
+"The number of container updates on object PUTs (ie to update listings) has "
+"been recomputed to be far more efficient  while maintaining durability "
+"guarantees. Specifically, object PUTs to erasure-coded policies will now "
+"normally result in far fewer container updates."
+msgstr ""
+"オブジェクトの PUT によるコンテナー更新の数(つまり、一覧の更新)は、耐久性の"
+"保証を維持しながら、遥かに効率的に再計算されます。具体的には、消去符号化ポリ"
+"シーへのオブジェクトの PUT は、通常、コンテナーの更新が大幅に少なくなります。"
+
+msgid ""
+"The object and container server config option ``slowdown`` has been "
+"deprecated in favor of the new ``objects_per_second`` and "
+"``containers_per_second`` options."
+msgstr ""
+"オブジェクトとコンテナーのサーバー設定オプション ``slowdown`` は、新しい "
+"``objects_per_second`` オプションと ``containers_per_second`` オプションのた"
+"めに廃止されました。"
+
+msgid ""
+"The object reconstructor can now rebuild an EC fragment for an expired "
+"object."
+msgstr ""
+"オブジェクト再構成は、期限切れのオブジェクトの EC フラグメントを再構築できる"
+"ようになりました。"
+
+msgid ""
+"The object server runs certain IO-intensive methods outside the main pthread "
+"for performance. Previously, if one of those methods tried to log, this can "
+"cause a crash that eventually leads to an object server with hundreds or "
+"thousands of greenthreads, all deadlocked. The fix is to use a mutex that "
+"works across different greenlets and different pthreads."
+msgstr ""
+"オブジェクトサーバーは、パフォーマンスのためにメインの pthread の外部で特定"
+"の IO 集約型メソッドを実行します。以前は、これらのメソッドの 1 つがログに記録"
+"しようとすると、クラッシュが発生し、最終的にオブジェクトサーバーはデッドロッ"
+"クされた数百または数千のグリーンスレッドを持つに至ります。この修正は、異なる "
+"greenlet と異なる pthread にまたがって動作する mutex を使用することです。"
+
+msgid ""
+"The output of devices from ``swift-ring-builder`` has been reordered by "
+"region, zone, ip, and device."
+msgstr ""
+"``swift-ring-builder`` からのデバイスの出力は、リージョン、ゾーン、IP、デバイ"
+"スによって、並べ替えられます。"
+
+msgid ""
+"The tempurl digest algorithm is now configurable, and Swift added support "
+"for both SHA-256 and SHA-512. Supported tempurl digests are exposed to "
+"clients in ``/info``. Additionally, tempurl signatures can now be base64 "
+"encoded."
+msgstr ""
+"tmpurl のダイジェストアルゴリズムが設定可能になり、Swift は、SHA-256 および "
+"SHA-512 の両方のサポートを追加しました。サポートされる tmpurl ダイジェスト"
+"は、``/info`` にてクライアントに公開されます。さらに、tempurl の署名を "
+"base64 でエンコードできるようになりました。"
+
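Since the note above says the supported digests are advertised in ``/info``, a client can discover them before signing; a sketch using the requests library from requirements.txt, where the exact key names inside the ``tempurl`` capability block are an assumption to verify against the middleware docs::

    import requests

    # Hypothetical endpoint; /info requires no auth token.
    caps = requests.get('https://swift.example.com/info').json()

    tempurl_caps = caps.get('tempurl', {})
    print(tempurl_caps.get('allowed_digests'))   # e.g. ['sha256', 'sha512']
    print(tempurl_caps.get('methods'))           # e.g. ['GET', 'HEAD', 'PUT', 'POST', 'DELETE']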
+msgid ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute."
+msgstr ""
+"update_auditor_status の呼び出しを絞りました。なので、1分に1回しか更新しませ"
+"ん。"
+
+msgid ""
+"Throttle update_auditor_status calls so it updates no more than once per "
+"minute. This prevents excessive IO on a new cluster."
+msgstr ""
+"update_auditor_status の呼び出しを絞りました。なので、1分に1回しか更新しませ"
+"ん。これにより、新しいクラスタで過剰な I/O が発生するのを防ぎます。"
+
+msgid ""
+"Update dnspython dependency to 1.14, removing the need to have separate "
+"dnspython dependencies for Py2 and Py3."
+msgstr ""
+"dnspython の依存関係を 1.14 に更新し、dnspython の依存関係を Python 2 と "
+"Python 3 に分ける必要性をなくしました。"
+
+msgid "Updated docs to reference appropriate ports."
+msgstr "適切なポートを参照するようにドキュメントを更新しました。"
+
+msgid "Updated the PyECLib dependency to 1.3.1."
+msgstr "PyECLib の依存関係を 1.3.1 に更新しました。"
+
+msgid ""
+"Updated the `hashes.pkl` file format to include timestamp information for "
+"race detection. Also simplified hashing logic to prevent race conditions and "
+"optimize for the common case."
+msgstr ""
+"競合検出のタイムスタンプ情報を含むように `hashes.pkl` ファイル形式を更新しま"
+"した。また競合状態を防止し、一般的なケースを最適化するために、ハッシュロジッ"
+"クを簡略化しました。"
+
+msgid ""
+"Upgrade Impact: If you upgrade and roll back, you must delete all `hashes."
+"pkl` files."
+msgstr ""
+"アップグレードの影響: アップグレードしてロールバックする場合は、すべての "
+"`hashes.pkl` ファイルを削除する必要があります。"
+
+msgid "Upgrade Notes"
+msgstr "アップグレード時の注意"
+
+msgid ""
+"Upgrade impact -- during a rolling upgrade, an updated proxy server may "
+"write a manifest that an out-of-date proxy server will not be able to read. "
+"This will resolve itself once the upgrade completes on all nodes."
+msgstr ""
+"アップグレードの影響 -- ローリングアップグレード中に、更新されたプロキシサー"
+"バーは、期限切れのプロキシサーバーが読み込むことができないマニフェストを書き"
+"出す可能性があります。これは、すべてのノードでアップグレードが完了すると自ず"
+"と解決します。"
+
+msgid "Various other minor bug fixes and improvements."
+msgstr "様々な他のマイナーなバグ修正と改善。"
+
+msgid ""
+"WARNING: If you are using the ISA-L library for erasure codes, please "
+"upgrade to liberasurecode 1.3.1 (or later) as soon as possible. If you are "
+"using isa_l_rs_vand with more than 4 parity, please read https://bugs."
+"launchpad.net/swift/+bug/1639691 and take necessary action."
+msgstr ""
+"警告: 消去コードに ISA-L ライブラリを使用している場合は、できるだけ早く "
+"liberasurecode 1.3.1 (またはそれ以降)にアップグレードしてください。 4つ以上"
+"のパリティを持つ isa_l_rs_vand を使用している場合は、 https://bugs.launchpad."
+"net/swift/+bug/1639691 を参照して必要な処置を行ってください。"
+
+msgid ""
+"We do not yet have CLI tools for creating composite rings, but the "
+"functionality has been enabled in the ring modules to support this advanced "
+"functionality. CLI tools will be delivered in a subsequent release."
+msgstr ""
+"複合リングを作成するための CLI ツールはまだありませんが、この高度な機能をサ"
+"ポートするためにリングモジュールで機能が有効になっています。 CLI ツールは、以"
+"降のリリースで提供されます。"
+
+msgid ""
+"When requesting objects, return 404 if a tombstone is found and is newer "
+"than any data found. Previous behavior was to return stale data."
+msgstr ""
+"オブジェクトを要求するとき、廃棄済みオブジェクト (tombstone) があり、他のデー"
+"タよりも新しい場合には 404 を返します。以前の動作では、古いデータが返されてい"
+"ました。"
+
+msgid ""
+"When the object auditor examines an object, it will now add any missing "
+"metadata checksums."
+msgstr ""
+"オブジェクト監査がオブジェクトを検査するとき、欠落しているメタデータのチェッ"
+"クサムを追加します。"
+
+msgid ""
+"With heartbeating turned on, the proxy will start its response immediately "
+"with 202 Accepted then send a single whitespace character periodically until "
+"the request completes. At that point, a final summary chunk will be sent "
+"which includes a \"Response Status\" key indicating success or failure and "
+"(if successful) an \"Etag\" key indicating the Etag of the resulting SLO."
+msgstr ""
+"ハートビートをオンにすると、プロキシは 直ぐに 202 Accepted で応答を開始し、リ"
+"クエストが完了するまで一つの空白文字を定期的に送信します。その時点で、成功か"
+"失敗かを示す「Response Status 」キーと、成功した場合には SLO の結果として生じ"
+"る Etag を示す「Etag」キーを含む最終サマリーチャンクが送信されるようになりま"
+"す。"
+
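A rough client-side sketch of consuming such a heartbeat response when uploading an SLO manifest, assuming a ``heartbeat=on`` query parameter on the manifest PUT, a JSON summary requested via the Accept header, and placeholder URL, token and segment values::

    import json
    import requests

    url = ('https://swift.example.com/v1/AUTH_test/container/manifest'
           '?multipart-manifest=put&heartbeat=on')
    manifest = json.dumps([{'path': '/container/seg1', 'size_bytes': 1048576}])

    resp = requests.put(url, data=manifest,
                        headers={'X-Auth-Token': 'AUTH_tk_example',
                                 'Accept': 'application/json'})

    # The proxy answers 202 Accepted immediately; the body is whitespace
    # keep-alives followed by the final JSON summary chunk.
    summary = json.loads(resp.text.lstrip())
    print(summary.get('Response Status'))   # e.g. '201 Created'
    print(summary.get('Etag'))              # Etag of the resulting SLO, on success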
+msgid "Write-affinity aware object deletion"
+msgstr "書き込みアフィニティは、オブジェクトの削除を認識します。"
+
+msgid ""
+"X-Delete-At computation now uses X-Timestamp instead of system time. This "
+"prevents clock skew causing inconsistent expiry data."
+msgstr ""
+"X-Delete-At の計算に、システム時間の代わりに X-Timestamp を使うようになりまし"
+"た。これは、時刻の誤差によって起こる期限データの矛盾を防止します。"
+
+msgid "``swift-ring-builder`` improvements"
+msgstr "``swift-ring-builder`` の改善"
+
+msgid ""
+"cname_lookup middleware now accepts a ``nameservers`` config variable that, "
+"if defined, will be used for DNS lookups instead of the system default."
+msgstr ""
+"cname_lookup ミドルウェアは、定義されていれば、システムのデフォルトではなく "
+"DNS ルックアップに使用される ``nameservers`` 設定変数を受け入れるようになりま"
+"した。"
+
+msgid "domain_remap now accepts a list of domains in \"storage_domain\"."
+msgstr ""
+"domain_remap は \"storage_domain\" にあるドメインのリストを受け入れるようにな"
+"りました。"
+
+msgid "name_check and cname_lookup keys have been added to `/info`."
+msgstr "name_check と cname_lookup キーが `/info` に追加されました。"
+
+msgid "swift-recon now respects storage policy aliases."
+msgstr "swift-recon はストレージポリシーの別名を尊重するようになりました。"
diff --git a/releasenotes/source/newton.rst b/releasenotes/source/newton.rst
new file mode 100644
index 0000000000..59418a33dd
--- /dev/null
+++ b/releasenotes/source/newton.rst
@@ -0,0 +1,6 @@
+=============================
+ Newton Series Release Notes
+=============================
+
+.. release-notes::
+   :branch: stable/newton
diff --git a/releasenotes/source/ocata.rst b/releasenotes/source/ocata.rst
new file mode 100644
index 0000000000..53fb86e386
--- /dev/null
+++ b/releasenotes/source/ocata.rst
@@ -0,0 +1,6 @@
+===================================
+ Ocata Series Release Notes
+===================================
+
+.. release-notes::
+   :branch: stable/ocata
diff --git a/releasenotes/source/pike.rst b/releasenotes/source/pike.rst
new file mode 100644
index 0000000000..e43bfc0ce1
--- /dev/null
+++ b/releasenotes/source/pike.rst
@@ -0,0 +1,6 @@
+===================================
+ Pike Series Release Notes
+===================================
+
+.. release-notes::
+   :branch: stable/pike
diff --git a/releasenotes/source/queens.rst b/releasenotes/source/queens.rst
new file mode 100644
index 0000000000..36ac6160ca
--- /dev/null
+++ b/releasenotes/source/queens.rst
@@ -0,0 +1,6 @@
+===================================
+ Queens Series Release Notes
+===================================
+
+.. release-notes::
+   :branch: stable/queens
diff --git a/releasenotes/source/rocky.rst b/releasenotes/source/rocky.rst
new file mode 100644
index 0000000000..40dd517b75
--- /dev/null
+++ b/releasenotes/source/rocky.rst
@@ -0,0 +1,6 @@
+===================================
+ Rocky Series Release Notes
+===================================
+
+.. release-notes::
+   :branch: stable/rocky
diff --git a/releasenotes/source/stein.rst b/releasenotes/source/stein.rst
new file mode 100644
index 0000000000..efaceb667b
--- /dev/null
+++ b/releasenotes/source/stein.rst
@@ -0,0 +1,6 @@
+===================================
+ Stein Series Release Notes
+===================================
+
+.. release-notes::
+   :branch: stable/stein
diff --git a/releasenotes/source/train.rst b/releasenotes/source/train.rst
new file mode 100644
index 0000000000..583900393c
--- /dev/null
+++ b/releasenotes/source/train.rst
@@ -0,0 +1,6 @@
+==========================
+Train Series Release Notes
+==========================
+
+.. release-notes::
+   :branch: stable/train
diff --git a/releasenotes/source/ussuri.rst b/releasenotes/source/ussuri.rst
new file mode 100644
index 0000000000..e21e50e0c6
--- /dev/null
+++ b/releasenotes/source/ussuri.rst
@@ -0,0 +1,6 @@
+===========================
+Ussuri Series Release Notes
+===========================
+
+.. release-notes::
+   :branch: stable/ussuri
diff --git a/releasenotes/source/victoria.rst b/releasenotes/source/victoria.rst
new file mode 100644
index 0000000000..8ce9334198
--- /dev/null
+++ b/releasenotes/source/victoria.rst
@@ -0,0 +1,6 @@
+=============================
+Victoria Series Release Notes
+=============================
+
+.. release-notes::
+   :branch: unmaintained/victoria
diff --git a/releasenotes/source/wallaby.rst b/releasenotes/source/wallaby.rst
new file mode 100644
index 0000000000..bcf35c5f80
--- /dev/null
+++ b/releasenotes/source/wallaby.rst
@@ -0,0 +1,6 @@
+============================
+Wallaby Series Release Notes
+============================
+
+.. release-notes::
+   :branch: unmaintained/wallaby
diff --git a/releasenotes/source/xena.rst b/releasenotes/source/xena.rst
new file mode 100644
index 0000000000..d19eda4886
--- /dev/null
+++ b/releasenotes/source/xena.rst
@@ -0,0 +1,6 @@
+=========================
+Xena Series Release Notes
+=========================
+
+.. release-notes::
+   :branch: unmaintained/xena
diff --git a/releasenotes/source/yoga.rst b/releasenotes/source/yoga.rst
new file mode 100644
index 0000000000..43cafdea89
--- /dev/null
+++ b/releasenotes/source/yoga.rst
@@ -0,0 +1,6 @@
+=========================
+Yoga Series Release Notes
+=========================
+
+.. release-notes::
+   :branch: unmaintained/yoga
diff --git a/releasenotes/source/zed.rst b/releasenotes/source/zed.rst
new file mode 100644
index 0000000000..6cc2b1554c
--- /dev/null
+++ b/releasenotes/source/zed.rst
@@ -0,0 +1,6 @@
+========================
+Zed Series Release Notes
+========================
+
+.. release-notes::
+   :branch: unmaintained/zed
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000..4ae431c958
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+# The order of packages is significant, because pip processes them in the order
+# of appearance. Changing the order has an impact on the overall integration
+# process, which may cause wedges in the gate later.
+
+eventlet>=0.25.0,!=0.34.3               # MIT
+greenlet>=0.4.14
+PasteDeploy>=2.0.0
+lxml>=4.2.3
+requests>=2.14.2                        # Apache-2.0
+xattr>=0.7.2;sys_platform!='win32'      # MIT
+PyECLib>=1.3.1,!=1.6.2,!=1.6.3          # BSD
+cryptography>=2.0.2                     # BSD/Apache-2.0
+dnspython>=1.15.0                       # http://www.dnspython.org/LICENSE
diff --git a/roles/additional-keystone-users/tasks/main.yaml b/roles/additional-keystone-users/tasks/main.yaml
new file mode 100644
index 0000000000..e2b9879853
--- /dev/null
+++ b/roles/additional-keystone-users/tasks/main.yaml
@@ -0,0 +1,169 @@
+- name: Set S3 endpoint
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_storage_url
+    value: http://localhost:8080
+  become: true
+
+- name: Create primary S3 user
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    credential create --type ec2 --project swiftprojecttest1 swiftusertest1
+    '{"access": "s3-user1", "secret": "s3-secret1"}'
+- name: Add primary S3 user to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_access_key
+    value: s3-user1
+  become: true
+- name: Add primary S3 user secret to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_secret_key
+    value: s3-secret1
+  become: true
+- name: Add primary S3 user to test.conf for cross-compat tests
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: access_key1
+    value: s3-user1
+  become: true
+- name: Add primary S3 user secret to test.conf for cross-compat tests
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: secret_key1
+    value: s3-secret1
+  become: true
+
+- name: Clear secondary S3 user from test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_access_key2
+    value: ""
+  become: true
+- name: Clear secondary S3 user from test.conf for cross-compat tests
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: access_key2
+    value: ""
+  become: true
+
+- name: Create restricted S3 user
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    credential create --type ec2 --project swiftprojecttest1 swiftusertest3
+    '{"access": "s3-user3", "secret": "s3-secret3"}'
+- name: Add restricted S3 user to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_access_key3
+    value: s3-user3
+  become: true
+- name: Add restricted S3 user secret to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: s3_secret_key3
+    value: s3-secret3
+  become: true
+- name: Add restricted S3 user to test.conf for cross-compat tests
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: access_key3
+    value: s3-user3
+  become: true
+- name: Add restricted S3 user secret to test.conf for cross-compat tests
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: secret_key3
+    value: s3-secret3
+  become: true
+
+- name: Create service role
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    role create swift_service
+- name: Create service project
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    project create --domain default swiftprojecttest5
+- name: Create service user
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    user create --domain default --project swiftprojecttest5 swiftusertest5 --password testing5
+- name: Assign service role
+  shell: >
+    openstack --os-auth-url http://localhost/identity
+    --os-project-domain-id default --os-project-name admin
+    --os-user-domain-id default --os-username admin
+    --os-password secretadmin
+    role add --project swiftprojecttest5 --user swiftusertest5 swift_service
+
+- name: Add service_roles to proxy-server.conf
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:keystoneauth
+    option: SERVICE_KEY_service_roles
+    value: swift_service
+  become: true
+- name: Update reseller prefixes in proxy-server.conf
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:keystoneauth
+    option: reseller_prefix
+    value: AUTH, SERVICE_KEY
+  become: true
+
+- name: Add service account to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: account5
+    value: swiftprojecttest5
+  become: true
+- name: Add service user to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: username5
+    value: swiftusertest5
+  become: true
+- name: Add service password to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: password5
+    value: testing5
+  become: true
+- name: Add service prefix to test.conf
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: service_prefix
+    value: SERVICE_KEY
+  become: true
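For reference, a small sketch of reading back the values these tasks write into /etc/swift/test.conf; the functional test suite has its own config loader, so plain configparser is used here only to show the resulting layout::

    import configparser

    conf = configparser.ConfigParser()
    conf.read('/etc/swift/test.conf')

    # Options written by the tasks above.
    print(conf.get('func_test', 's3_access_key'))    # 's3-user1'
    print(conf.get('func_test', 's3_access_key3'))   # 's3-user3' (restricted user)
    print(conf.get('func_test', 'service_prefix'))   # 'SERVICE_KEY'
    print(conf.get('s3api_test', 'access_key1'))     # 's3-user1' (cross-compat tests)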
diff --git a/roles/additional-tempauth-users/tasks/main.yaml b/roles/additional-tempauth-users/tasks/main.yaml
new file mode 100644
index 0000000000..204a56011f
--- /dev/null
+++ b/roles/additional-tempauth-users/tasks/main.yaml
@@ -0,0 +1,47 @@
+- name: Configure service auth prefix for tempauth tests
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:tempauth
+    option: reseller_prefix
+    value: TEMPAUTH, SERVICE_TA
+  become: true
+
+- name: Configure service group for tempauth tests
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:tempauth
+    option: SERVICE_TA_require_group
+    value: service
+  become: true
+
+- name: Configure service account for tempauth tests
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: func_test
+    option: account5
+    value: test5
+  become: true
+
+- name: Configure service username for tempauth tests
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: func_test
+    option: username5
+    value: tester5
+  become: true
+
+- name: Configure service user password for tempauth tests
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: func_test
+    option: password5
+    value: testing5
+  become: true
+
+- name: Configure service prefix for tempauth tests
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: func_test
+    option: service_prefix
+    value: SERVICE_TA
+  become: true
diff --git a/roles/dsvm-additional-middlewares/tasks/main.yaml b/roles/dsvm-additional-middlewares/tasks/main.yaml
new file mode 100644
index 0000000000..ed30b12435
--- /dev/null
+++ b/roles/dsvm-additional-middlewares/tasks/main.yaml
@@ -0,0 +1,150 @@
+- name: Add domain_remap and etag-quoter to pipeline
+  replace:
+    path: "/etc/swift/proxy-server.conf"
+    regexp: "cache listing_formats"
+    replace: "cache domain_remap etag-quoter listing_formats"
+  become: true
+
+- name: Set domain_remap domain
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:domain_remap
+    option: storage_domain
+    value: example.com
+  become: true
+
+- name: Set storage_domain in test.conf (for Keystone tests)
+  ini_file:
+    path: /etc/swift/test.conf
+    section: func_test
+    option: storage_domain
+    value: example.com
+  become: true
+
+- name: Turn off s3_acl_tests_enabled in test.conf (for Keystone tests)
+  ini_file:
+    path: /etc/swift/test.conf
+    section: s3api_test
+    option: s3_acl_tests_enabled
+    value: false
+  become: true
+
+- name: Set storage_domain in test/sample.conf (for tempauth tests)
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: func_test
+    option: storage_domain
+    value: example.com
+  become: true
+
+- name: Turn off s3_acl_tests_enabled in test/sample.conf (for tempauth tests)
+  ini_file:
+    path: "{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}/../swift/test/sample.conf"
+    section: s3api_test
+    option: s3_acl_tests_enabled
+    value: false
+  become: true
+
+- name: Enable object versioning
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:versioned_writes
+    option: allow_object_versioning
+    value: true
+  become: true
+
+- name: Configure s3api force_swift_request_proxy_log
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3api
+    option: force_swift_request_proxy_log
+    value: true
+  become: true
+
+- name: "Configure s3token: auth_url"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: auth_url
+    value: http://localhost/identity
+  become: true
+
+- name: "Configure s3token: project_domain_name"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: project_domain_name
+    value: Default
+  become: true
+
+- name: "Configure s3token: project_name"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: project_name
+    value: service
+  become: true
+
+- name: "Configure s3token: user_domain_name"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: user_domain_name
+    value: Default
+  become: true
+
+- name: "Configure s3token: username"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: username
+    value: swift
+  become: true
+
+- name: "Configure s3token: password"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: password
+    value: secretservice
+  become: true
+
+- name: "Configure s3token: auth_type"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: auth_type
+    value: password
+  become: true
+
+- name: "Configure s3token: interface"
+  ini_file:
+    path: /etc/swift/proxy-server.conf
+    section: filter:s3token
+    option: interface
+    value: public
+  become: true
+
+- name: Copy ring for Policy-1
+  copy:
+    remote_src: true
+    src: /etc/swift/object.ring.gz
+    dest: /etc/swift/object-1.ring.gz
+  become: true
+
+- name: Add Policy-1 to swift.conf
+  ini_file:
+    path: /etc/swift/swift.conf
+    section: storage-policy:1
+    option: name
+    value: Policy-1
+  become: true
+
+- name: Restart service to pick up config changes
+  command: systemctl restart devstack@s-{{ item }}.service
+  become: true
+  with_items:
+    - proxy
+    - account
+    - container
+    - object
diff --git a/setup.cfg b/setup.cfg
index 409348eb71..830268bb3f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,31 +1,148 @@
-[build_sphinx]
-all_files = 1
-build-dir = doc/build
-source-dir = doc/source
-
-[egg_info]
-tag_build = 
-tag_date = 0
-tag_svn_revision = 0
-
-[compile_catalog]
-directory = locale
-domain = swift
-
-[update_catalog]
-domain = swift
-output_dir = locale
-input_file = locale/swift.pot
-
-[extract_messages]
-keywords = _ l_ lazy_gettext
-mapping_file = babel.cfg
-output_file = locale/swift.pot
-
-[nosetests]
-exe=1
-verbosity=2
-detailed-errors=1
-cover-package = swift
-cover-html = true
-cover-erase = true
+[metadata]
+name = swift
+summary = OpenStack Object Storage
+description_file =
+    README.rst
+long_description_content_type = text/x-rst
+author = OpenStack
+author_email = openstack-discuss@lists.openstack.org
+url = https://docs.openstack.org/swift/latest/
+python_requires = >=3.7
+project_urls =
+    Documentation = https://docs.openstack.org/swift/latest/
+    Bug Tracker = https://bugs.launchpad.net/swift
+    Source Code = https://opendev.org/openstack/swift/
+    Release Notes = https://opendev.org/openstack/swift/src/branch/master/CHANGELOG
+classifier =
+    Development Status :: 5 - Production/Stable
+    Environment :: OpenStack
+    Intended Audience :: Information Technology
+    Intended Audience :: System Administrators
+    License :: OSI Approved :: Apache Software License
+    Operating System :: POSIX :: Linux
+    Programming Language :: Python
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+
+[pbr]
+skip_authors = True
+skip_changelog = True
+skip_reno = True
+
+[options]
+packages =
+    swift
+
+[extras]
+kms_keymaster =
+    oslo.config>=5.2.0 # Apache-2.0
+    castellan>=0.13.0 # Apache-2.0
+
+kmip_keymaster =
+    pykmip>=0.7.0 # Apache-2.0
+
+keystone =
+    keystonemiddleware>=4.17.0
+
+[entry_points]
+console_scripts =
+    swift-account-audit = swift.cli.account_audit:main
+    swift-account-auditor = swift.account.auditor:main
+    swift-account-info = swift.cli.info:account_main
+    swift-account-reaper = swift.account.reaper:main
+    swift-account-replicator = swift.account.replicator:main
+    swift-account-server = swift.account.server:main
+    swift-config = swift.cli.config:main
+    swift-container-auditor = swift.container.auditor:main
+    swift-container-deleter = swift.cli.container_deleter:main
+    swift-container-info = swift.cli.info:container_main
+    swift-container-replicator = swift.container.replicator:main
+    swift-container-reconciler = swift.container.reconciler:main
+    swift-container-server = swift.container.server:main
+    swift-container-sharder = swift.container.sharder:main
+    swift-container-sync = swift.container.sync:main
+    swift-container-updater = swift.container.updater:main
+    swift-dispersion-populate = swift.cli.dispersion_populate:main
+    swift-dispersion-report = swift.cli.dispersion_report:main
+    swift-drive-audit = swift.cli.drive_audit:main
+    swift-form-signature = swift.cli.form_signature:main
+    swift-get-nodes = swift.cli.get_nodes:main
+    swift-init = swift.common.manager:main
+    swift-manage-shard-ranges = swift.cli.manage_shard_ranges:main
+    swift-object-auditor = swift.obj.auditor:main
+    swift-object-expirer = swift.obj.expirer:main
+    swift-object-info = swift.cli.info:obj_main
+    swift-object-reconstructor = swift.obj.reconstructor:main
+    swift-object-relinker = swift.cli.relinker:main
+    swift-object-replicator = swift.obj.replicator:main
+    swift-object-server = swift.obj.server:main
+    swift-object-updater = swift.obj.updater:main
+    swift-oldies = swift.cli.oldies:main
+    swift-orphans = swift.cli.orphans:main
+    swift-proxy-server = swift.proxy.server:main
+    swift-recon = swift.cli.recon:main
+    swift-recon-cron = swift.cli.recon_cron:main
+    swift-reconciler-enqueue = swift.cli.reconciler_enqueue:main
+    swift-reload = swift.cli.reload:main
+    swift-ring-builder = swift.cli.ringbuilder:error_handling_main
+    swift-ring-builder-analyzer = swift.cli.ring_builder_analyzer:main
+    swift-ring-composer = swift.cli.ringcomposer:main
+
+paste.app_factory =
+    proxy = swift.proxy.server:app_factory
+    object = swift.obj.server:app_factory
+    mem_object = swift.obj.mem_server:app_factory
+    container = swift.container.server:app_factory
+    account = swift.account.server:app_factory
+
+paste.filter_factory =
+    healthcheck = swift.common.middleware.healthcheck:filter_factory
+    crossdomain = swift.common.middleware.crossdomain:filter_factory
+    memcache = swift.common.middleware.memcache:filter_factory
+    read_only = swift.common.middleware.read_only:filter_factory
+    ratelimit = swift.common.middleware.ratelimit:filter_factory
+    backend_ratelimit = swift.common.middleware.backend_ratelimit:filter_factory
+    cname_lookup = swift.common.middleware.cname_lookup:filter_factory
+    catch_errors = swift.common.middleware.catch_errors:filter_factory
+    domain_remap = swift.common.middleware.domain_remap:filter_factory
+    staticweb = swift.common.middleware.staticweb:filter_factory
+    tempauth = swift.common.middleware.tempauth:filter_factory
+    keystoneauth = swift.common.middleware.keystoneauth:filter_factory
+    recon = swift.common.middleware.recon:filter_factory
+    tempurl = swift.common.middleware.tempurl:filter_factory
+    formpost = swift.common.middleware.formpost:filter_factory
+    name_check = swift.common.middleware.name_check:filter_factory
+    bulk = swift.common.middleware.bulk:filter_factory
+    container_quotas = swift.common.middleware.container_quotas:filter_factory
+    account_quotas = swift.common.middleware.account_quotas:filter_factory
+    proxy_logging = swift.common.middleware.proxy_logging:filter_factory
+    dlo = swift.common.middleware.dlo:filter_factory
+    slo = swift.common.middleware.slo:filter_factory
+    list_endpoints = swift.common.middleware.list_endpoints:filter_factory
+    gatekeeper = swift.common.middleware.gatekeeper:filter_factory
+    container_sync = swift.common.middleware.container_sync:filter_factory
+    xprofile = swift.common.middleware.xprofile:filter_factory
+    versioned_writes = swift.common.middleware.versioned_writes:filter_factory
+    copy = swift.common.middleware.copy:filter_factory
+    keymaster = swift.common.middleware.crypto.keymaster:filter_factory
+    encryption = swift.common.middleware.crypto:filter_factory
+    kms_keymaster = swift.common.middleware.crypto.kms_keymaster:filter_factory
+    kmip_keymaster = swift.common.middleware.crypto.kmip_keymaster:filter_factory
+    listing_formats = swift.common.middleware.listing_formats:filter_factory
+    symlink = swift.common.middleware.symlink:filter_factory
+    s3api = swift.common.middleware.s3api.s3api:filter_factory
+    s3token = swift.common.middleware.s3api.s3token:filter_factory
+    etag_quoter = swift.common.middleware.etag_quoter:filter_factory
+
+swift.diskfile =
+    replication.fs = swift.obj.diskfile:DiskFileManager
+    erasure_coding.fs = swift.obj.diskfile:ECDiskFileManager
+
+swift.object_audit_watcher =
+    dark_data = swift.obj.watchers.dark_data:DarkDataWatcher
+
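The console_scripts and paste factories registered above are ordinary Python entry points, so they can also be resolved programmatically; a brief sketch using importlib.metadata (the ``group=`` keyword needs Python 3.10+, older interpreters expose a dict-style API instead)::

    from importlib.metadata import entry_points

    # Resolve the proxy app factory that setup.cfg registers for PasteDeploy.
    proxy_ep = next(ep for ep in entry_points(group='paste.app_factory')
                    if ep.name == 'proxy')
    app_factory = proxy_ep.load()          # swift.proxy.server:app_factory

    # Filter factories resolve the same way, e.g. the healthcheck middleware.
    hc_ep = next(ep for ep in entry_points(group='paste.filter_factory')
                 if ep.name == 'healthcheck')
    healthcheck_factory = hc_ep.load()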
diff --git a/setup.py b/setup.py
index 8c3dfee83f..22cfdce874 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
-#!/usr/bin/python
-# Copyright (c) 2010-2012 OpenStack, LLC.
+#!/usr/bin/env python3
+# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,89 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from setuptools import setup, find_packages
+import setuptools
 
-from swift import __canonical_version__ as version
-
-
-name = 'swift'
-
-
-setup(
-    name=name,
-    version=version,
-    description='Swift',
-    license='Apache License (2.0)',
-    author='OpenStack, LLC.',
-    author_email='openstack-admins@lists.launchpad.net',
-    url='https://launchpad.net/swift',
-    packages=find_packages(exclude=['test', 'bin']),
-    test_suite='nose.collector',
-    classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: Apache Software License',
-        'Operating System :: POSIX :: Linux',
-        'Programming Language :: Python :: 2.6',
-        'Environment :: No Input/Output (Daemon)',
-        'Environment :: OpenStack',
-    ],
-    install_requires=[],  # removed for better compat
-    scripts=[
-        'bin/swift-account-audit',
-        'bin/swift-account-auditor',
-        'bin/swift-account-reaper',
-        'bin/swift-account-replicator',
-        'bin/swift-account-server',
-        'bin/swift-bench',
-        'bin/swift-bench-client',
-        'bin/swift-container-auditor',
-        'bin/swift-container-replicator',
-        'bin/swift-container-server',
-        'bin/swift-container-sync',
-        'bin/swift-container-updater',
-        'bin/swift-dispersion-populate',
-        'bin/swift-dispersion-report',
-        'bin/swift-drive-audit',
-        'bin/swift-form-signature',
-        'bin/swift-get-nodes',
-        'bin/swift-init',
-        'bin/swift-object-auditor',
-        'bin/swift-object-expirer',
-        'bin/swift-object-info',
-        'bin/swift-object-replicator',
-        'bin/swift-object-server',
-        'bin/swift-object-updater',
-        'bin/swift-oldies',
-        'bin/swift-orphans',
-        'bin/swift-proxy-server',
-        'bin/swift-recon',
-        'bin/swift-recon-cron',
-        'bin/swift-ring-builder',
-        'bin/swift-temp-url',
-    ],
-    entry_points={
-        'paste.app_factory': [
-            'proxy=swift.proxy.server:app_factory',
-            'object=swift.obj.server:app_factory',
-            'container=swift.container.server:app_factory',
-            'account=swift.account.server:app_factory',
-        ],
-        'paste.filter_factory': [
-            'healthcheck=swift.common.middleware.healthcheck:filter_factory',
-            'memcache=swift.common.middleware.memcache:filter_factory',
-            'ratelimit=swift.common.middleware.ratelimit:filter_factory',
-            'cname_lookup=swift.common.middleware.cname_lookup:filter_factory',
-            'catch_errors=swift.common.middleware.catch_errors:filter_factory',
-            'domain_remap=swift.common.middleware.domain_remap:filter_factory',
-            'staticweb=swift.common.middleware.staticweb:filter_factory',
-            'tempauth=swift.common.middleware.tempauth:filter_factory',
-            'keystoneauth=swift.common.middleware.keystoneauth:filter_factory',
-            'recon=swift.common.middleware.recon:filter_factory',
-            'tempurl=swift.common.middleware.tempurl:filter_factory',
-            'formpost=swift.common.middleware.formpost:filter_factory',
-            'name_check=swift.common.middleware.name_check:filter_factory',
-            'proxy_logging=swift.common.middleware.proxy_logging:'
-            'filter_factory',
-        ],
-    },
-)
+setuptools.setup(
+    setup_requires=['pbr'],
+    pbr=True)
diff --git a/swift/__init__.py b/swift/__init__.py
index b8dc14e47b..280a902e39 100644
--- a/swift/__init__.py
+++ b/swift/__init__.py
@@ -1,21 +1,55 @@
-import gettext
+# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+import warnings
 
-class Version(object):
-    def __init__(self, canonical_version, final):
-        self.canonical_version = canonical_version
-        self.final = final
+__version__ = None
 
-    @property
-    def pretty_version(self):
-        if self.final:
-            return self.canonical_version
-        else:
-            return '%s-dev' % (self.canonical_version,)
+# First, try to get our version out of PKG-INFO. If we're installed,
+# this'll let us find our version without pulling in pbr. After all, if
+# we're installed on a system, we're not in a Git-managed source tree, so
+# pbr doesn't really buy us anything.
+try:
+    import importlib.metadata
+except ImportError:
+    # python < 3.8
+    import pkg_resources
+    try:
+        __version__ = __canonical_version__ = pkg_resources.get_provider(
+            pkg_resources.Requirement.parse('swift')).version
+    except pkg_resources.DistributionNotFound:
+        pass
+else:
+    try:
+        __version__ = __canonical_version__ = importlib.metadata.distribution(
+            'swift').version
+    except importlib.metadata.PackageNotFoundError:
+        pass
 
+if __version__ is None:
+    # No PKG-INFO? We're probably running from a checkout, then. Let pbr do
+    # its thing to figure out a version number.
+    import pbr.version
+    _version_info = pbr.version.VersionInfo('swift')
+    __version__ = _version_info.release_string()
+    __canonical_version__ = _version_info.version_string()
 
-_version = Version('1.7.6', False)
-__version__ = _version.pretty_version
-__canonical_version__ = _version.canonical_version
 
-gettext.install('swift')
+warnings.filterwarnings('ignore', module='cryptography|OpenSSL', message=(
+    'Python 2 is no longer supported by the Python core team. '
+    'Support for it is now deprecated in cryptography'))
+warnings.filterwarnings('ignore', message=(
+    'Python 3.6 is no longer supported by the Python core team. '
+    'Therefore, support for it is deprecated in cryptography'))
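Once resolved by the logic above, the version is just a module attribute; a trivial usage sketch (the printed values are examples only)::

    import swift

    print(swift.__version__)            # e.g. '2.30.1.dev42' from a git checkout
    print(swift.__canonical_version__)  # e.g. '2.30.1'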
diff --git a/swift/account/auditor.py b/swift/account/auditor.py
index eeb73d8dfe..0b0799c5bc 100644
--- a/swift/account/auditor.py
+++ b/swift/account/auditor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,112 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import time
-from random import random
 
-import swift.common.db
-from swift.account import server as account_server
-from swift.common.db import AccountBroker
-from swift.common.utils import get_logger, audit_location_generator, \
-    config_true_value, dump_recon_cache
-from swift.common.daemon import Daemon
+from swift.account.backend import AccountBroker
+from swift.common.exceptions import InvalidAccountInfo
+from swift.common.daemon import run_daemon
+from swift.common.db_auditor import DatabaseAuditor
+from swift.common.utils import parse_options
 
-from eventlet import Timeout
 
-
-class AccountAuditor(Daemon):
+class AccountAuditor(DatabaseAuditor):
     """Audit accounts."""
 
-    def __init__(self, conf):
-        self.conf = conf
-        self.logger = get_logger(conf, log_route='account-auditor')
-        self.devices = conf.get('devices', '/srv/node')
-        self.mount_check = config_true_value(conf.get('mount_check', 'true'))
-        self.interval = int(conf.get('interval', 1800))
-        self.account_passes = 0
-        self.account_failures = 0
-        swift.common.db.DB_PREALLOCATION = \
-            config_true_value(conf.get('db_preallocation', 'f'))
-        self.recon_cache_path = conf.get('recon_cache_path',
-                                         '/var/cache/swift')
-        self.rcache = os.path.join(self.recon_cache_path, "account.recon")
-
-    def _one_audit_pass(self, reported):
-        all_locs = audit_location_generator(self.devices,
-                                            account_server.DATADIR,
-                                            mount_check=self.mount_check,
-                                            logger=self.logger)
-        for path, device, partition in all_locs:
-            self.account_audit(path)
-            if time.time() - reported >= 3600:  # once an hour
-                self.logger.info(_('Since %(time)s: Account audits: '
-                                   '%(passed)s passed audit,'
-                                   '%(failed)s failed audit'),
-                                 {'time': time.ctime(reported),
-                                 'passed': self.account_passes,
-                                 'failed': self.account_failures})
-                dump_recon_cache({'account_audits_since': reported,
-                                  'account_audits_passed': self.account_passes,
-                                  'account_audits_failed':
-                                  self.account_failures},
-                                 self.rcache, self.logger)
-                reported = time.time()
-                self.account_passes = 0
-                self.account_failures = 0
-        return reported
-
-    def run_forever(self, *args, **kwargs):
-        """Run the account audit until stopped."""
-        reported = time.time()
-        time.sleep(random() * self.interval)
-        while True:
-            self.logger.info(_('Begin account audit pass.'))
-            begin = time.time()
-            try:
-                reported = self._one_audit_pass(reported)
-            except (Exception, Timeout):
-                self.logger.increment('errors')
-                self.logger.exception(_('ERROR auditing'))
-            elapsed = time.time() - begin
-            if elapsed < self.interval:
-                time.sleep(self.interval - elapsed)
-            self.logger.info(
-                _('Account audit pass completed: %.02fs'), elapsed)
-            dump_recon_cache({'account_auditor_pass_completed': elapsed},
-                             self.rcache, self.logger)
-
-    def run_once(self, *args, **kwargs):
-        """Run the account audit once."""
-        self.logger.info(_('Begin account audit "once" mode'))
-        begin = reported = time.time()
-        self._one_audit_pass(reported)
-        elapsed = time.time() - begin
-        self.logger.info(
-            _('Account audit "once" mode completed: %.02fs'), elapsed)
-        dump_recon_cache({'account_auditor_pass_completed': elapsed},
-                         self.rcache, self.logger)
-
-    def account_audit(self, path):
-        """
-        Audits the given account path
-
-        :param path: the path to an account db
-        """
-        start_time = time.time()
-        try:
-            if not path.endswith('.db'):
-                return
-            broker = AccountBroker(path)
-            if not broker.is_deleted():
-                info = broker.get_info()
-                self.logger.increment('passes')
-                self.account_passes += 1
-                self.logger.debug(_('Audit passed for %s') % broker.db_file)
-        except (Exception, Timeout):
-            self.logger.increment('failures')
-            self.account_failures += 1
-            self.logger.exception(_('ERROR Could not get account info %s'),
-                                  (broker.db_file))
-        self.logger.timing_since('timing', start_time)
+    server_type = "account"
+    broker_class = AccountBroker
+
+    def _audit(self, info, broker):
+        # Validate per policy counts
+        policy_stats = broker.get_policy_stats(do_migrations=True)
+        policy_totals = {
+            'container_count': 0,
+            'object_count': 0,
+            'bytes_used': 0,
+        }
+        for policy_stat in policy_stats.values():
+            for key in policy_totals:
+                policy_totals[key] += policy_stat[key]
+
+        for key in policy_totals:
+            if policy_totals[key] == info[key]:
+                continue
+            return InvalidAccountInfo(
+                'The total %(key)s for the account %(account)s (%(total)s) '
+                'does not match the sum of %(key)s across policies (%(sum)s)'
+                % {'key': key, 'account': info.get('account'),
+                   'total': info[key], 'sum': policy_totals[key]})
+
+
+def main():
+    conf_file, options = parse_options(once=True)
+    run_daemon(AccountAuditor, conf_file, **options)
+
+
+if __name__ == '__main__':
+    main()
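Besides the swift-account-auditor console script wired up in setup.cfg, the auditor can be driven programmatically; a minimal sketch, assuming a standard account-server config with an [account-auditor] section at a hypothetical path::

    from swift.account.auditor import AccountAuditor
    from swift.common.utils import readconf

    conf = readconf('/etc/swift/account-server.conf', 'account-auditor')
    AccountAuditor(conf).run_once()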
diff --git a/swift/account/backend.py b/swift/account/backend.py
new file mode 100644
index 0000000000..fc8201cb0b
--- /dev/null
+++ b/swift/account/backend.py
@@ -0,0 +1,662 @@
+# Copyright (c) 2010-2012 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Pluggable Back-end for Account Server
+"""
+
+
+import sqlite3
+
+from swift.common.utils import Timestamp, RESERVED_BYTE
+from swift.common.db import DatabaseBroker, zero_like
+
+DATADIR = 'accounts'
+
+
+POLICY_STAT_TRIGGER_SCRIPT = """
+    CREATE TRIGGER container_insert_ps AFTER INSERT ON container
+    BEGIN
+        INSERT OR IGNORE INTO policy_stat
+            (storage_policy_index, container_count, object_count, bytes_used)
+            VALUES (new.storage_policy_index, 0, 0, 0);
+        UPDATE policy_stat
+        SET container_count = container_count + (1 - new.deleted),
+            object_count = object_count + new.object_count,
+            bytes_used = bytes_used + new.bytes_used
+        WHERE storage_policy_index = new.storage_policy_index;
+    END;
+    CREATE TRIGGER container_delete_ps AFTER DELETE ON container
+    BEGIN
+        UPDATE policy_stat
+        SET container_count = container_count - (1 - old.deleted),
+            object_count = object_count - old.object_count,
+            bytes_used = bytes_used - old.bytes_used
+        WHERE storage_policy_index = old.storage_policy_index;
+    END;
+
+"""
+
+
+class AccountBroker(DatabaseBroker):
+    """Encapsulates working with an account database."""
+    db_type = 'account'
+    db_contains_type = 'container'
+    db_reclaim_timestamp = 'delete_timestamp'
+
+    def _initialize(self, conn, put_timestamp, **kwargs):
+        """
+        Create a brand new account database (tables, indices, triggers, etc.)
+
+        :param conn: DB connection object
+        :param put_timestamp: put timestamp
+        """
+        if not self.account:
+            raise ValueError(
+                'Attempting to create a new database with no account set')
+        self.create_container_table(conn)
+        self.create_account_stat_table(conn, put_timestamp)
+        self.create_policy_stat_table(conn)
+
+    def create_container_table(self, conn):
+        """
+        Create container table which is specific to the account DB.
+
+        :param conn: DB connection object
+        """
+        conn.executescript("""
+            CREATE TABLE container (
+                ROWID INTEGER PRIMARY KEY AUTOINCREMENT,
+                name TEXT,
+                put_timestamp TEXT,
+                delete_timestamp TEXT,
+                object_count INTEGER,
+                bytes_used INTEGER,
+                deleted INTEGER DEFAULT 0,
+                storage_policy_index INTEGER DEFAULT 0
+            );
+
+            CREATE INDEX ix_container_deleted_name ON
+                container (deleted, name);
+
+            CREATE TRIGGER container_insert AFTER INSERT ON container
+            BEGIN
+                UPDATE account_stat
+                SET container_count = container_count + (1 - new.deleted),
+                    object_count = object_count + new.object_count,
+                    bytes_used = bytes_used + new.bytes_used,
+                    hash = chexor(hash, new.name,
+                                  new.put_timestamp || '-' ||
+                                    new.delete_timestamp || '-' ||
+                                    new.object_count || '-' || new.bytes_used);
+            END;
+
+            CREATE TRIGGER container_update BEFORE UPDATE ON container
+            BEGIN
+                SELECT RAISE(FAIL, 'UPDATE not allowed; DELETE and INSERT');
+            END;
+
+
+            CREATE TRIGGER container_delete AFTER DELETE ON container
+            BEGIN
+                UPDATE account_stat
+                SET container_count = container_count - (1 - old.deleted),
+                    object_count = object_count - old.object_count,
+                    bytes_used = bytes_used - old.bytes_used,
+                    hash = chexor(hash, old.name,
+                                  old.put_timestamp || '-' ||
+                                    old.delete_timestamp || '-' ||
+                                    old.object_count || '-' || old.bytes_used);
+            END;
+        """ + POLICY_STAT_TRIGGER_SCRIPT)
+
+    def create_account_stat_table(self, conn, put_timestamp):
+        """
+        Create account_stat table which is specific to the account DB.
+        Not a part of Pluggable Back-ends, internal to the baseline code.
+
+        :param conn: DB connection object
+        :param put_timestamp: put timestamp
+        """
+        conn.executescript("""
+            CREATE TABLE account_stat (
+                account TEXT,
+                created_at TEXT,
+                put_timestamp TEXT DEFAULT '0',
+                delete_timestamp TEXT DEFAULT '0',
+                container_count INTEGER,
+                object_count INTEGER DEFAULT 0,
+                bytes_used INTEGER DEFAULT 0,
+                hash TEXT default '00000000000000000000000000000000',
+                id TEXT,
+                status TEXT DEFAULT '',
+                status_changed_at TEXT DEFAULT '0',
+                metadata TEXT DEFAULT ''
+            );
+
+            INSERT INTO account_stat (container_count) VALUES (0);
+        """)
+
+        conn.execute('''
+            UPDATE account_stat SET account = ?, created_at = ?, id = ?,
+                   put_timestamp = ?, status_changed_at = ?
+            ''', (self.account, Timestamp.now().internal, self._new_db_id(),
+                  put_timestamp, put_timestamp))
+
+    def create_policy_stat_table(self, conn):
+        """
+        Create policy_stat table which is specific to the account DB.
+        Not a part of Pluggable Back-ends, internal to the baseline code.
+
+        :param conn: DB connection object
+        """
+        conn.executescript("""
+            CREATE TABLE policy_stat (
+                storage_policy_index INTEGER PRIMARY KEY,
+                container_count INTEGER DEFAULT 0,
+                object_count INTEGER DEFAULT 0,
+                bytes_used INTEGER DEFAULT 0
+            );
+            INSERT OR IGNORE INTO policy_stat (
+                storage_policy_index, container_count, object_count,
+                bytes_used
+            )
+            SELECT 0, container_count, object_count, bytes_used
+            FROM account_stat
+            WHERE container_count > 0;
+        """)
+
+    def get_db_version(self, conn):
+        if self._db_version == -1:
+            self._db_version = 0
+            for row in conn.execute('''
+                    SELECT name FROM sqlite_master
+                    WHERE name = 'ix_container_deleted_name' '''):
+                self._db_version = 1
+        return self._db_version
+
+    def _commit_puts_load(self, item_list, entry):
+        """See :func:`swift.common.db.DatabaseBroker._commit_puts_load`"""
+        # check to see if the update includes policy_index or not
+        (name, put_timestamp, delete_timestamp, object_count, bytes_used,
+         deleted) = entry[:6]
+        if len(entry) > 6:
+            storage_policy_index = entry[6]
+        else:
+            # legacy support during upgrade until first non legacy storage
+            # policy is defined
+            storage_policy_index = 0
+        item_list.append(
+            {'name': name,
+             'put_timestamp': put_timestamp,
+             'delete_timestamp': delete_timestamp,
+             'object_count': object_count,
+             'bytes_used': bytes_used,
+             'deleted': deleted,
+             'storage_policy_index': storage_policy_index})
+
+    def empty(self):
+        """
+        Check if the account DB is empty.
+
+        :returns: True if the database has no active containers.
+        """
+        self._commit_puts_stale_ok()
+        with self.get() as conn:
+            row = conn.execute(
+                'SELECT container_count from account_stat').fetchone()
+            return zero_like(row[0])
+
+    def make_tuple_for_pickle(self, record):
+        return (record['name'], record['put_timestamp'],
+                record['delete_timestamp'], record['object_count'],
+                record['bytes_used'], record['deleted'],
+                record['storage_policy_index'])
+
+    def put_container(self, name, put_timestamp, delete_timestamp,
+                      object_count, bytes_used, storage_policy_index):
+        """
+        Create a container with the given attributes.
+
+        :param name: name of the container to create (a native string)
+        :param put_timestamp: put_timestamp of the container to create
+        :param delete_timestamp: delete_timestamp of the container to create
+        :param object_count: number of objects in the container
+        :param bytes_used: number of bytes used by the container
+        :param storage_policy_index:  the storage policy for this container
+        """
+        if Timestamp(delete_timestamp) > Timestamp(put_timestamp) and \
+                zero_like(object_count):
+            deleted = 1
+        else:
+            deleted = 0
+        record = {'name': name, 'put_timestamp': put_timestamp,
+                  'delete_timestamp': delete_timestamp,
+                  'object_count': object_count,
+                  'bytes_used': bytes_used,
+                  'deleted': deleted,
+                  'storage_policy_index': storage_policy_index}
+        self.put_record(record)
+
+    def _is_deleted_info(self, status, container_count, delete_timestamp,
+                         put_timestamp):
+        """
+        Apply delete logic to database info.
+
+        :returns: True if the DB is considered to be deleted, False otherwise
+        """
+        return status == 'DELETED' or zero_like(container_count) and (
+            Timestamp(delete_timestamp) > Timestamp(put_timestamp))
+
+    def _is_deleted(self, conn):
+        """
+        Check account_stat table and evaluate info.
+
+        :param conn: database conn
+
+        :returns: True if the DB is considered to be deleted, False otherwise
+        """
+        info = conn.execute('''
+            SELECT put_timestamp, delete_timestamp, container_count, status
+            FROM account_stat''').fetchone()
+        return self._is_deleted_info(**info)
+
+    def is_status_deleted(self):
+        """Only returns true if the status field is set to DELETED."""
+        with self.get() as conn:
+            row = conn.execute('''
+                SELECT put_timestamp, delete_timestamp, status
+                FROM account_stat''').fetchone()
+            return row['status'] == "DELETED" or (
+                row['delete_timestamp'] > row['put_timestamp'])
+
+    def get_policy_stats(self, do_migrations=False):
+        """
+        Get global policy stats for the account.
+
+        :param do_migrations: boolean, if True the policy stat dicts will
+                              always include the 'container_count' key;
+                              otherwise it may be omitted on legacy databases
+                              until they are migrated.
+
+        :returns: dict of policy stats where the key is the policy index and
+                  the value is a dictionary like {'object_count': M,
+                  'bytes_used': N, 'container_count': L}
+        """
+        columns = [
+            'storage_policy_index',
+            'container_count',
+            'object_count',
+            'bytes_used',
+        ]
+
+        def run_query():
+            return (conn.execute('''
+                SELECT %s
+                FROM policy_stat
+                ''' % ', '.join(columns)).fetchall())
+
+        self._commit_puts_stale_ok()
+        info = []
+        with self.get() as conn:
+            try:
+                info = run_query()
+            except sqlite3.OperationalError as err:
+                if "no such column: container_count" in str(err):
+                    if do_migrations:
+                        self._migrate_add_container_count(conn)
+                    else:
+                        columns.remove('container_count')
+                    info = run_query()
+                elif "no such table: policy_stat" in str(err):
+                    if do_migrations:
+                        self.create_policy_stat_table(conn)
+                        info = run_query()
+                    # else, pass and let the results be empty
+                else:
+                    raise
+
+        policy_stats = {}
+        for row in info:
+            stats = dict(row)
+            key = stats.pop('storage_policy_index')
+            policy_stats[key] = stats
+        return policy_stats
+
+    def get_info(self):
+        """
+        Get global data for the account.
+
+        :returns: dict with keys: account, created_at, put_timestamp,
+                  delete_timestamp, status_changed_at, container_count,
+                  object_count, bytes_used, hash, id
+        """
+        self._commit_puts_stale_ok()
+        with self.get() as conn:
+            data = dict(conn.execute('''
+                SELECT account, created_at, put_timestamp, delete_timestamp,
+                       status_changed_at, container_count, object_count,
+                       bytes_used, hash, id
+                FROM account_stat
+            ''').fetchone())
+        self.account = data['account']
+        return data
+
+    def _populate_instance_cache(self):
+        """
+        Lazily populate self.account (used for logging and other
+        read-mostly paths) by loading the account info on first use;
+        `self.account is None` is the only indicator that it has not
+        been populated yet.
+        """
+        if self.account is None:
+            self.get_info()
+
+    @property
+    def path(self):
+        """
+        Logical namespace path used for logging.
+
+        For AccountBroker this is just the account name; there is no
+        container component.
+        """
+        self._populate_instance_cache()
+        return self.account
+
+    def list_containers_iter(self, limit, marker, end_marker, prefix,
+                             delimiter, reverse=False, allow_reserved=False):
+        """
+        Get a list of containers sorted by name starting at marker onward, up
+        to limit entries. Entries will begin with the prefix and will not have
+        the delimiter after the prefix.
+
+        :param limit: maximum number of entries to get
+        :param marker: marker query
+        :param end_marker: end marker query
+        :param prefix: prefix query
+        :param delimiter: delimiter for query
+        :param reverse: reverse the result order.
+        :param allow_reserved: if True, include names containing the
+                               reserved byte; they are excluded by default
+
+        :returns: list of tuples of (name, object_count, bytes_used,
+                  put_timestamp, storage_policy_index, is_subdir)
+        """
+        delim_force_gte = False
+        if reverse:
+            # Reverse the markers if we are reversing the listing.
+            marker, end_marker = end_marker, marker
+        self._commit_puts_stale_ok()
+        if delimiter and not prefix:
+            prefix = ''
+        if prefix:
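+            # end_prefix is an exclusive upper bound for names beginning
+            # with prefix (e.g. prefix 'abc' -> 'abd'), used below to
+            # bound the SQL range scan.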
+            end_prefix = prefix[:-1] + chr(ord(prefix[-1]) + 1)
+        orig_marker = marker
+        with self.get() as conn:
+            results = []
+            while len(results) < limit:
+                query = """
+                    SELECT name, object_count, bytes_used, put_timestamp,
+                    {storage_policy_index}, 0
+                    FROM container
+                    WHERE """
+                query_args = []
+                if end_marker and (not prefix or end_marker < end_prefix):
+                    query += ' name < ? AND'
+                    query_args.append(end_marker)
+                elif prefix:
+                    query += ' name < ? AND'
+                    query_args.append(end_prefix)
+
+                if delim_force_gte:
+                    query += ' name >= ? AND'
+                    query_args.append(marker)
+                    # Always set back to False
+                    delim_force_gte = False
+                elif marker and (not prefix or marker >= prefix):
+                    query += ' name > ? AND'
+                    query_args.append(marker)
+                elif prefix:
+                    query += ' name >= ? AND'
+                    query_args.append(prefix)
+                if not allow_reserved:
+                    query += ' name >= ? AND'
+                    query_args.append(chr(ord(RESERVED_BYTE) + 1))
+                if self.get_db_version(conn) < 1:
+                    query += ' +deleted = 0'
+                else:
+                    query += ' deleted = 0'
+                query += ' ORDER BY name %s LIMIT ?' % \
+                         ('DESC' if reverse else '')
+                query_args.append(limit - len(results))
+                try:
+                    # First, try querying with the storage policy index.
+                    curs = conn.execute(
+                        query.format(
+                            storage_policy_index="storage_policy_index"),
+                        query_args)
+                except sqlite3.OperationalError as err:
+                    # If the storage policy column is not available,
+                    # the database has not been migrated to the new schema
+                    # with storage_policy_index. Re-run the query with
+                    # storage_policy_index set to 0, which is what
+                    # would be set once the database is migrated.
+                    # TODO(callumdickinson): If support for migrating
+                    # pre-storage policy versions of Swift is dropped,
+                    # then this special handling can be removed.
+                    if "no such column: storage_policy_index" in str(err):
+                        curs = conn.execute(
+                            query.format(storage_policy_index="0"),
+                            query_args)
+                    else:
+                        raise
+                curs.row_factory = None
+
+                # A delimiter is only honoured together with a prefix (an
+                # empty prefix was substituted above if needed); if there
+                # is no delimiter we can simply return the results, since
+                # prefixes are already handled in the SQL statement.
+                if prefix is None or not delimiter:
+                    return [r for r in curs]
+
+                # We have a delimiter and a prefix (possibly empty string) to
+                # handle
+                rowcount = 0
+                for row in curs:
+                    rowcount += 1
+                    name = row[0]
+                    if reverse:
+                        end_marker = name
+                    else:
+                        marker = name
+
+                    if len(results) >= limit:
+                        curs.close()
+                        return results
+                    end = name.find(delimiter, len(prefix))
+                    if end >= 0:
+                        if reverse:
+                            end_marker = name[:end + len(delimiter)]
+                        else:
+                            marker = ''.join([
+                                name[:end],
+                                delimiter[:-1],
+                                chr(ord(delimiter[-1:]) + 1),
+                            ])
+                            # we want result to be inclusive of delim+1
+                            delim_force_gte = True
+                        dir_name = name[:end + len(delimiter)]
+                        if dir_name != orig_marker:
+                            results.append([dir_name, 0, 0, '0', -1, 1])
+                        curs.close()
+                        break
+                    results.append(row)
+                if not rowcount:
+                    break
+            return results
+
+    def merge_items(self, item_list, source=None):
+        """
+        Merge items into the container table.
+
+        :param item_list: list of dictionaries of {'name', 'put_timestamp',
+                          'delete_timestamp', 'object_count', 'bytes_used',
+                          'deleted', 'storage_policy_index'}
+        :param source: if defined, update incoming_sync with the source
+        """
+        def _really_merge_items(conn):
+            max_rowid = -1
+            curs = conn.cursor()
+            for rec in item_list:
+                rec.setdefault('storage_policy_index', 0)  # legacy
+                record = [rec['name'], rec['put_timestamp'],
+                          rec['delete_timestamp'], rec['object_count'],
+                          rec['bytes_used'], rec['deleted'],
+                          rec['storage_policy_index']]
+                query = '''
+                    SELECT name, put_timestamp, delete_timestamp,
+                           object_count, bytes_used, deleted,
+                           storage_policy_index
+                    FROM container WHERE name = ?
+                '''
+                if self.get_db_version(conn) >= 1:
+                    query += ' AND deleted IN (0, 1)'
+                curs_row = curs.execute(query, (rec['name'],))
+                curs_row.row_factory = None
+                row = curs_row.fetchone()
+                if row:
+                    row = list(row)
+                    for i in range(5):
+                        if record[i] is None and row[i] is not None:
+                            record[i] = row[i]
+                    if Timestamp(row[1]) > \
+                       Timestamp(record[1]):  # Keep newest put_timestamp
+                        record[1] = row[1]
+                    if Timestamp(row[2]) > \
+                       Timestamp(record[2]):  # Keep newest delete_timestamp
+                        record[2] = row[2]
+                    # If deleted, mark as such
+                    if Timestamp(record[2]) > Timestamp(record[1]) and \
+                            zero_like(record[3]):
+                        record[5] = 1
+                    else:
+                        record[5] = 0
+                curs.execute('''
+                    DELETE FROM container WHERE name = ? AND
+                                                deleted IN (0, 1)
+                ''', (record[0],))
+                curs.execute('''
+                    INSERT INTO container (name, put_timestamp,
+                        delete_timestamp, object_count, bytes_used,
+                        deleted, storage_policy_index)
+                    VALUES (?, ?, ?, ?, ?, ?, ?)
+                ''', record)
+                if source:
+                    max_rowid = max(max_rowid, rec['ROWID'])
+            if source:
+                try:
+                    curs.execute('''
+                        INSERT INTO incoming_sync (sync_point, remote_id)
+                        VALUES (?, ?)
+                    ''', (max_rowid, source))
+                except sqlite3.IntegrityError:
+                    curs.execute('''
+                        UPDATE incoming_sync
+                        SET sync_point=max(?, sync_point)
+                        WHERE remote_id=?
+                    ''', (max_rowid, source))
+            conn.commit()
+
+        with self.get() as conn:
+            # create the policy stat table if needed and add spi to container
+            try:
+                _really_merge_items(conn)
+            except sqlite3.OperationalError as err:
+                if 'no such column: storage_policy_index' not in str(err):
+                    raise
+                self._migrate_add_storage_policy_index(conn)
+                _really_merge_items(conn)
+
+    def _migrate_add_container_count(self, conn):
+        """
+        Add the container_count column to the 'policy_stat' table and
+        update it
+
+        :param conn: DB connection object
+        """
+        # add the container_count column
+        curs = conn.cursor()
+        curs.executescript('''
+            DROP TRIGGER container_delete_ps;
+            DROP TRIGGER container_insert_ps;
+            ALTER TABLE policy_stat
+            ADD COLUMN container_count INTEGER DEFAULT 0;
+        ''' + POLICY_STAT_TRIGGER_SCRIPT)
+
+        # Keep the simple case simple: if there's only one entry in the
+        # policy_stat table, just copy the total container count from the
+        # account_stat table.
+
+        # If that UPDATE changed a row, changes() is non-zero, so the WHERE
+        # NOT EXISTS guard below fails and the INSERT OR REPLACE driven by
+        # the per-policy count subquery does not execute.
+
+        curs.executescript("""
+        UPDATE policy_stat
+        SET container_count = (
+            SELECT container_count
+            FROM account_stat)
+        WHERE (
+            SELECT COUNT(storage_policy_index)
+            FROM policy_stat
+        ) <= 1;
+
+        INSERT OR REPLACE INTO policy_stat (
+            storage_policy_index,
+            container_count,
+            object_count,
+            bytes_used
+        )
+        SELECT p.storage_policy_index,
+               c.count,
+               p.object_count,
+               p.bytes_used
+        FROM (
+            SELECT storage_policy_index,
+                   COUNT(*) as count
+            FROM container
+            WHERE deleted = 0
+            GROUP BY storage_policy_index
+        ) c
+        JOIN policy_stat p
+        ON p.storage_policy_index = c.storage_policy_index
+        WHERE NOT EXISTS(
+            SELECT changes() as change
+            FROM policy_stat
+            WHERE change <> 0
+        );
+        """)
+        conn.commit()
+
+    def _migrate_add_storage_policy_index(self, conn):
+        """
+        Add the storage_policy_index column to the 'container' table and
+        set up triggers, creating the policy_stat table if needed.
+
+        :param conn: DB connection object
+        """
+        try:
+            self.create_policy_stat_table(conn)
+        except sqlite3.OperationalError as err:
+            if 'table policy_stat already exists' not in str(err):
+                raise
+        conn.executescript('''
+            ALTER TABLE container
+            ADD COLUMN storage_policy_index INTEGER DEFAULT 0;
+        ''' + POLICY_STAT_TRIGGER_SCRIPT)
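
A minimal sketch of how the AccountBroker API added above fits together (illustrative only, not part of the patch). It assumes a Swift development checkout where swift is importable; the database path, account name, and container name are made up, and initialize() comes from the DatabaseBroker base class.

    # Sketch: exercise the new account backend schema end to end.
    from swift.account.backend import AccountBroker
    from swift.common.utils import Timestamp

    broker = AccountBroker('/tmp/AUTH_test.db', account='AUTH_test')
    broker.initialize(Timestamp.now().internal)

    # Record one container holding 3 objects / 1024 bytes in policy 0.
    broker.put_container('photos', Timestamp.now().internal, '0', 3, 1024, 0)

    # Listing rows are (name, object_count, bytes_used, put_timestamp,
    # storage_policy_index, is_subdir), as documented on
    # list_containers_iter() above.
    for name, objs, used, put_ts, spi, is_subdir in \
            broker.list_containers_iter(100, '', None, None, None):
        print(name, objs, used, spi)

    # Per-policy rollups that back the account server's response headers.
    print(broker.get_policy_stats())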
diff --git a/swift/account/reaper.py b/swift/account/reaper.py
index 385d95275e..0061e790ac 100644
--- a/swift/account/reaper.py
+++ b/swift/account/reaper.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,20 +15,28 @@
 
 import os
 import random
+import socket
 from logging import DEBUG
 from math import sqrt
 from time import time
+import itertools
 
 from eventlet import GreenPool, sleep, Timeout
 
 import swift.common.db
-from swift.account.server import DATADIR
-from swift.common.db import AccountBroker
-from swift.common.direct_client import ClientException, \
-    direct_delete_container, direct_delete_object, direct_get_container
+from swift.account.backend import AccountBroker, DATADIR
+from swift.common.constraints import check_drive
+from swift.common.daemon import run_daemon
+from swift.common.direct_client import direct_delete_container, \
+    direct_delete_object, direct_get_container
+from swift.common.exceptions import ClientException
+from swift.common.request_helpers import USE_REPLICATION_NETWORK_HEADER
 from swift.common.ring import Ring
-from swift.common.utils import get_logger, whataremyips, config_true_value
+from swift.common.ring.utils import is_local_device
+from swift.common.utils import get_logger, whataremyips, config_true_value, \
+    Timestamp, md5, node_to_string, parse_options
 from swift.common.daemon import Daemon
+from swift.common.storage_policy import POLICIES, PolicyError
 
 
 class AccountReaper(Daemon):
@@ -52,19 +60,20 @@ class AccountReaper(Daemon):
     configuration parameters.
     """
 
-    def __init__(self, conf):
+    def __init__(self, conf, logger=None):
         self.conf = conf
-        self.logger = get_logger(conf, log_route='account-reaper')
+        self.logger = logger or get_logger(conf, log_route='account-reaper')
         self.devices = conf.get('devices', '/srv/node')
         self.mount_check = config_true_value(conf.get('mount_check', 'true'))
-        self.interval = int(conf.get('interval', 3600))
+        self.interval = float(conf.get('interval', 3600))
         self.swift_dir = conf.get('swift_dir', '/etc/swift')
         self.account_ring = None
         self.container_ring = None
         self.object_ring = None
-        self.node_timeout = int(conf.get('node_timeout', 10))
+        self.node_timeout = float(conf.get('node_timeout', 10))
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
-        self.myips = whataremyips()
+        self.myips = whataremyips(conf.get('bind_ip', '0.0.0.0'))
+        self.bind_port = int(conf.get('bind_port', 6202))
         self.concurrency = int(conf.get('concurrency', 25))
         self.container_concurrency = self.object_concurrency = \
             sqrt(self.concurrency)
@@ -72,32 +81,39 @@ def __init__(self, conf):
         swift.common.db.DB_PREALLOCATION = \
             config_true_value(conf.get('db_preallocation', 'f'))
         self.delay_reaping = int(conf.get('delay_reaping') or 0)
+        reap_warn_after = float(conf.get('reap_warn_after') or 86400 * 30)
+        self.reap_not_done_after = reap_warn_after + self.delay_reaping
+        self.start_time = time()
+        self.reset_stats()
 
     def get_account_ring(self):
-        """ The account :class:`swift.common.ring.Ring` for the cluster. """
+        """The account :class:`swift.common.ring.Ring` for the cluster."""
         if not self.account_ring:
             self.account_ring = Ring(self.swift_dir, ring_name='account')
         return self.account_ring
 
     def get_container_ring(self):
-        """ The container :class:`swift.common.ring.Ring` for the cluster. """
+        """The container :class:`swift.common.ring.Ring` for the cluster."""
         if not self.container_ring:
             self.container_ring = Ring(self.swift_dir, ring_name='container')
         return self.container_ring
 
-    def get_object_ring(self):
-        """ The object :class:`swift.common.ring.Ring` for the cluster. """
-        if not self.object_ring:
-            self.object_ring = Ring(self.swift_dir, ring_name='object')
-        return self.object_ring
+    def get_object_ring(self, policy_idx):
+        """
+        Get the ring identified by the policy index
 
-    def run_forever(self, *args, **kwargs):
+        :param policy_idx: Storage policy index
+        :returns: A ring matching the storage policy
         """
-        Main entry point when running the reaper in its normal daemon mode.
-        This repeatedly calls :func:`reap_once` no quicker than the
+        return POLICIES.get_object_ring(policy_idx, self.swift_dir)
+
+    def run_forever(self, *args, **kwargs):
+        """Main entry point when running the reaper in normal daemon mode.
+
+        This repeatedly calls :func:`run_once` no quicker than the
         configuration interval.
         """
-        self.logger.debug(_('Daemon started.'))
+        self.logger.debug('Daemon started.')
         sleep(random.random() * self.interval)
         while True:
             begin = time()
@@ -113,22 +129,22 @@ def run_once(self, *args, **kwargs):
         repeatedly by :func:`run_forever`. This will call :func:`reap_device`
         once for each device on the server.
         """
-        self.logger.debug(_('Begin devices pass: %s'), self.devices)
+        self.logger.debug('Begin devices pass: %s', self.devices)
         begin = time()
         try:
             for device in os.listdir(self.devices):
-                if self.mount_check and not os.path.ismount(
-                        os.path.join(self.devices, device)):
+                try:
+                    check_drive(self.devices, device, self.mount_check)
+                except ValueError as err:
                     self.logger.increment('errors')
-                    self.logger.debug(
-                        _('Skipping %s as it is not mounted'), device)
+                    self.logger.debug('Skipping: %s', err)
                     continue
                 self.reap_device(device)
         except (Exception, Timeout):
-            self.logger.exception(_("Exception in top-level account reaper "
-                                    "loop"))
+            self.logger.exception("Exception in top-level account reaper "
+                                  "loop")
         elapsed = time() - begin
-        self.logger.info(_('Devices pass completed: %.02fs'), elapsed)
+        self.logger.info('Devices pass completed: %.02fs', elapsed)
 
     def reap_device(self, device):
         """
@@ -151,9 +167,18 @@ def reap_device(self, device):
             if not partition.isdigit():
                 continue
             nodes = self.get_account_ring().get_part_nodes(int(partition))
-            if nodes[0]['ip'] not in self.myips or \
-                    not os.path.isdir(partition_path):
+            if not os.path.isdir(partition_path):
                 continue
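+            # Find this server's position among the partition's primary
+            # nodes; the for/else skips partitions we are not a primary
+            # for, and the matching index becomes our container shard.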
+            container_shard = None
+            for container_shard, node in enumerate(nodes):
+                if is_local_device(self.myips, None, node['ip'], None) and \
+                        (not self.bind_port or
+                         self.bind_port == node['port']) and \
+                        (device == node['device']):
+                    break
+            else:
+                continue
+
             for suffix in os.listdir(partition_path):
                 suffix_path = os.path.join(partition_path, suffix)
                 if not os.path.isdir(suffix_path):
@@ -168,12 +193,24 @@ def reap_device(self, device):
                         elif fname.endswith('.db'):
                             self.start_time = time()
                             broker = \
-                                AccountBroker(os.path.join(hsh_path, fname))
+                                AccountBroker(os.path.join(hsh_path, fname),
+                                              logger=self.logger)
                             if broker.is_status_deleted() and \
                                     not broker.empty():
-                                self.reap_account(broker, partition, nodes)
+                                self.reap_account(
+                                    broker, partition, nodes,
+                                    container_shard=container_shard)
 
-    def reap_account(self, broker, partition, nodes):
+    def reset_stats(self):
+        self.stats_return_codes = {}
+        self.stats_containers_deleted = 0
+        self.stats_objects_deleted = 0
+        self.stats_containers_remaining = 0
+        self.stats_objects_remaining = 0
+        self.stats_containers_possibly_remaining = 0
+        self.stats_objects_possibly_remaining = 0
+
+    def reap_account(self, broker, partition, nodes, container_shard=None):
         """
         Called once per pass for each account this server is the primary for
         and attempts to delete the data for the given account. The reaper will
@@ -200,10 +237,12 @@ def reap_account(self, broker, partition, nodes):
         :param broker: The AccountBroker for the account to delete.
         :param partition: The partition in the account ring the account is on.
         :param nodes: The primary node dicts for the account to delete.
+        :param container_shard: int used to shard containers reaped. If None,
+                                will reap all containers.
 
         .. seealso::
 
-            :class:`swift.common.db.AccountBroker` for the broker class.
+            :class:`swift.account.backend.AccountBroker` for the broker class.
 
         .. seealso::
 
@@ -212,62 +251,74 @@ def reap_account(self, broker, partition, nodes):
         """
         begin = time()
         info = broker.get_info()
-        if time() - float(info['delete_timestamp']) <= self.delay_reaping:
+        if time() - float(Timestamp(info['delete_timestamp'])) <= \
+                self.delay_reaping:
             return False
         account = info['account']
-        self.logger.info(_('Beginning pass on account %s'), account)
-        self.stats_return_codes = {}
-        self.stats_containers_deleted = 0
-        self.stats_objects_deleted = 0
-        self.stats_containers_remaining = 0
-        self.stats_objects_remaining = 0
-        self.stats_containers_possibly_remaining = 0
-        self.stats_objects_possibly_remaining = 0
+        self.logger.info('Beginning pass on account %s', account)
+        self.reset_stats()
+        container_limit = 1000
+        if container_shard is not None:
+            container_limit *= len(nodes)
         try:
-            marker = ''
-            while True:
-                containers = \
-                    list(broker.list_containers_iter(1000, marker, None, None,
-                                                     None))
-                if not containers:
-                    break
+            containers = list(broker.list_containers_iter(
+                container_limit, '', None, None, None, allow_reserved=True))
+            while containers:
                 try:
-                    for (container, _junk, _junk, _junk) in containers:
+                    for row in containers:
+                        container = row[0]
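+                        # Spread containers across this account's primary
+                        # nodes: md5(name) picks a shard and only the
+                        # matching reaper deletes it (a container_shard of
+                        # None means reap everything).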
+                        this_shard = (
+                            int(md5(container.encode('utf-8'),
+                                    usedforsecurity=False)
+                                .hexdigest(), 16) % len(nodes))
+                        if container_shard not in (this_shard, None):
+                            continue
+
                         self.container_pool.spawn(self.reap_container, account,
                                                   partition, nodes, container)
                     self.container_pool.waitall()
                 except (Exception, Timeout):
                     self.logger.exception(
-                        _('Exception with containers for account %s'), account)
-                marker = containers[-1][0]
-            log = 'Completed pass on account %s' % account
+                        'Exception with containers for account %s', account)
+                containers = list(broker.list_containers_iter(
+                    container_limit, containers[-1][0], None, None, None,
+                    allow_reserved=True))
+            log_buf = ['Completed pass on account %s' % account]
         except (Exception, Timeout):
-            self.logger.exception(
-                _('Exception with account %s'), account)
-            log = _('Incomplete pass on account %s') % account
+            self.logger.exception('Exception with account %s', account)
+            log_buf = ['Incomplete pass on account %s' % account]
         if self.stats_containers_deleted:
-            log += _(', %s containers deleted') % self.stats_containers_deleted
+            log_buf.append(', %s containers deleted' %
+                           self.stats_containers_deleted)
         if self.stats_objects_deleted:
-            log += _(', %s objects deleted') % self.stats_objects_deleted
+            log_buf.append(', %s objects deleted' % self.stats_objects_deleted)
         if self.stats_containers_remaining:
-            log += _(', %s containers remaining') % \
-                self.stats_containers_remaining
+            log_buf.append(', %s containers remaining' %
+                           self.stats_containers_remaining)
         if self.stats_objects_remaining:
-            log += _(', %s objects remaining') % self.stats_objects_remaining
+            log_buf.append(', %s objects remaining' %
+                           self.stats_objects_remaining)
         if self.stats_containers_possibly_remaining:
-            log += _(', %s containers possibly remaining') % \
-                self.stats_containers_possibly_remaining
+            log_buf.append(', %s containers possibly remaining' %
+                           self.stats_containers_possibly_remaining)
         if self.stats_objects_possibly_remaining:
-            log += _(', %s objects possibly remaining') % \
-                self.stats_objects_possibly_remaining
+            log_buf.append(', %s objects possibly remaining' %
+                           self.stats_objects_possibly_remaining)
         if self.stats_return_codes:
-            log += _(', return codes: ')
-            for code in sorted(self.stats_return_codes.keys()):
-                log += '%s %sxxs, ' % (self.stats_return_codes[code], code)
-            log = log[:-2]
-        log += _(', elapsed: %.02fs') % (time() - begin)
-        self.logger.info(log)
+            log_buf.append(', return codes: ')
+            for code in sorted(self.stats_return_codes):
+                log_buf.append('%s %sxxs, ' % (self.stats_return_codes[code],
+                                               code))
+            log_buf[-1] = log_buf[-1][:-2]
+        log_buf.append(', elapsed: %.02fs' % (time() - begin))
+        self.logger.info(''.join(log_buf))
         self.logger.timing_since('timing', self.start_time)
+        delete_timestamp = Timestamp(info['delete_timestamp'])
+        if self.stats_containers_remaining and \
+           begin - float(delete_timestamp) >= self.reap_not_done_after:
+            self.logger.warning(
+                'Account %(account)s has not been reaped since %(time)s' %
+                {'account': account, 'time': delete_timestamp.isoformat})
         return True
 
     def reap_container(self, account, account_partition, account_nodes,
@@ -314,40 +365,47 @@ def reap_container(self, account, account_partition, account_nodes,
         while True:
             objects = None
             try:
-                objects = direct_get_container(
+                headers, objects = direct_get_container(
                     node, part, account, container,
                     marker=marker,
                     conn_timeout=self.conn_timeout,
-                    response_timeout=self.node_timeout)[1]
+                    response_timeout=self.node_timeout,
+                    headers={USE_REPLICATION_NETWORK_HEADER: 'true'})
                 self.stats_return_codes[2] = \
                     self.stats_return_codes.get(2, 0) + 1
                 self.logger.increment('return_codes.2')
-            except ClientException, err:
+            except ClientException as err:
                 if self.logger.getEffectiveLevel() <= DEBUG:
                     self.logger.exception(
-                        _('Exception with %(ip)s:%(port)s/%(device)s'), node)
-                self.stats_return_codes[err.http_status / 100] = \
-                    self.stats_return_codes.get(err.http_status / 100, 0) + 1
+                        'Exception with %s', node_to_string(node))
+                self.stats_return_codes[err.http_status // 100] = \
+                    self.stats_return_codes.get(err.http_status // 100, 0) + 1
                 self.logger.increment(
-                    'return_codes.%d' % (err.http_status / 100,))
+                    'return_codes.%d' % (err.http_status // 100,))
+            except (Timeout, socket.error):
+                self.logger.error(
+                    'Timeout Exception with %s', node_to_string(node))
             if not objects:
                 break
             try:
+                policy_index = headers.get('X-Backend-Storage-Policy-Index', 0)
+                policy = POLICIES.get_by_index(policy_index)
+                if not policy:
+                    self.logger.error('ERROR: invalid storage policy index: %r'
+                                      % policy_index)
                 for obj in objects:
-                    if isinstance(obj['name'], unicode):
-                        obj['name'] = obj['name'].encode('utf8')
                     pool.spawn(self.reap_object, account, container, part,
-                               nodes, obj['name'])
+                               nodes, obj['name'], policy_index)
                 pool.waitall()
             except (Exception, Timeout):
-                self.logger.exception(_('Exception with objects for container '
-                                        '%(container)s for account %(account)s'
-                                        ),
+                self.logger.exception('Exception with objects for container '
+                                      '%(container)s for account %(account)s',
                                       {'container': container,
                                        'account': account})
             marker = objects[-1]['name']
         successes = 0
         failures = 0
+        timestamp = Timestamp.now()
         for node in nodes:
             anode = account_nodes.pop()
             try:
@@ -358,21 +416,28 @@ def reap_container(self, account, account_partition, account_nodes,
                     headers={'X-Account-Host': '%(ip)s:%(port)s' % anode,
                              'X-Account-Partition': str(account_partition),
                              'X-Account-Device': anode['device'],
-                             'X-Account-Override-Deleted': 'yes'})
+                             'X-Account-Override-Deleted': 'yes',
+                             'X-Timestamp': timestamp.internal,
+                             USE_REPLICATION_NETWORK_HEADER: 'true'})
                 successes += 1
                 self.stats_return_codes[2] = \
                     self.stats_return_codes.get(2, 0) + 1
                 self.logger.increment('return_codes.2')
-            except ClientException, err:
+            except ClientException as err:
                 if self.logger.getEffectiveLevel() <= DEBUG:
                     self.logger.exception(
-                        _('Exception with %(ip)s:%(port)s/%(device)s'), node)
+                        'Exception with %s', node_to_string(node))
                 failures += 1
                 self.logger.increment('containers_failures')
-                self.stats_return_codes[err.http_status / 100] = \
-                    self.stats_return_codes.get(err.http_status / 100, 0) + 1
+                self.stats_return_codes[err.http_status // 100] = \
+                    self.stats_return_codes.get(err.http_status // 100, 0) + 1
                 self.logger.increment(
-                    'return_codes.%d' % (err.http_status / 100,))
+                    'return_codes.%d' % (err.http_status // 100,))
+            except (Timeout, socket.error):
+                self.logger.error(
+                    'Timeout Exception with %s', node_to_string(node))
+                failures += 1
+                self.logger.increment('containers_failures')
         if successes > failures:
             self.stats_containers_deleted += 1
             self.logger.increment('containers_deleted')
@@ -384,7 +449,7 @@ def reap_container(self, account, account_partition, account_nodes,
             self.logger.increment('containers_possibly_remaining')
 
     def reap_object(self, account, container, container_partition,
-                    container_nodes, obj):
+                    container_nodes, obj, policy_index):
         """
         Deletes the given object by issuing a delete request to each node for
         the object. The format of the delete request is such that each object
@@ -400,16 +465,25 @@ def reap_object(self, account, container, container_partition,
                                     container ring.
         :param container_nodes: The primary node dicts for the container.
         :param obj: The name of the object to delete.
+        :param policy_index: The storage policy index of the object's container
 
         * See also: :func:`swift.common.ring.Ring.get_nodes` for a description
           of the container node dicts.
         """
-        container_nodes = list(container_nodes)
-        part, nodes = self.get_object_ring().get_nodes(account, container, obj)
+        cnodes = itertools.cycle(container_nodes)
+        try:
+            ring = self.get_object_ring(policy_index)
+        except PolicyError:
+            self.stats_objects_remaining += 1
+            self.logger.increment('objects_remaining')
+            return
+        part, nodes = ring.get_nodes(account, container, obj)
         successes = 0
         failures = 0
+        timestamp = Timestamp.now()
+
         for node in nodes:
-            cnode = container_nodes.pop()
+            cnode = next(cnodes)
             try:
                 direct_delete_object(
                     node, part, account, container, obj,
@@ -417,21 +491,29 @@ def reap_object(self, account, container, container_partition,
                     response_timeout=self.node_timeout,
                     headers={'X-Container-Host': '%(ip)s:%(port)s' % cnode,
                              'X-Container-Partition': str(container_partition),
-                             'X-Container-Device': cnode['device']})
+                             'X-Container-Device': cnode['device'],
+                             'X-Backend-Storage-Policy-Index': policy_index,
+                             'X-Timestamp': timestamp.internal,
+                             USE_REPLICATION_NETWORK_HEADER: 'true'})
                 successes += 1
                 self.stats_return_codes[2] = \
                     self.stats_return_codes.get(2, 0) + 1
                 self.logger.increment('return_codes.2')
-            except ClientException, err:
+            except ClientException as err:
                 if self.logger.getEffectiveLevel() <= DEBUG:
                     self.logger.exception(
-                        _('Exception with %(ip)s:%(port)s/%(device)s'), node)
+                        'Exception with %s', node_to_string(node))
                 failures += 1
                 self.logger.increment('objects_failures')
-                self.stats_return_codes[err.http_status / 100] = \
-                    self.stats_return_codes.get(err.http_status / 100, 0) + 1
+                self.stats_return_codes[err.http_status // 100] = \
+                    self.stats_return_codes.get(err.http_status // 100, 0) + 1
                 self.logger.increment(
-                    'return_codes.%d' % (err.http_status / 100,))
+                    'return_codes.%d' % (err.http_status // 100,))
+            except (Timeout, socket.error):
+                failures += 1
+                self.logger.increment('objects_failures')
+                self.logger.error(
+                    'Timeout Exception with %s', node_to_string(node))
             if successes > failures:
                 self.stats_objects_deleted += 1
                 self.logger.increment('objects_deleted')
@@ -441,3 +523,12 @@ def reap_object(self, account, container, container_partition,
             else:
                 self.stats_objects_possibly_remaining += 1
                 self.logger.increment('objects_possibly_remaining')
+
+
+def main():
+    conf_file, options = parse_options(once=True)
+    run_daemon(AccountReaper, conf_file, **options)
+
+
+if __name__ == '__main__':
+    main()
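
The reap_account() change above splits a deleted account's containers across that account's primary nodes rather than having only the first primary do all the work. A standalone sketch of the mapping (illustrative only; the replica count of 3 and the container names are arbitrary examples):

    from swift.common.utils import md5

    def container_shard(container, replica_count):
        # Same hash used in reap_account(): a stable container -> node index.
        return int(md5(container.encode('utf-8'),
                       usedforsecurity=False).hexdigest(), 16) % replica_count

    for name in ('photos', 'backups', 'logs'):
        print('%s is reaped by primary #%d' % (name, container_shard(name, 3)))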
diff --git a/swift/account/replicator.py b/swift/account/replicator.py
index c7f93d9b90..71be882e02 100644
--- a/swift/account/replicator.py
+++ b/swift/account/replicator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,12 +13,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from swift.account import server as account_server
-from swift.common import db, db_replicator
+import optparse
+
+from swift.account.backend import AccountBroker, DATADIR
+from swift.common import db_replicator
+from swift.common.daemon import run_daemon
+from swift.common.utils import parse_options
 
 
 class AccountReplicator(db_replicator.Replicator):
     server_type = 'account'
-    brokerclass = db.AccountBroker
-    datadir = account_server.DATADIR
-    default_port = 6002
+    brokerclass = AccountBroker
+    datadir = DATADIR
+    default_port = 6202
+
+
+def main():
+    parser = optparse.OptionParser("%prog CONFIG [options]")
+    parser.add_option('-d', '--devices',
+                      help=('Replicate only given devices. '
+                            'Comma-separated list. '
+                            'Only has effect if --once is used.'))
+    parser.add_option('-p', '--partitions',
+                      help=('Replicate only given partitions. '
+                            'Comma-separated list. '
+                            'Only has effect if --once is used.'))
+    conf_file, options = parse_options(parser=parser, once=True)
+    run_daemon(AccountReplicator, conf_file, **options)
+
+
+if __name__ == '__main__':
+    main()
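
The new main() above wires --devices and --partitions into the usual parse_options()/run_daemon flow. A stdlib-only sketch of the option parsing it sets up (illustrative; the conf path and device names are made up):

    import optparse

    parser = optparse.OptionParser("%prog CONFIG [options]")
    parser.add_option('-d', '--devices',
                      help='Replicate only given devices. Comma-separated list.')
    parser.add_option('-p', '--partitions',
                      help='Replicate only given partitions. Comma-separated list.')
    options, args = parser.parse_args(
        ['/etc/swift/account-server.conf', '--devices', 'sdb1,sdb2'])
    print(options.devices)   # -> sdb1,sdb2
    print(args)              # -> ['/etc/swift/account-server.conf']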
diff --git a/swift/account/server.py b/swift/account/server.py
index 5b3df742a5..2134056d02 100644
--- a/swift/account/server.py
+++ b/swift/account/server.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,361 +13,327 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import with_statement
-
+import json
 import os
+import sys
 import time
 import traceback
-from urllib import unquote
-from xml.sax import saxutils
 
 from eventlet import Timeout
 
 import swift.common.db
-from swift.common.db import AccountBroker
-from swift.common.utils import get_logger, get_param, hash_path, public, \
-    normalize_timestamp, split_path, storage_directory, config_true_value, \
-    validate_device_partition, json, timing_stats
-from swift.common.constraints import ACCOUNT_LISTING_LIMIT, \
-    check_mount, check_float, check_utf8, FORMAT2CONTENT_TYPE
+from swift.account.backend import AccountBroker, DATADIR
+from swift.account.utils import account_listing_response, get_response_headers
+from swift.common.db import DatabaseConnectionError, DatabaseAlreadyExists
+from swift.common.request_helpers import get_param, \
+    split_and_validate_path, validate_internal_account, \
+    validate_internal_container, constrain_req_limit
+from swift.common.utils import get_logger, hash_path, public, \
+    Timestamp, storage_directory, config_true_value, \
+    replication, get_log_line, \
+    config_fallocate_value, fs_has_free_space, parse_options
+from swift.common.constraints import valid_timestamp, check_utf8, \
+    check_drive, AUTO_CREATE_ACCOUNT_PREFIX
+from swift.common import constraints
 from swift.common.db_replicator import ReplicatorRpc
+from swift.common.base_storage_server import BaseStorageServer, timing_stats
+from swift.common.middleware import listing_formats
 from swift.common.swob import HTTPAccepted, HTTPBadRequest, \
     HTTPCreated, HTTPForbidden, HTTPInternalServerError, \
     HTTPMethodNotAllowed, HTTPNoContent, HTTPNotFound, \
-    HTTPPreconditionFailed, HTTPConflict, Request, Response, \
-    HTTPInsufficientStorage, HTTPNotAcceptable
+    HTTPPreconditionFailed, HTTPConflict, Request, \
+    HTTPInsufficientStorage, HTTPException, wsgi_to_str
+from swift.common.request_helpers import is_sys_or_user_meta
+from swift.common.wsgi import run_wsgi
+
+
+def get_account_name_and_placement(req):
+    """
+    Split and validate path for an account.
+
+    :param req: a swob request
+
+    :returns: a tuple of path parts as strings
+    """
+    drive, part, account = split_and_validate_path(req, 3)
+    validate_internal_account(account)
+    return drive, part, account
 
 
-DATADIR = 'accounts'
+def get_container_name_and_placement(req):
+    """
+    Split and validate path for a container.
 
+    :param req: a swob request
 
-class AccountController(object):
+    :returns: a tuple of path parts as strings
+    """
+    drive, part, account, container = split_and_validate_path(req, 3, 4)
+    validate_internal_container(account, container)
+    return drive, part, account, container
+
+
+class AccountController(BaseStorageServer):
     """WSGI controller for the account server."""
 
-    def __init__(self, conf):
-        self.logger = get_logger(conf, log_route='account-server')
+    server_type = 'account-server'
+
+    def __init__(self, conf, logger=None):
+        super(AccountController, self).__init__(conf)
+        self.logger = logger or get_logger(conf, log_route='account-server')
+        self.log_requests = config_true_value(conf.get('log_requests', 'true'))
         self.root = conf.get('devices', '/srv/node')
         self.mount_check = config_true_value(conf.get('mount_check', 'true'))
         self.replicator_rpc = ReplicatorRpc(self.root, DATADIR, AccountBroker,
                                             self.mount_check,
                                             logger=self.logger)
-        self.auto_create_account_prefix = \
-            conf.get('auto_create_account_prefix') or '.'
+        self.auto_create_account_prefix = AUTO_CREATE_ACCOUNT_PREFIX
+
         swift.common.db.DB_PREALLOCATION = \
             config_true_value(conf.get('db_preallocation', 'f'))
+        swift.common.db.QUERY_LOGGING = \
+            config_true_value(conf.get('db_query_logging', 'f'))
+        self.fallocate_reserve, self.fallocate_is_percent = \
+            config_fallocate_value(conf.get('fallocate_reserve', '1%'))
 
-    def _get_account_broker(self, drive, part, account):
+    def _get_account_broker(self, drive, part, account, **kwargs):
         hsh = hash_path(account)
         db_dir = storage_directory(DATADIR, part, hsh)
         db_path = os.path.join(self.root, drive, db_dir, hsh + '.db')
-        return AccountBroker(db_path, account=account, logger=self.logger)
+        kwargs.setdefault('account', account)
+        kwargs.setdefault('logger', self.logger)
+        return AccountBroker(db_path, **kwargs)
+
+    def _deleted_response(self, broker, req, resp, body=''):
+        # We are here since either the account does not exist or
+        # it exists but marked for deletion.
+        headers = {}
+        # Try to check if account exists and is marked for deletion
+        try:
+            if broker.is_status_deleted():
+                # Account does exist and is marked for deletion
+                headers = {'X-Account-Status': 'Deleted'}
+        except DatabaseConnectionError:
+            # Account does not exist!
+            pass
+        return resp(request=req, headers=headers, charset='utf-8', body=body)
+
+    def check_free_space(self, drive):
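+        # Enforce fallocate_reserve before accepting PUTs so a nearly
+        # full disk answers 507 instead of creating new account DBs.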
+        drive_root = os.path.join(self.root, drive)
+        return fs_has_free_space(
+            drive_root, self.fallocate_reserve, self.fallocate_is_percent)
 
     @public
-    @timing_stats
+    @timing_stats()
     def DELETE(self, req):
         """Handle HTTP DELETE request."""
+        drive, part, account = get_account_name_and_placement(req)
         try:
-            drive, part, account = split_path(unquote(req.path), 3)
-            validate_device_partition(drive, part)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
             return HTTPInsufficientStorage(drive=drive, request=req)
-        if 'x-timestamp' not in req.headers or \
-                not check_float(req.headers['x-timestamp']):
-            return HTTPBadRequest(body='Missing timestamp', request=req,
-                                  content_type='text/plain')
+        req_timestamp = valid_timestamp(req)
         broker = self._get_account_broker(drive, part, account)
         if broker.is_deleted():
-            return HTTPNotFound(request=req)
-        broker.delete_db(req.headers['x-timestamp'])
-        return HTTPNoContent(request=req)
+            return self._deleted_response(broker, req, HTTPNotFound)
+        broker.delete_db(req_timestamp.internal)
+        return self._deleted_response(broker, req, HTTPNoContent)
+
+    def _update_metadata(self, req, broker, req_timestamp):
+        metadata = {
+            wsgi_to_str(key): (wsgi_to_str(value), req_timestamp.internal)
+            for key, value in req.headers.items()
+            if is_sys_or_user_meta('account', key)}
+        if metadata:
+            broker.update_metadata(metadata, validate_metadata=True)
 
     @public
-    @timing_stats
+    @timing_stats()
     def PUT(self, req):
         """Handle HTTP PUT request."""
+        drive, part, account, container = get_container_name_and_placement(req)
         try:
-            drive, part, account, container = split_path(unquote(req.path),
-                                                         3, 4)
-            validate_device_partition(drive, part)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
+            return HTTPInsufficientStorage(drive=drive, request=req)
+        if not self.check_free_space(drive):
             return HTTPInsufficientStorage(drive=drive, request=req)
-        broker = self._get_account_broker(drive, part, account)
         if container:   # put account container
+            if 'x-timestamp' not in req.headers:
+                timestamp = Timestamp.now()
+            else:
+                timestamp = valid_timestamp(req)
+            pending_timeout = None
+            container_policy_index = \
+                req.headers.get('X-Backend-Storage-Policy-Index', 0)
             if 'x-trans-id' in req.headers:
-                broker.pending_timeout = 3
+                pending_timeout = 3
+            broker = self._get_account_broker(drive, part, account,
+                                              pending_timeout=pending_timeout)
             if account.startswith(self.auto_create_account_prefix) and \
                     not os.path.exists(broker.db_file):
-                broker.initialize(normalize_timestamp(
-                    req.headers.get('x-timestamp') or time.time()))
-            if req.headers.get('x-account-override-deleted', 'no').lower() != \
-                    'yes' and broker.is_deleted():
+                try:
+                    broker.initialize(timestamp.internal)
+                except DatabaseAlreadyExists:
+                    pass
+            if (req.headers.get('x-account-override-deleted', 'no').lower() !=
+                    'yes' and broker.is_deleted()) \
+                    or not os.path.exists(broker.db_file):
                 return HTTPNotFound(request=req)
             broker.put_container(container, req.headers['x-put-timestamp'],
                                  req.headers['x-delete-timestamp'],
                                  req.headers['x-object-count'],
-                                 req.headers['x-bytes-used'])
+                                 req.headers['x-bytes-used'],
+                                 container_policy_index)
             if req.headers['x-delete-timestamp'] > \
                     req.headers['x-put-timestamp']:
                 return HTTPNoContent(request=req)
             else:
                 return HTTPCreated(request=req)
         else:   # put account
-            timestamp = normalize_timestamp(req.headers['x-timestamp'])
+            timestamp = valid_timestamp(req)
+            broker = self._get_account_broker(drive, part, account)
             if not os.path.exists(broker.db_file):
-                broker.initialize(timestamp)
-                created = True
+                try:
+                    broker.initialize(timestamp.internal)
+                    created = True
+                except DatabaseAlreadyExists:
+                    created = False
             elif broker.is_status_deleted():
-                return HTTPForbidden(request=req, body='Recently deleted')
+                return self._deleted_response(broker, req, HTTPForbidden,
+                                              body='Recently deleted')
             else:
                 created = broker.is_deleted()
-                broker.update_put_timestamp(timestamp)
+                broker.update_put_timestamp(timestamp.internal)
                 if broker.is_deleted():
                     return HTTPConflict(request=req)
-            metadata = {}
-            metadata.update((key, (value, timestamp))
-                            for key, value in req.headers.iteritems()
-                            if key.lower().startswith('x-account-meta-'))
-            if metadata:
-                broker.update_metadata(metadata)
+            self._update_metadata(req, broker, timestamp)
             if created:
                 return HTTPCreated(request=req)
             else:
                 return HTTPAccepted(request=req)
 
     @public
-    @timing_stats
+    @timing_stats()
     def HEAD(self, req):
         """Handle HTTP HEAD request."""
-        # TODO(refactor): The account server used to provide a 'account and
-        # container existence check all-in-one' call by doing a HEAD with a
-        # container path. However, container existence is now checked with the
-        # container servers directly so this is no longer needed. We should
-        # refactor out the container existence check here and retest
-        # everything.
+        drive, part, account = get_account_name_and_placement(req)
+        out_content_type = listing_formats.get_listing_content_type(req)
         try:
-            drive, part, account, container = split_path(unquote(req.path),
-                                                         3, 4)
-            validate_device_partition(drive, part)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
             return HTTPInsufficientStorage(drive=drive, request=req)
-        broker = self._get_account_broker(drive, part, account)
-        if not container:
-            broker.pending_timeout = 0.1
-            broker.stale_reads_ok = True
+        broker = self._get_account_broker(drive, part, account,
+                                          pending_timeout=0.1,
+                                          stale_reads_ok=True)
         if broker.is_deleted():
-            return HTTPNotFound(request=req)
-        info = broker.get_info()
-        headers = {
-            'X-Account-Container-Count': info['container_count'],
-            'X-Account-Object-Count': info['object_count'],
-            'X-Account-Bytes-Used': info['bytes_used'],
-            'X-Timestamp': info['created_at'],
-            'X-PUT-Timestamp': info['put_timestamp']}
-        if container:
-            container_ts = broker.get_container_timestamp(container)
-            if container_ts is not None:
-                headers['X-Container-Timestamp'] = container_ts
-        headers.update((key, value)
-                       for key, (value, timestamp) in
-                       broker.metadata.iteritems() if value != '')
-        if get_param(req, 'format'):
-            req.accept = FORMAT2CONTENT_TYPE.get(
-                get_param(req, 'format').lower(), FORMAT2CONTENT_TYPE['plain'])
-        headers['Content-Type'] = req.accept.best_match(
-            ['text/plain', 'application/json', 'application/xml', 'text/xml'])
-        if not headers['Content-Type']:
-            return HTTPNotAcceptable(request=req)
+            return self._deleted_response(broker, req, HTTPNotFound)
+        headers = get_response_headers(broker)
+        headers['Content-Type'] = out_content_type
+        headers['Content-Length'] = 0
         return HTTPNoContent(request=req, headers=headers, charset='utf-8')
 
     @public
-    @timing_stats
+    @timing_stats()
     def GET(self, req):
         """Handle HTTP GET request."""
+        drive, part, account = get_account_name_and_placement(req)
+        prefix = get_param(req, 'prefix')
+        delimiter = get_param(req, 'delimiter')
+        reverse = config_true_value(get_param(req, 'reverse'))
+        limit = constrain_req_limit(req, constraints.ACCOUNT_LISTING_LIMIT)
+        marker = get_param(req, 'marker', '')
+        end_marker = get_param(req, 'end_marker')
+        out_content_type = listing_formats.get_listing_content_type(req)
+
         try:
-            drive, part, account = split_path(unquote(req.path), 3)
-            validate_device_partition(drive, part)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
             return HTTPInsufficientStorage(drive=drive, request=req)
-        broker = self._get_account_broker(drive, part, account)
-        broker.pending_timeout = 0.1
-        broker.stale_reads_ok = True
+        broker = self._get_account_broker(drive, part, account,
+                                          pending_timeout=0.1,
+                                          stale_reads_ok=True)
         if broker.is_deleted():
-            return HTTPNotFound(request=req)
-        info = broker.get_info()
-        resp_headers = {
-            'X-Account-Container-Count': info['container_count'],
-            'X-Account-Object-Count': info['object_count'],
-            'X-Account-Bytes-Used': info['bytes_used'],
-            'X-Timestamp': info['created_at'],
-            'X-PUT-Timestamp': info['put_timestamp']}
-        resp_headers.update((key, value)
-                            for key, (value, timestamp) in
-                            broker.metadata.iteritems() if value != '')
-        try:
-            prefix = get_param(req, 'prefix')
-            delimiter = get_param(req, 'delimiter')
-            if delimiter and (len(delimiter) > 1 or ord(delimiter) > 254):
-                # delimiters can be made more flexible later
-                return HTTPPreconditionFailed(body='Bad delimiter')
-            limit = ACCOUNT_LISTING_LIMIT
-            given_limit = get_param(req, 'limit')
-            if given_limit and given_limit.isdigit():
-                limit = int(given_limit)
-                if limit > ACCOUNT_LISTING_LIMIT:
-                    return HTTPPreconditionFailed(request=req,
-                                                  body='Maximum limit is %d' %
-                                                  ACCOUNT_LISTING_LIMIT)
-            marker = get_param(req, 'marker', '')
-            end_marker = get_param(req, 'end_marker')
-            query_format = get_param(req, 'format')
-        except UnicodeDecodeError, err:
-            return HTTPBadRequest(body='parameters not utf8',
-                                  content_type='text/plain', request=req)
-        if query_format:
-            req.accept = FORMAT2CONTENT_TYPE.get(query_format.lower(),
-                                                 FORMAT2CONTENT_TYPE['plain'])
-        out_content_type = req.accept.best_match(
-            ['text/plain', 'application/json', 'application/xml', 'text/xml'])
-        if not out_content_type:
-            return HTTPNotAcceptable(request=req)
-        account_list = broker.list_containers_iter(limit, marker, end_marker,
-                                                   prefix, delimiter)
-        if out_content_type == 'application/json':
-            data = []
-            for (name, object_count, bytes_used, is_subdir) in account_list:
-                if is_subdir:
-                    data.append({'subdir': name})
-                else:
-                    data.append({'name': name, 'count': object_count,
-                                'bytes': bytes_used})
-            account_list = json.dumps(data)
-        elif out_content_type.endswith('/xml'):
-            output_list = ['<?xml version="1.0" encoding="UTF-8"?>',
-                           '<account name="%s">' % account]
-            for (name, object_count, bytes_used, is_subdir) in account_list:
-                name = saxutils.escape(name)
-                if is_subdir:
-                    output_list.append('<subdir name="%s" />' % name)
-                else:
-                    item = '<container><name>%s</name><count>%s</count>' \
-                           '<bytes>%s</bytes></container>' % \
-                           (name, object_count, bytes_used)
-                    output_list.append(item)
-            output_list.append('</account>')
-            account_list = '\n'.join(output_list)
-        else:
-            if not account_list:
-                return HTTPNoContent(request=req, headers=resp_headers)
-            account_list = '\n'.join(r[0] for r in account_list) + '\n'
-        ret = Response(body=account_list, request=req, headers=resp_headers)
-        ret.content_type = out_content_type
-        ret.charset = 'utf-8'
-        return ret
+            return self._deleted_response(broker, req, HTTPNotFound)
+        return account_listing_response(account, req, out_content_type, broker,
+                                        limit, marker, end_marker, prefix,
+                                        delimiter, reverse)
 
     @public
-    @timing_stats
+    @replication
+    @timing_stats()
     def REPLICATE(self, req):
         """
         Handle HTTP REPLICATE request.
         Handler for RPC calls for account replication.
         """
+        post_args = split_and_validate_path(req, 3)
+        drive, partition, hash = post_args
         try:
-            post_args = split_path(unquote(req.path), 3)
-            drive, partition, hash = post_args
-            validate_device_partition(drive, partition)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
+            return HTTPInsufficientStorage(drive=drive, request=req)
+        if not self.check_free_space(drive):
             return HTTPInsufficientStorage(drive=drive, request=req)
         try:
             args = json.load(req.environ['wsgi.input'])
-        except ValueError, err:
+        except ValueError as err:
             return HTTPBadRequest(body=str(err), content_type='text/plain')
         ret = self.replicator_rpc.dispatch(post_args, args)
         ret.request = req
         return ret
 
     @public
-    @timing_stats
+    @timing_stats()
     def POST(self, req):
         """Handle HTTP POST request."""
+        drive, part, account = get_account_name_and_placement(req)
+        req_timestamp = valid_timestamp(req)
         try:
-            drive, part, account = split_path(unquote(req.path), 3)
-            validate_device_partition(drive, part)
-        except ValueError, err:
-            return HTTPBadRequest(body=str(err), content_type='text/plain',
-                                  request=req)
-        if 'x-timestamp' not in req.headers or \
-                not check_float(req.headers['x-timestamp']):
-            return HTTPBadRequest(body='Missing or bad timestamp',
-                                  request=req,
-                                  content_type='text/plain')
-        if self.mount_check and not check_mount(self.root, drive):
+            check_drive(self.root, drive, self.mount_check)
+        except ValueError:
+            return HTTPInsufficientStorage(drive=drive, request=req)
+        if not self.check_free_space(drive):
             return HTTPInsufficientStorage(drive=drive, request=req)
         broker = self._get_account_broker(drive, part, account)
         if broker.is_deleted():
-            return HTTPNotFound(request=req)
-        timestamp = normalize_timestamp(req.headers['x-timestamp'])
-        metadata = {}
-        metadata.update((key, (value, timestamp))
-                        for key, value in req.headers.iteritems()
-                        if key.lower().startswith('x-account-meta-'))
-        if metadata:
-            broker.update_metadata(metadata)
+            return self._deleted_response(broker, req, HTTPNotFound)
+        self._update_metadata(req, broker, req_timestamp)
         return HTTPNoContent(request=req)
 
     def __call__(self, env, start_response):
         start_time = time.time()
         req = Request(env)
         self.logger.txn_id = req.headers.get('x-trans-id', None)
-        if not check_utf8(req.path_info):
+        if not check_utf8(wsgi_to_str(req.path_info), internal=True):
             res = HTTPPreconditionFailed(body='Invalid UTF8')
         else:
             try:
                 # disallow methods which are not publicly accessible
-                try:
-                    method = getattr(self, req.method)
-                    getattr(method, 'publicly_accessible')
-                except AttributeError:
+                if req.method not in self.allowed_methods:
                     res = HTTPMethodNotAllowed()
                 else:
-                    res = method(req)
+                    res = getattr(self, req.method)(req)
+            except HTTPException as error_response:
+                res = error_response
             except (Exception, Timeout):
-                self.logger.exception(_('ERROR __call__ error with %(method)s'
-                                        ' %(path)s '),
+                self.logger.exception('ERROR __call__ error with %(method)s'
+                                      ' %(path)s ',
                                       {'method': req.method, 'path': req.path})
                 res = HTTPInternalServerError(body=traceback.format_exc())
-        trans_time = '%.4f' % (time.time() - start_time)
-        additional_info = ''
-        if res.headers.get('x-container-timestamp') is not None:
-            additional_info += 'x-container-timestamp: %s' % \
-                res.headers['x-container-timestamp']
-        log_message = '%s - - [%s] "%s %s" %s %s "%s" "%s" "%s" %s "%s"' % (
-            req.remote_addr,
-            time.strftime('%d/%b/%Y:%H:%M:%S +0000', time.gmtime()),
-            req.method, req.path,
-            res.status.split()[0], res.content_length or '-',
-            req.headers.get('x-trans-id', '-'),
-            req.referer or '-', req.user_agent or '-',
-            trans_time,
-            additional_info)
-        if req.method.upper() == 'REPLICATE':
-            self.logger.debug(log_message)
-        else:
-            self.logger.info(log_message)
+        if self.log_requests:
+            trans_time = time.time() - start_time
+            additional_info = ''
+            if res.headers.get('x-container-timestamp') is not None:
+                additional_info += 'x-container-timestamp: %s' % \
+                    res.headers['x-container-timestamp']
+            log_msg = get_log_line(req, res, trans_time, additional_info,
+                                   self.log_format, self.anonymization_method,
+                                   self.anonymization_salt)
+            if req.method.upper() == 'REPLICATE':
+                self.logger.debug(log_msg)
+            else:
+                self.logger.info(log_msg)
         return res(env, start_response)
 
 
@@ -376,3 +342,12 @@ def app_factory(global_conf, **local_conf):
     conf = global_conf.copy()
     conf.update(local_conf)
     return AccountController(conf)
+
+
+def main():
+    conf_file, options = parse_options(test_config=True)
+    sys.exit(run_wsgi(conf_file, 'account-server', **options))
+
+
+if __name__ == '__main__':
+    main()
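
The refactored handlers above replace ad-hoc normalize_timestamp(req.headers['x-timestamp']) calls with valid_timestamp(req), which returns a Timestamp object; the broker is fed its internal form while user-facing headers use the normal form. A minimal sketch of the distinction, assuming only that swift.common.utils.Timestamp is importable:

    from swift.common.utils import Timestamp

    ts = Timestamp('1525354800.12345')
    # .normal is the classic ten.five-digit form shown in X-Timestamp headers;
    # .internal may carry an extra "_<offset>" suffix and is what the broker
    # stores via broker.initialize() and broker.update_put_timestamp().
    print(ts.normal, ts.internal)
    assert Timestamp(ts.internal) == Timestamp(ts.normal)
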
diff --git a/swift/account/utils.py b/swift/account/utils.py
new file mode 100644
index 0000000000..2a540faaeb
--- /dev/null
+++ b/swift/account/utils.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2010-2013 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+from swift.common import constraints
+from swift.common.middleware import listing_formats
+from swift.common.swob import HTTPOk, HTTPNoContent, str_to_wsgi
+from swift.common.utils import Timestamp
+from swift.common.storage_policy import POLICIES
+
+
+class FakeAccountBroker(object):
+    """
+    Quacks like an account broker, but doesn't actually do anything. Responds
+    like an account broker would for a real, empty account with no metadata.
+    """
+    def get_info(self):
+        now = Timestamp.now().internal
+        return {'container_count': 0,
+                'object_count': 0,
+                'bytes_used': 0,
+                'created_at': now,
+                'put_timestamp': now}
+
+    def list_containers_iter(self, *_, **__):
+        return []
+
+    @property
+    def metadata(self):
+        return {}
+
+    def get_policy_stats(self):
+        return {}
+
+
+def get_response_headers(broker):
+    info = broker.get_info()
+    resp_headers = {
+        'X-Account-Container-Count': info['container_count'],
+        'X-Account-Object-Count': info['object_count'],
+        'X-Account-Bytes-Used': info['bytes_used'],
+        'X-Timestamp': Timestamp(info['created_at']).normal,
+        'X-PUT-Timestamp': Timestamp(info['put_timestamp']).normal}
+    policy_stats = broker.get_policy_stats()
+    for policy_idx, stats in policy_stats.items():
+        policy = POLICIES.get_by_index(policy_idx)
+        if not policy:
+            continue
+        header_prefix = 'X-Account-Storage-Policy-%s-%%s' % policy.name
+        for key, value in stats.items():
+            header_name = header_prefix % key.replace('_', '-')
+            resp_headers[header_name] = value
+    resp_headers.update((str_to_wsgi(key), str_to_wsgi(value))
+                        for key, (value, _timestamp) in
+                        broker.metadata.items() if value != '')
+    return resp_headers
+
+
+def account_listing_response(account, req, response_content_type, broker=None,
+                             limit=constraints.ACCOUNT_LISTING_LIMIT,
+                             marker='', end_marker='', prefix='', delimiter='',
+                             reverse=False):
+    if broker is None:
+        broker = FakeAccountBroker()
+
+    resp_headers = get_response_headers(broker)
+
+    account_list = broker.list_containers_iter(limit, marker, end_marker,
+                                               prefix, delimiter, reverse,
+                                               req.allow_reserved_names)
+    data = []
+    for (name, object_count, bytes_used, put_timestamp,
+         storage_policy_index, is_subdir) \
+            in account_list:
+        if is_subdir:
+            data.append({'subdir': name})
+        else:
+            container = {
+                'name': name,
+                'count': object_count,
+                'bytes': bytes_used,
+                'last_modified': Timestamp(put_timestamp).isoformat}
+            # Add the container's storage policy to the response, unless
+            # storage_policy_index was not found in POLICIES, which means
+            # the storage policy is missing from the Swift configuration
+            # or otherwise could not be determined.
+            #
+            # The storage policy should always be returned when
+            # everything is configured correctly, but clients are
+            # expected to be able to handle this case regardless,
+            # if only to support older versions of swift.
+            if storage_policy_index in POLICIES:
+                container['storage_policy'] = (
+                    POLICIES[storage_policy_index].name
+                )
+            data.append(container)
+    if response_content_type.endswith('/xml'):
+        account_list = listing_formats.account_to_xml(data, account)
+        ret = HTTPOk(body=account_list, request=req, headers=resp_headers)
+    elif response_content_type.endswith('/json'):
+        account_list = json.dumps(data).encode('ascii')
+        ret = HTTPOk(body=account_list, request=req, headers=resp_headers)
+    elif data:
+        account_list = listing_formats.listing_to_text(data)
+        ret = HTTPOk(body=account_list, request=req, headers=resp_headers)
+    else:
+        ret = HTTPNoContent(request=req, headers=resp_headers)
+    ret.content_type = response_content_type
+    ret.charset = 'utf-8'
+    return ret
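
As a quick illustration of get_response_headers() above, feeding it the FakeAccountBroker from the same module yields the baseline account headers. This is a sketch, not part of the patch, and assumes a tree where swift.account.utils is importable:

    from swift.account.utils import FakeAccountBroker, get_response_headers

    headers = get_response_headers(FakeAccountBroker())
    for name in sorted(headers):
        print(name, headers[name])
    # Expect X-Account-Bytes-Used, X-Account-Container-Count,
    # X-Account-Object-Count, X-Timestamp and X-PUT-Timestamp; the
    # X-Account-Storage-Policy-<name>-* headers only appear for real brokers
    # whose get_policy_stats() reports per-policy usage.
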
diff --git a/test/functionalnosetests/__init__.py b/swift/cli/__init__.py
similarity index 100%
rename from test/functionalnosetests/__init__.py
rename to swift/cli/__init__.py
diff --git a/bin/swift-account-audit b/swift/cli/account_audit.py
similarity index 71%
rename from bin/swift-account-audit
rename to swift/cli/account_audit.py
index 000048fa1a..6e67fff8e6 100755
--- a/bin/swift-account-audit
+++ b/swift/cli/account_audit.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,17 +16,17 @@
 
 import os
 import sys
-from urllib import quote
-from hashlib import md5
 import getopt
 from itertools import chain
 
-import simplejson
+import json
 from eventlet.greenpool import GreenPool
 from eventlet.event import Event
+from urllib.parse import quote
 
 from swift.common.ring import Ring
 from swift.common.utils import split_path
+from swift.common.utils.base import md5
 from swift.common.bufferedhttp import http_connect
 
 
@@ -43,9 +43,9 @@
 
 Examples!
 
-    %(cmd)s SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076
-    %(cmd)s SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container/object
-    %(cmd)s -e errors.txt SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container
+    %(cmd)s AUTH_88ad0b83-b2c5-4fa1-b2d6-60c597202076
+    %(cmd)s AUTH_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container/object
+    %(cmd)s -e errors.txt AUTH_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container
     %(cmd)s < errors.txt
     %(cmd)s -c 25 -d < errors.txt
 """ % {'cmd': sys.argv[0]}
@@ -55,10 +55,9 @@ class Auditor(object):
     def __init__(self, swift_dir='/etc/swift', concurrency=50, deep=False,
                  error_file=None):
         self.pool = GreenPool(concurrency)
-        self.object_ring = Ring(os.path.join(swift_dir, ring_name='object'))
-        self.container_ring = \
-            Ring(os.path.join(swift_dir, ring_name='container'))
-        self.account_ring = Ring(os.path.join(swift_dir, ring_name='account'))
+        self.object_ring = Ring(swift_dir, ring_name='object')
+        self.container_ring = Ring(swift_dir, ring_name='container')
+        self.account_ring = Ring(swift_dir, ring_name='account')
         self.deep = deep
         self.error_file = error_file
         # zero out stats
@@ -79,7 +78,7 @@ def audit_object(self, account, container, name):
         container_listing = self.audit_container(account, container)
         consistent = True
         if name not in container_listing:
-            print "  Object %s missing in container listing!" % path
+            print("  Object %s missing in container listing!" % path)
             consistent = False
             hash = None
         else:
@@ -91,7 +90,7 @@ def audit_object(self, account, container, name):
                     conn = http_connect(node['ip'], node['port'],
                                         node['device'], part, 'GET', path, {})
                     resp = conn.getresponse()
-                    calc_hash = md5()
+                    calc_hash = md5(usedforsecurity=False)
                     chunk = True
                     while chunk:
                         chunk = resp.read(8192)
@@ -100,15 +99,15 @@ def audit_object(self, account, container, name):
                     if resp.status // 100 != 2:
                         self.object_not_found += 1
                         consistent = False
-                        print '  Bad status GETting object "%s" on %s/%s' \
-                            % (path, node['ip'], node['device'])
+                        print('  Bad status %s GETting object "%s" on %s/%s'
+                              % (resp.status, path,
+                                 node['ip'], node['device']))
                         continue
                     if resp.getheader('ETag').strip('"') != calc_hash:
                         self.object_checksum_mismatch += 1
                         consistent = False
-                        print '  MD5 doesnt match etag for "%s" on %s/%s' \
-                            % (path, node['ip'], node['device'])
-                    etags.append(resp.getheader('ETag'))
+                        print('  MD5 does not match etag for "%s" on %s/%s'
+                              % (path, node['ip'], node['device']))
                 else:
                     conn = http_connect(node['ip'], node['port'],
                                         node['device'], part, 'HEAD',
@@ -117,28 +116,36 @@ def audit_object(self, account, container, name):
                     if resp.status // 100 != 2:
                         self.object_not_found += 1
                         consistent = False
-                        print '  Bad status HEADing object "%s" on %s/%s' \
-                            % (path, node['ip'], node['device'])
+                        print('  Bad status %s HEADing object "%s" on %s/%s'
+                              % (resp.status, path,
+                                 node['ip'], node['device']))
                         continue
-                    etags.append(resp.getheader('ETag'))
+
+                override_etag = resp.getheader(
+                    'X-Object-Sysmeta-Container-Update-Override-Etag')
+                if override_etag:
+                    etags.append((override_etag, node))
+                else:
+                    etags.append((resp.getheader('ETag'), node))
             except Exception:
                 self.object_exceptions += 1
                 consistent = False
-                print '  Exception fetching object "%s" on %s/%s' \
-                    % (path, node['ip'], node['device'])
+                print('  Exception fetching object "%s" on %s/%s'
+                      % (path, node['ip'], node['device']))
                 continue
         if not etags:
             consistent = False
-            print "  Failed fo fetch object %s at all!" % path
+            print("  Failed fo fetch object %s at all!" % path)
         elif hash:
-            for etag in etags:
-                if resp.getheader('ETag').strip('"') != hash:
+            for etag, node in etags:
+                if etag.strip('"') != hash:
                     consistent = False
                     self.object_checksum_mismatch += 1
-                    print '  ETag mismatch for "%s" on %s/%s' \
-                        % (path, node['ip'], node['device'])
+                    print('  ETag mismatch for "%s" on %s/%s'
+                          % (path, node['ip'], node['device']))
         if not consistent and self.error_file:
-            print >>open(self.error_file, 'a'), path
+            with open(self.error_file, 'a') as err_file:
+                print(path, file=err_file)
         self.objects_checked += 1
 
     def audit_container(self, account, name, recurse=False):
@@ -147,13 +154,13 @@ def audit_container(self, account, name, recurse=False):
         if (account, name) in self.list_cache:
             return self.list_cache[(account, name)]
         self.in_progress[(account, name)] = Event()
-        print 'Auditing container "%s"' % name
+        print('Auditing container "%s"' % name)
         path = '/%s/%s' % (account, name)
         account_listing = self.audit_account(account)
         consistent = True
         if name not in account_listing:
             consistent = False
-            print "  Container %s not in account listing!" % path
+            print("  Container %s not in account listing!" % path)
         part, nodes = \
             self.container_ring.get_nodes(account, name.encode('utf-8'))
         rec_d = {}
@@ -162,7 +169,6 @@ def audit_container(self, account, name, recurse=False):
             marker = ''
             results = True
             while results:
-                node_id = node['id']
                 try:
                     conn = http_connect(node['ip'], node['port'],
                                         node['device'], part, 'GET',
@@ -177,13 +183,14 @@ def audit_container(self, account, name, recurse=False):
                               (path, node['ip'], node['device']))
                         break
                     if node['id'] not in responses:
-                        responses[node['id']] = dict(resp.getheaders())
-                    results = simplejson.loads(resp.read())
+                        responses[node['id']] = {
+                            h.lower(): v for h, v in resp.getheaders()}
+                    results = json.loads(resp.read())
                 except Exception:
                     self.container_exceptions += 1
                     consistent = False
-                    print '  Exception GETting container "%s" on %s/%s' % \
-                        (path, node['ip'], node['device'])
+                    print('  Exception GETting container "%s" on %s/%s' %
+                          (path, node['ip'], node['device']))
                     break
                 if results:
                     marker = results[-1]['name']
@@ -204,13 +211,15 @@ def audit_container(self, account, name, recurse=False):
                       for header in responses.values()]
         if not obj_counts:
             consistent = False
-            print "  Failed to fetch container %s at all!" % path
+            print("  Failed to fetch container %s at all!" % path)
         else:
             if len(set(obj_counts)) != 1:
                 self.container_count_mismatch += 1
                 consistent = False
-                print "  Container databases don't agree on number of objects."
-                print "  Max: %s, Min: %s" % (max(obj_counts), min(obj_counts))
+                print(
+                    "  Container databases don't agree on number of objects.")
+                print(
+                    "  Max: %s, Min: %s" % (max(obj_counts), min(obj_counts)))
         self.containers_checked += 1
         self.list_cache[(account, name)] = rec_d
         self.in_progress[(account, name)].send(True)
@@ -219,7 +228,8 @@ def audit_container(self, account, name, recurse=False):
             for obj in rec_d.keys():
                 self.pool.spawn_n(self.audit_object, account, name, obj)
         if not consistent and self.error_file:
-            print >>open(self.error_file, 'a'), path
+            with open(self.error_file, 'a') as error_file:
+                print(path, file=error_file)
         return rec_d
 
     def audit_account(self, account, recurse=False):
@@ -228,7 +238,7 @@ def audit_account(self, account, recurse=False):
         if account in self.list_cache:
             return self.list_cache[account]
         self.in_progress[account] = Event()
-        print 'Auditing account "%s"' % account
+        print('Auditing account "%s"' % account)
         consistent = True
         path = '/%s' % account
         part, nodes = self.account_ring.get_nodes(account)
@@ -248,22 +258,23 @@ def audit_account(self, account, recurse=False):
                         self.account_not_found += 1
                         consistent = False
                         print("  Bad status GETting account '%s' "
-                              " from %ss:%ss" %
+                              " from %s:%s" %
                               (account, node['ip'], node['device']))
                         break
-                    results = simplejson.loads(resp.read())
+                    results = json.loads(resp.read())
                 except Exception:
                     self.account_exceptions += 1
                     consistent = False
-                    print("  Exception GETting account '%s' on %ss:%ss" %
+                    print("  Exception GETting account '%s' on %s:%s" %
                           (account, node['ip'], node['device']))
                     break
                 if node_id not in responses:
-                    responses[node_id] = [dict(resp.getheaders()), []]
+                    responses[node_id] = [
+                        {h.lower(): v for h, v in resp.getheaders()}, []]
                 responses[node_id][1].extend(results)
                 if results:
                     marker = results[-1]['name']
-        headers = [resp[0] for resp in responses.values()]
+        headers = [r[0] for r in responses.values()]
         cont_counts = [int(header['x-account-container-count'])
                        for header in headers]
         if len(set(cont_counts)) != 1:
@@ -272,8 +283,8 @@ def audit_account(self, account, recurse=False):
             print("  Account databases for '%s' don't agree on"
                   " number of containers." % account)
             if cont_counts:
-                print "  Max: %s, Min: %s" % (max(cont_counts),
-                                              min(cont_counts))
+                print("  Max: %s, Min: %s" % (max(cont_counts),
+                                              min(cont_counts)))
         obj_counts = [int(header['x-account-object-count'])
                       for header in headers]
         if len(set(obj_counts)) != 1:
@@ -282,8 +293,8 @@ def audit_account(self, account, recurse=False):
             print("  Account databases for '%s' don't agree on"
                   " number of objects." % account)
             if obj_counts:
-                print "  Max: %s, Min: %s" % (max(obj_counts),
-                                              min(obj_counts))
+                print("  Max: %s, Min: %s" % (max(obj_counts),
+                                              min(obj_counts)))
         containers = set()
         for resp in responses.values():
             containers.update(container['name'] for container in resp[1])
@@ -296,7 +307,8 @@ def audit_account(self, account, recurse=False):
                 self.pool.spawn_n(self.audit_container, account,
                                   container, True)
         if not consistent and self.error_file:
-            print >>open(self.error_file, 'a'), path
+            with open(self.error_file, 'a') as error_file:
+                print(path, file=error_file)
         return containers
 
     def audit(self, account, container=None, obj=None):
@@ -311,45 +323,50 @@ def wait(self):
         self.pool.waitall()
 
     def print_stats(self):
-        print
-        print "  Accounts checked: %d" % self.accounts_checked
+
+        def _print_stat(name, stat):
+            # Right align stat name in a field of 18 characters
+            print("{0:>18}: {1}".format(name, stat))
+
+        print()
+        _print_stat("Accounts checked", self.accounts_checked)
         if self.account_not_found:
-            print "  Missing Replicas: %d" % self.account_not_found
+            _print_stat("Missing Replicas", self.account_not_found)
         if self.account_exceptions:
-            print "        Exceptions: %d" % self.account_exceptions
+            _print_stat("Exceptions", self.account_exceptions)
         if self.account_container_mismatch:
-            print " Cntainer mismatch: %d" % self.account_container_mismatch
+            _print_stat("Container mismatch", self.account_container_mismatch)
         if self.account_object_mismatch:
-            print "   Object mismatch: %d" % self.account_object_mismatch
-        print
-        print "Containers checked: %d" % self.containers_checked
+            _print_stat("Object mismatch", self.account_object_mismatch)
+        print()
+        _print_stat("Containers checked", self.containers_checked)
         if self.container_not_found:
-            print "  Missing Replicas: %d" % self.container_not_found
+            _print_stat("Missing Replicas", self.container_not_found)
         if self.container_exceptions:
-            print "        Exceptions: %d" % self.container_exceptions
+            _print_stat("Exceptions", self.container_exceptions)
         if self.container_count_mismatch:
-            print "    Count mismatch: %d" % self.container_count_mismatch
+            _print_stat("Count mismatch", self.container_count_mismatch)
         if self.container_obj_mismatch:
-            print "      Obj mismatch: %d" % self.container_obj_mismatch
-        print
-        print "   Objects checked: %d" % self.objects_checked
+            _print_stat("Object mismatch", self.container_obj_mismatch)
+        print()
+        _print_stat("Objects checked", self.objects_checked)
         if self.object_not_found:
-            print "  Missing Replicas: %d" % self.object_not_found
+            _print_stat("Missing Replicas", self.object_not_found)
         if self.object_exceptions:
-            print "        Exceptions: %d" % self.object_exceptions
+            _print_stat("Exceptions", self.object_exceptions)
         if self.object_checksum_mismatch:
-            print "      MD5 Mismatch: %d" % self.object_checksum_mismatch
+            _print_stat("MD5 Mismatch", self.object_checksum_mismatch)
 
 
-if __name__ == '__main__':
+def main():
     try:
         optlist, args = getopt.getopt(sys.argv[1:], 'c:r:e:d')
-    except getopt.GetoptError, err:
-        print str(err)
-        print usage
+    except getopt.GetoptError as err:
+        print(str(err))
+        print(usage)
         sys.exit(2)
     if not args and os.isatty(sys.stdin.fileno()):
-        print usage
+        print(usage)
         sys.exit()
     opts = dict(optlist)
     options = {
@@ -366,3 +383,7 @@ def print_stats(self):
         auditor.audit(*split_path(path, 1, 3, True))
     auditor.wait()
     auditor.print_stats()
+
+
+if __name__ == '__main__':
+    main()
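
The deep-audit path above hashes each object body with the FIPS-friendly md5(usedforsecurity=False) wrapper and compares it against the quoted ETag header. A hedged sketch of that comparison; etag_matches is a hypothetical helper introduced here only for illustration:

    from swift.common.utils.base import md5

    def etag_matches(body_chunks, etag_header):
        # Hash the body the same way the auditor does and compare against the
        # ETag header with its surrounding quotes stripped.
        calc = md5(usedforsecurity=False)
        for chunk in body_chunks:
            calc.update(chunk)
        return calc.hexdigest() == etag_header.strip('"')

    # md5 of an empty body
    print(etag_matches([b''], '"d41d8cd98f00b204e9800998ecf8427e"'))  # True
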
diff --git a/swift/cli/config.py b/swift/cli/config.py
new file mode 100755
index 0000000000..447f77520a
--- /dev/null
+++ b/swift/cli/config.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import optparse
+import os
+import sys
+
+from swift.common.manager import Server
+from swift.common.utils import readconf
+from swift.common.wsgi import appconfig
+
+parser = optparse.OptionParser('%prog [options] SERVER')
+parser.add_option('-c', '--config-num', metavar="N", type="int",
+                  dest="number", default=0,
+                  help="parse config for the Nth server only")
+parser.add_option('-s', '--section', help="only display matching sections")
+parser.add_option('-w', '--wsgi', action='store_true',
+                  help="use wsgi/paste parser instead of readconf")
+
+
+def _context_name(context):
+    return ':'.join((context.object_type.name, context.name))
+
+
+def inspect_app_config(app_config):
+    conf = {}
+    context = app_config.context
+    section_name = _context_name(context)
+    conf[section_name] = context.config()
+    if context.object_type.name == 'pipeline':
+        filters = context.filter_contexts
+        pipeline = []
+        for filter_context in filters:
+            conf[_context_name(filter_context)] = filter_context.config()
+            pipeline.append(filter_context.entry_point_name)
+        app_context = context.app_context
+        conf[_context_name(app_context)] = app_context.config()
+        pipeline.append(app_context.entry_point_name)
+        conf[section_name]['pipeline'] = ' '.join(pipeline)
+    return conf
+
+
+def main():
+    options, args = parser.parse_args()
+    options = dict(vars(options))
+
+    if not args:
+        return 'ERROR: specify type of server or conf_path'
+    conf_files = []
+    for arg in args:
+        if os.path.exists(arg):
+            conf_files.append(arg)
+        else:
+            conf_files += Server(arg).conf_files(**options)
+    for conf_file in conf_files:
+        print('# %s' % conf_file)
+        if options['wsgi']:
+            app_config = appconfig(conf_file)
+            conf = inspect_app_config(app_config)
+        else:
+            conf = readconf(conf_file)
+        flat_vars = {}
+        for k, v in conf.items():
+            if options['section'] and k != options['section']:
+                continue
+            if not isinstance(v, dict):
+                flat_vars[k] = v
+                continue
+            print('[%s]' % k)
+            for opt, value in v.items():
+                print('%s = %s' % (opt, value))
+            print()
+        for k, v in flat_vars.items():
+            print('# %s = %s' % (k, v))
+        print()
+
+
+if __name__ == "__main__":
+    sys.exit(main())
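
For the default (non --wsgi) path, the tool above simply walks readconf() output: dict values are printed as sections and anything else is reported as a flat option. A self-contained sketch of that shape, assuming only swift.common.utils.readconf; the sample config content is invented for illustration:

    import tempfile
    import textwrap
    from swift.common.utils import readconf

    SAMPLE = textwrap.dedent("""\
        [DEFAULT]
        bind_port = 6202

        [app:account-server]
        use = egg:swift#account
        """)

    with tempfile.NamedTemporaryFile('w', suffix='.conf') as f:
        f.write(SAMPLE)
        f.flush()
        for section, options in readconf(f.name).items():
            if not isinstance(options, dict):
                continue  # e.g. the __file__ entry readconf adds
            print('[%s]' % section)
            for opt, value in options.items():
                print('%s = %s' % (opt, value))
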
diff --git a/swift/cli/container_deleter.py b/swift/cli/container_deleter.py
new file mode 100644
index 0000000000..7b3ae95892
--- /dev/null
+++ b/swift/cli/container_deleter.py
@@ -0,0 +1,168 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy
+# of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+'''
+Enqueue background jobs to delete portions of a container's namespace.
+
+Accepts prefix, marker, and end-marker args that work as in container
+listings. Objects found in the listing will be marked to be deleted
+by the object-expirer; until the object is actually deleted, it will
+continue to appear in listings.
+
+If there are many objects, this operation may take some time. Stats will
+periodically be emitted so you know the process hasn't hung. These will
+also include the last object marked for deletion; if there is a failure,
+pass this as the ``--marker`` when retrying to minimize duplicative work.
+'''
+
+import argparse
+import io
+import itertools
+import json
+import time
+
+from swift.common.internal_client import InternalClient
+from swift.common.utils import Timestamp, MD5_OF_EMPTY_STRING
+from swift.obj.expirer import build_task_obj, ASYNC_DELETE_TYPE
+
+OBJECTS_PER_UPDATE = 10000
+
+
+def make_delete_jobs(account, container, objects, timestamp):
+    '''
+    Create a list of async-delete jobs
+
+    :param account: (native or unicode string) account to delete from
+    :param container: (native or unicode string) container to delete from
+    :param objects: (list of native or unicode strings) objects to delete
+    :param timestamp: (Timestamp) time at which objects should be marked
+                      deleted
+    :returns: list of dicts appropriate for an UPDATE request to an
+              expiring-object queue
+    '''
+    return [
+        {
+            'name': build_task_obj(
+                timestamp, account, container,
+                obj, high_precision=True),
+            'deleted': 0,
+            'created_at': timestamp.internal,
+            'etag': MD5_OF_EMPTY_STRING,
+            'size': 0,
+            'storage_policy_index': 0,
+            'content_type': ASYNC_DELETE_TYPE,
+        } for obj in objects]
+
+
+def mark_for_deletion(swift, account, container, marker, end_marker,
+                      prefix, timestamp=None, yield_time=10):
+    '''
+    Enqueue jobs to async-delete some portion of a container's namespace
+
+    :param swift: InternalClient to use
+    :param account: account to delete from
+    :param container: container to delete from
+    :param marker: only delete objects after this name
+    :param end_marker: only delete objects before this name. Use ``None`` or
+                       empty string to delete to the end of the namespace.
+    :param prefix: only delete objects starting with this prefix
+    :param timestamp: delete all objects as of this time. If ``None``, the
+                      current time will be used.
+    :param yield_time: approximate period with which intermediate results
+                       should be returned. If ``None``, disable intermediate
+                       results.
+    :returns: If ``yield_time`` is ``None``, the number of objects marked for
+              deletion. Otherwise, a generator that will yield out tuples of
+              ``(number of marked objects, last object name)`` approximately
+              every ``yield_time`` seconds. The final tuple will have ``None``
+              as the second element. This form allows you to retry when an
+              error occurs partway through while minimizing duplicate work.
+    '''
+    if timestamp is None:
+        timestamp = Timestamp.now()
+
+    def enqueue_deletes():
+        deleted = 0
+        obj_iter = swift.iter_objects(
+            account, container,
+            marker=marker, end_marker=end_marker, prefix=prefix)
+        time_marker = time.time()
+        while True:
+            to_delete = [obj['name'] for obj in itertools.islice(
+                obj_iter, OBJECTS_PER_UPDATE)]
+            if not to_delete:
+                break
+            delete_jobs = make_delete_jobs(
+                account, container, to_delete, timestamp)
+            swift.make_request(
+                'UPDATE',
+                swift.make_path('.expiring_objects', str(int(timestamp))),
+                headers={'X-Backend-Allow-Private-Methods': 'True',
+                         'X-Backend-Storage-Policy-Index': '0',
+                         'X-Timestamp': timestamp.internal},
+                acceptable_statuses=(2,),
+                body_file=io.BytesIO(json.dumps(delete_jobs).encode('ascii')))
+            deleted += len(delete_jobs)
+            if yield_time is not None and \
+                    time.time() - time_marker > yield_time:
+                yield deleted, to_delete[-1]
+                time_marker = time.time()
+        yield deleted, None
+
+    if yield_time is None:
+        for deleted, marker in enqueue_deletes():
+            if marker is None:
+                return deleted
+    else:
+        return enqueue_deletes()
+
+
+def main(args=None):
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('--config', default='/etc/swift/internal-client.conf',
+                        help=('internal-client config file '
+                              '(default: /etc/swift/internal-client.conf)'))
+    parser.add_argument('--request-tries', type=int, default=3,
+                        help='(default: 3)')
+    parser.add_argument('account', help='account from which to delete')
+    parser.add_argument('container', help='container from which to delete')
+    parser.add_argument(
+        '--prefix', default='',
+        help='only delete objects with this prefix (default: none)')
+    parser.add_argument(
+        '--marker', default='',
+        help='only delete objects after this marker (default: none)')
+    parser.add_argument(
+        '--end-marker', default='',
+        help='only delete objects before this end-marker (default: none)')
+    parser.add_argument(
+        '--timestamp', type=Timestamp, default=Timestamp.now(),
+        help='delete all objects as of this time (default: now)')
+    args = parser.parse_args(args)
+
+    swift = InternalClient(
+        args.config, 'Swift Container Deleter', args.request_tries,
+        global_conf={'log_name': 'container-deleter-ic'})
+    for deleted, marker in mark_for_deletion(
+            swift, args.account, args.container,
+            args.marker, args.end_marker, args.prefix, args.timestamp):
+        if marker is None:
+            print('Finished. Marked %d objects for deletion.' % deleted)
+        else:
+            print('Marked %d objects for deletion, through %r' % (
+                deleted, marker))
+
+
+if __name__ == '__main__':
+    main()
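
To make the queue format above concrete: each entry produced by make_delete_jobs() names an expirer task object that encodes the delete-at timestamp and the fully qualified object path. A hedged sketch; the account, container, and object names are invented:

    from swift.common.utils import Timestamp
    from swift.cli.container_deleter import make_delete_jobs

    jobs = make_delete_jobs('AUTH_test', 'photos', ['cat.jpg'], Timestamp.now())
    # Each job is one row destined for the .expiring_objects queue; the task
    # name looks like '<timestamp>-AUTH_test/photos/cat.jpg' and the content
    # type marks it as an async delete rather than a normal expiration.
    print(jobs[0]['name'], jobs[0]['content_type'])
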
diff --git a/swift/cli/dispersion_populate.py b/swift/cli/dispersion_populate.py
new file mode 100755
index 0000000000..b9508f551c
--- /dev/null
+++ b/swift/cli/dispersion_populate.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+# Copyright (c) 2010-2012 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import traceback
+from optparse import OptionParser
+from sys import exit, stdout
+from time import time
+
+from eventlet import GreenPool, patcher, sleep
+from eventlet.pools import Pool
+from configparser import ConfigParser
+
+from swift.common.internal_client import SimpleClient
+from swift.common.ring import Ring
+from swift.common.utils import compute_eta, get_time_units, config_true_value
+from swift.common.storage_policy import POLICIES
+
+insecure = False
+
+
+def put_container(connpool, container, report, headers):
+    global retries_done
+    try:
+        with connpool.item() as conn:
+            conn.put_container(container, headers=headers)
+            retries_done += conn.attempts - 1
+        if report:
+            report(True)
+    except Exception:
+        if report:
+            report(False)
+        raise
+
+
+def put_object(connpool, container, obj, report):
+    global retries_done
+    try:
+        with connpool.item() as conn:
+            data = io.BytesIO(obj.encode('utf8'))
+            conn.put_object(container, obj, data,
+                            headers={'x-object-meta-dispersion': obj})
+            retries_done += conn.attempts - 1
+        if report:
+            report(True)
+    except Exception:
+        if report:
+            report(False)
+        raise
+
+
+def report(success):
+    global begun, created, item_type, next_report, need_to_create, retries_done
+    if not success:
+        traceback.print_exc()
+        exit('Gave up due to error(s).')
+    created += 1
+    if time() < next_report:
+        return
+    next_report = time() + 5
+    eta, eta_unit = compute_eta(begun, created, need_to_create)
+    print('\r\x1B[KCreating %s: %d of %d, %d%s left, %d retries'
+          % (item_type, created, need_to_create, round(eta), eta_unit,
+             retries_done), end='')
+    stdout.flush()
+
+
+def main():
+    global begun, created, item_type, next_report, need_to_create, retries_done
+    patcher.monkey_patch()
+    try:
+        # Delay importing so urllib3 will import monkey-patched modules
+        from swiftclient import get_auth
+    except ImportError:
+        from swift.common.internal_client import get_auth
+
+    conffile = '/etc/swift/dispersion.conf'
+
+    parser = OptionParser(usage='''
+Usage: %%prog [options] [conf_file]
+
+[conf_file] defaults to %s'''.strip() % conffile)
+    parser.add_option('--container-only', action='store_true', default=False,
+                      help='Only run container population')
+    parser.add_option('--object-only', action='store_true', default=False,
+                      help='Only run object population')
+    parser.add_option('--container-suffix-start', type=int, default=0,
+                      help='container suffix start value, defaults to 0')
+    parser.add_option('--object-suffix-start', type=int, default=0,
+                      help='object suffix start value, defaults to 0')
+    parser.add_option('--insecure', action='store_true', default=False,
+                      help='Allow accessing insecure keystone server. '
+                           'The keystone\'s certificate will not be verified.')
+    parser.add_option('--no-overlap', action='store_true', default=False,
+                      help="No overlap of partitions if running populate \
+                      more than once. Will increase coverage by amount shown \
+                      in dispersion.conf file")
+    parser.add_option('-P', '--policy-name', dest='policy_name',
+                      help="Specify storage policy name")
+
+    options, args = parser.parse_args()
+
+    if args:
+        conffile = args.pop(0)
+
+    c = ConfigParser()
+    if not c.read(conffile):
+        exit('Unable to read config file: %s' % conffile)
+    conf = dict(c.items('dispersion'))
+
+    if options.policy_name is None:
+        policy = POLICIES.default
+    else:
+        policy = POLICIES.get_by_name(options.policy_name)
+        if policy is None:
+            exit('Unable to find policy: %s' % options.policy_name)
+    print('Using storage policy: %s ' % policy.name)
+
+    swift_dir = conf.get('swift_dir', '/etc/swift')
+    dispersion_coverage = float(conf.get('dispersion_coverage', 1))
+    retries = int(conf.get('retries', 5))
+    concurrency = int(conf.get('concurrency', 25))
+    endpoint_type = str(conf.get('endpoint_type', 'publicURL'))
+    region_name = str(conf.get('region_name', ''))
+    user_domain_name = str(conf.get('user_domain_name', ''))
+    project_domain_name = str(conf.get('project_domain_name', ''))
+    project_name = str(conf.get('project_name', ''))
+    insecure = options.insecure \
+        or config_true_value(conf.get('keystone_api_insecure', 'no'))
+    container_populate = config_true_value(
+        conf.get('container_populate', 'yes')) and not options.object_only
+    object_populate = config_true_value(
+        conf.get('object_populate', 'yes')) and not options.container_only
+
+    if not (object_populate or container_populate):
+        exit("Neither container or object populate is set to run")
+
+    coropool = GreenPool(size=concurrency)
+    retries_done = 0
+
+    os_options = {'endpoint_type': endpoint_type}
+    if user_domain_name:
+        os_options['user_domain_name'] = user_domain_name
+    if project_domain_name:
+        os_options['project_domain_name'] = project_domain_name
+    if project_name:
+        os_options['project_name'] = project_name
+    if region_name:
+        os_options['region_name'] = region_name
+
+    url, token = get_auth(conf['auth_url'], conf['auth_user'],
+                          conf['auth_key'],
+                          auth_version=conf.get('auth_version', '1.0'),
+                          os_options=os_options,
+                          insecure=insecure)
+    account = url.rsplit('/', 1)[1]
+    connpool = Pool(max_size=concurrency)
+    headers = {}
+    headers['X-Storage-Policy'] = policy.name
+    connpool.create = lambda: SimpleClient(
+        url=url, token=token, retries=retries)
+
+    if container_populate:
+        container_ring = Ring(swift_dir, ring_name='container')
+        parts_left = dict((x, x)
+                          for x in range(container_ring.partition_count))
+
+        if options.no_overlap:
+            with connpool.item() as conn:
+                containers = [cont['name'] for cont in conn.get_account(
+                    prefix='dispersion_%d' % policy.idx, full_listing=True)[1]]
+            containers_listed = len(containers)
+            if containers_listed > 0:
+                for container in containers:
+                    partition, _junk = container_ring.get_nodes(account,
+                                                                container)
+                    if partition in parts_left:
+                        del parts_left[partition]
+
+        item_type = 'containers'
+        created = 0
+        retries_done = 0
+        need_to_create = need_to_queue = \
+            dispersion_coverage / 100.0 * container_ring.partition_count
+        begun = next_report = time()
+        next_report += 2
+        suffix = 0
+        while need_to_queue >= 1 and parts_left:
+            container = 'dispersion_%d_%d' % (policy.idx, suffix)
+            part = container_ring.get_part(account, container)
+            if part in parts_left:
+                if suffix >= options.container_suffix_start:
+                    coropool.spawn(put_container, connpool, container, report,
+                                   headers)
+                    sleep()
+                else:
+                    report(True)
+                del parts_left[part]
+                need_to_queue -= 1
+            suffix += 1
+        coropool.waitall()
+        elapsed, elapsed_unit = get_time_units(time() - begun)
+        print('\r\x1B[KCreated %d containers for dispersion reporting, '
+              '%d%s, %d retries' %
+              ((need_to_create - need_to_queue), round(elapsed), elapsed_unit,
+               retries_done))
+        if options.no_overlap:
+            con_coverage = container_ring.partition_count - len(parts_left)
+            print('\r\x1B[KTotal container coverage is now %.2f%%.' %
+                  ((float(con_coverage) / container_ring.partition_count
+                    * 100)))
+        stdout.flush()
+
+    if object_populate:
+        container = 'dispersion_objects_%d' % policy.idx
+        put_container(connpool, container, None, headers)
+        object_ring = Ring(swift_dir, ring_name=policy.ring_name)
+        parts_left = dict((x, x) for x in range(object_ring.partition_count))
+
+        if options.no_overlap:
+            with connpool.item() as conn:
+                obj_container = [cont_b['name'] for cont_b in conn.get_account(
+                    prefix=container, full_listing=True)[1]]
+            if obj_container:
+                with connpool.item() as conn:
+                    objects = [o['name'] for o in
+                               conn.get_container(container,
+                                                  prefix='dispersion_',
+                                                  full_listing=True)[1]]
+                for my_object in objects:
+                    partition = object_ring.get_part(account, container,
+                                                     my_object)
+                    if partition in parts_left:
+                        del parts_left[partition]
+
+        item_type = 'objects'
+        created = 0
+        retries_done = 0
+        need_to_create = need_to_queue = \
+            dispersion_coverage / 100.0 * object_ring.partition_count
+        begun = next_report = time()
+        next_report += 2
+        suffix = 0
+        while need_to_queue >= 1 and parts_left:
+            obj = 'dispersion_%d' % suffix
+            part = object_ring.get_part(account, container, obj)
+            if part in parts_left:
+                if suffix >= options.object_suffix_start:
+                    coropool.spawn(
+                        put_object, connpool, container, obj, report)
+                    sleep()
+                else:
+                    report(True)
+                del parts_left[part]
+                need_to_queue -= 1
+            suffix += 1
+        coropool.waitall()
+        elapsed, elapsed_unit = get_time_units(time() - begun)
+        print('\r\x1B[KCreated %d objects for dispersion reporting, '
+              '%d%s, %d retries' %
+              ((need_to_create - need_to_queue), round(elapsed), elapsed_unit,
+               retries_done))
+        if options.no_overlap:
+            obj_coverage = object_ring.partition_count - len(parts_left)
+            print('\r\x1B[KTotal object coverage is now %.2f%%.' %
+                  ((float(obj_coverage) / object_ring.partition_count * 100)))
+        stdout.flush()
+
+
+if __name__ == '__main__':
+    main()
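
The populate loop above walks an incrementing name suffix and only issues a PUT when the candidate name lands on a ring partition that has not been covered yet, stopping once the requested coverage is reached. A rough standalone sketch of that scan follows; `ring`, `account` and `policy_idx` are illustrative stand-ins (the `ring` is assumed to expose `partition_count` and `get_part(account, container)` like swift.common.ring.Ring), not names taken from the patch.

# Minimal sketch of the suffix scan used by the populate loop above.
def pick_dispersion_containers(ring, account, policy_idx, coverage_pct):
    # Partitions that still need a dispersion container placed on them.
    parts_left = set(range(ring.partition_count))
    need_to_queue = coverage_pct / 100.0 * ring.partition_count
    chosen, suffix = [], 0
    while need_to_queue >= 1 and parts_left:
        name = 'dispersion_%d_%d' % (policy_idx, suffix)
        part = ring.get_part(account, name)
        if part in parts_left:
            # A new partition is covered: remember the name and count it.
            parts_left.remove(part)
            chosen.append(name)
            need_to_queue -= 1
        suffix += 1
    return chosen
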
diff --git a/bin/swift-dispersion-report b/swift/cli/dispersion_report.py
old mode 100755
new mode 100644
similarity index 56%
rename from bin/swift-dispersion-report
rename to swift/cli/dispersion_report.py
index 590dede13c..20b0f7e114
--- a/bin/swift-dispersion-report
+++ b/swift/cli/dispersion_report.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright (c) 2010-2012 OpenStack, LLC.
+# Copyright (c) 2010-2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,29 +14,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from ConfigParser import ConfigParser
+import json
+from collections import defaultdict
+from configparser import ConfigParser
 from optparse import OptionParser
 from sys import exit, stdout, stderr
 from time import time
-try:
-    import simplejson as json
-except ImportError:
-    import json
 
 from eventlet import GreenPool, hubs, patcher, Timeout
 from eventlet.pools import Pool
 
 from swift.common import direct_client
-from swiftclient import ClientException, Connection, get_auth
+from swift.common.internal_client import SimpleClient
 from swift.common.ring import Ring
-from swift.common.utils import compute_eta, get_time_units, config_true_value
+from swift.common.exceptions import ClientException
+from swift.common.utils import compute_eta, get_time_units, \
+    config_true_value, node_to_string
+from swift.common.storage_policy import POLICIES
 
 
 unmounted = []
 notfound = []
 json_output = False
 debug = False
+insecure = False
 
 
 def get_error_log(prefix):
@@ -50,103 +51,106 @@ def error_log(msg_or_exc):
             if msg_or_exc.http_status == 507:
                 if identifier not in unmounted:
                     unmounted.append(identifier)
-                    print >>stderr, 'ERROR: %s is unmounted -- This will ' \
-                        'cause replicas designated for that device to be ' \
-                        'considered missing until resolved or the ring is ' \
-                        'updated.' % (identifier)
+                    print('ERROR: %s is unmounted -- This will '
+                          'cause replicas designated for that device to be '
+                          'considered missing until resolved or the ring is '
+                          'updated.' % (identifier), file=stderr)
                     stderr.flush()
             if debug and identifier not in notfound:
                 notfound.append(identifier)
-                print >>stderr, 'ERROR: %s returned a 404' % (identifier)
+                print('ERROR: %s returned a 404' % (identifier), file=stderr)
                 stderr.flush()
         if not hasattr(msg_or_exc, 'http_status') or \
                 msg_or_exc.http_status not in (404, 507):
-            print >>stderr, 'ERROR: %s: %s' % (prefix, msg_or_exc)
+            print('ERROR: %s: %s' % (prefix, msg_or_exc), file=stderr)
             stderr.flush()
     return error_log
 
 
 def container_dispersion_report(coropool, connpool, account, container_ring,
-                                retries, output_missing_partitions):
+                                retries, output_missing_partitions, policy):
     with connpool.item() as conn:
         containers = [c['name'] for c in conn.get_account(
-            prefix='dispersion_', full_listing=True)[1]]
+            prefix='dispersion_%d' % policy.idx, full_listing=True)[1]]
     containers_listed = len(containers)
     if not containers_listed:
-        print >>stderr, 'No containers to query. Has ' \
-                        'swift-dispersion-populate been run?'
+        print('No containers to query. Has '
+              'swift-dispersion-populate been run?', file=stderr)
         stderr.flush()
         return
     retries_done = [0]
     containers_queried = [0]
-    container_copies_found = [0] * (container_ring.replica_count + 1)
+    container_copies_missing = defaultdict(int)
+    container_copies_found = [0]
+    container_copies_expected = [0]
     begun = time()
     next_report = [time() + 2]
 
     def direct(container, part, nodes):
         found_count = 0
         for node in nodes:
-            error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
+            error_log = get_error_log(node_to_string(node))
             try:
                 attempts, _junk = direct_client.retry(
                     direct_client.direct_head_container, node, part, account,
                     container, error_log=error_log, retries=retries)
                 retries_done[0] += attempts - 1
                 found_count += 1
-            except ClientException, err:
+            except ClientException as err:
                 if err.http_status not in (404, 507):
                     error_log('Giving up on /%s/%s/%s: %s' % (part, account,
                               container, err))
-            except (Exception, Timeout), err:
+            except (Exception, Timeout) as err:
                 error_log('Giving up on /%s/%s/%s: %s' % (part, account,
                           container, err))
         if output_missing_partitions and \
-                found_count < container_ring.replica_count:
-            missing = container_ring.replica_count - found_count
-            print '\r\x1B[K',
+                found_count < len(nodes):
+            missing = len(nodes) - found_count
+            print('\r\x1B[K', end='')
             stdout.flush()
-            print >>stderr, '# Container partition %s missing %s cop%s' % (
-                part, missing, 'y' if missing == 1 else 'ies')
-        container_copies_found[found_count] += 1
+            print('# Container partition %s missing %s cop%s' % (
+                part, missing, 'y' if missing == 1 else 'ies'), file=stderr)
+        container_copies_found[0] += found_count
         containers_queried[0] += 1
+        container_copies_missing[len(nodes) - found_count] += 1
         if time() >= next_report[0]:
             next_report[0] = time() + 5
             eta, eta_unit = compute_eta(begun, containers_queried[0],
                                         containers_listed)
             if not json_output:
-                print '\r\x1B[KQuerying containers: %d of %d, %d%s left, %d ' \
+                print('\r\x1B[KQuerying containers: %d of %d, %d%s left, %d '
                       'retries' % (containers_queried[0], containers_listed,
-                      round(eta), eta_unit, retries_done[0]),
+                                   round(eta), eta_unit, retries_done[0]),
+                      end='')
                 stdout.flush()
     container_parts = {}
     for container in containers:
         part, nodes = container_ring.get_nodes(account, container)
         if part not in container_parts:
+            container_copies_expected[0] += len(nodes)
             container_parts[part] = part
             coropool.spawn(direct, container, part, nodes)
     coropool.waitall()
     distinct_partitions = len(container_parts)
-    copies_expected = distinct_partitions * container_ring.replica_count
-    copies_found = sum(a * b for a, b in enumerate(container_copies_found))
+    copies_found = container_copies_found[0]
+    copies_expected = container_copies_expected[0]
     value = 100.0 * copies_found / copies_expected
     elapsed, elapsed_unit = get_time_units(time() - begun)
+    container_copies_missing.pop(0, None)
     if not json_output:
-        print '\r\x1B[KQueried %d containers for dispersion reporting, ' \
+        print('\r\x1B[KQueried %d containers for dispersion reporting, '
               '%d%s, %d retries' % (containers_listed, round(elapsed),
-              elapsed_unit, retries_done[0])
+                                    elapsed_unit, retries_done[0]))
         if containers_listed - distinct_partitions:
-            print 'There were %d overlapping partitions' % (
-                  containers_listed - distinct_partitions)
-        for copies in xrange(container_ring.replica_count - 1, -1, -1):
-            missing_copies = container_ring.replica_count - copies
-            if container_copies_found[copies]:
-                print missing_string(container_copies_found[copies],
-                                     missing_copies,
-                                     container_ring.replica_count)
-        print '%.02f%% of container copies found (%d of %d)' % (
-            value, copies_found, copies_expected)
-        print 'Sample represents %.02f%% of the container partition space' % (
-            100.0 * distinct_partitions / container_ring.partition_count)
+            print('There were %d overlapping partitions' % (
+                  containers_listed - distinct_partitions))
+        for missing_copies, num_parts in container_copies_missing.items():
+            print(missing_string(num_parts, missing_copies,
+                                 container_ring.replica_count))
+        print('%.02f%% of container copies found (%d of %d)' % (
+            value, copies_found, copies_expected))
+        print('Sample represents %.02f%% of the container partition space' % (
+            100.0 * distinct_partitions / container_ring.partition_count))
         stdout.flush()
         return None
     else:
@@ -155,102 +159,112 @@ def direct(container, part, nodes):
                    'pct_found': value,
                    'copies_found': copies_found,
                    'copies_expected': copies_expected}
-        for copies in xrange(container_ring.replica_count):
-            missing_copies = container_ring.replica_count - copies
-            results['missing_%d' % (missing_copies)] = \
-                container_copies_found[copies]
+        for missing_copies, num_parts in container_copies_missing.items():
+            results['missing_%d' % (missing_copies)] = num_parts
         return results
 
 
 def object_dispersion_report(coropool, connpool, account, object_ring,
-                             retries, output_missing_partitions):
-    container = 'dispersion_objects'
+                             retries, output_missing_partitions, policy):
+    container = 'dispersion_objects_%d' % policy.idx
     with connpool.item() as conn:
         try:
             objects = [o['name'] for o in conn.get_container(
                 container, prefix='dispersion_', full_listing=True)[1]]
-        except ClientException, err:
+        except ClientException as err:
             if err.http_status != 404:
                 raise
-            print >>stderr, 'No objects to query. Has ' \
-                            'swift-dispersion-populate been run?'
+
+            print('No objects to query. Has '
+                  'swift-dispersion-populate been run?', file=stderr)
             stderr.flush()
             return
     objects_listed = len(objects)
     if not objects_listed:
-        print >>stderr, 'No objects to query. Has swift-dispersion-populate ' \
-                        'been run?'
+        print('No objects to query. Has swift-dispersion-populate '
+              'been run?', file=stderr)
         stderr.flush()
         return
     retries_done = [0]
     objects_queried = [0]
-    object_copies_found = [0] * (object_ring.replica_count + 1)
+    object_copies_found = [0]
+    object_copies_expected = [0]
+    object_copies_missing = defaultdict(int)
     begun = time()
     next_report = [time() + 2]
 
+    headers = None
+    if policy is not None:
+        headers = {}
+        headers['X-Backend-Storage-Policy-Index'] = int(policy)
+
     def direct(obj, part, nodes):
         found_count = 0
         for node in nodes:
-            error_log = get_error_log('%(ip)s:%(port)s/%(device)s' % node)
+            error_log = get_error_log(node_to_string(node))
             try:
                 attempts, _junk = direct_client.retry(
                     direct_client.direct_head_object, node, part, account,
-                    container, obj, error_log=error_log, retries=retries)
+                    container, obj, error_log=error_log, retries=retries,
+                    headers=headers)
                 retries_done[0] += attempts - 1
                 found_count += 1
-            except ClientException, err:
+            except ClientException as err:
                 if err.http_status not in (404, 507):
                     error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
                               container, obj, err))
-            except (Exception, Timeout), err:
+            except (Exception, Timeout) as err:
                 error_log('Giving up on /%s/%s/%s/%s: %s' % (part, account,
                           container, obj, err))
         if output_missing_partitions and \
-                found_count < object_ring.replica_count:
-            missing = object_ring.replica_count - found_count
-            print '\r\x1B[K',
+                found_count < len(nodes):
+            missing = len(nodes) - found_count
+            print('\r\x1B[K', end='')
             stdout.flush()
-            print >>stderr, '# Object partition %s missing %s cop%s' % (
-                part, missing, 'y' if missing == 1 else 'ies')
-        object_copies_found[found_count] += 1
+            print('# Object partition %s missing %s cop%s' % (
+                part, missing, 'y' if missing == 1 else 'ies'), file=stderr)
+        object_copies_found[0] += found_count
+        object_copies_missing[len(nodes) - found_count] += 1
         objects_queried[0] += 1
         if time() >= next_report[0]:
             next_report[0] = time() + 5
             eta, eta_unit = compute_eta(begun, objects_queried[0],
                                         objects_listed)
             if not json_output:
-                print '\r\x1B[KQuerying objects: %d of %d, %d%s left, %d ' \
+                print('\r\x1B[KQuerying objects: %d of %d, %d%s left, %d '
                       'retries' % (objects_queried[0], objects_listed,
                                    round(eta), eta_unit, retries_done[0]),
+                      end='')
             stdout.flush()
     object_parts = {}
     for obj in objects:
         part, nodes = object_ring.get_nodes(account, container, obj)
         if part not in object_parts:
+            object_copies_expected[0] += len(nodes)
             object_parts[part] = part
             coropool.spawn(direct, obj, part, nodes)
     coropool.waitall()
     distinct_partitions = len(object_parts)
-    copies_expected = distinct_partitions * object_ring.replica_count
-    copies_found = sum(a * b for a, b in enumerate(object_copies_found))
+    copies_found = object_copies_found[0]
+    copies_expected = object_copies_expected[0]
     value = 100.0 * copies_found / copies_expected
     elapsed, elapsed_unit = get_time_units(time() - begun)
     if not json_output:
-        print '\r\x1B[KQueried %d objects for dispersion reporting, ' \
+        print('\r\x1B[KQueried %d objects for dispersion reporting, '
               '%d%s, %d retries' % (objects_listed, round(elapsed),
-              elapsed_unit, retries_done[0])
+                                    elapsed_unit, retries_done[0]))
         if objects_listed - distinct_partitions:
-            print 'There were %d overlapping partitions' % (
-                  objects_listed - distinct_partitions)
-        for copies in xrange(object_ring.replica_count - 1, -1, -1):
-            missing_copies = object_ring.replica_count - copies
-            if object_copies_found[copies]:
-                print missing_string(object_copies_found[copies],
-                                     missing_copies, object_ring.replica_count)
-        print '%.02f%% of object copies found (%d of %d)' % \
-            (value, copies_found, copies_expected)
-        print 'Sample represents %.02f%% of the object partition space' % (
-            100.0 * distinct_partitions / object_ring.partition_count)
+            print('There were %d overlapping partitions' % (
+                  objects_listed - distinct_partitions))
+
+        for missing_copies, num_parts in object_copies_missing.items():
+            print(missing_string(num_parts, missing_copies,
+                                 object_ring.replica_count))
+
+        print('%.02f%% of object copies found (%d of %d)' %
+              (value, copies_found, copies_expected))
+        print('Sample represents %.02f%% of the object partition space' % (
+            100.0 * distinct_partitions / object_ring.partition_count))
         stdout.flush()
         return None
     else:
@@ -259,10 +273,9 @@ def direct(obj, part, nodes):
                    'pct_found': value,
                    'copies_found': copies_found,
                    'copies_expected': copies_expected}
-        for copies in xrange(object_ring.replica_count):
-            missing_copies = object_ring.replica_count - copies
-            results['missing_%d' % (missing_copies)] = \
-                object_copies_found[copies]
+
+        for missing_copies, num_parts in object_copies_missing.items():
+            results['missing_%d' % (missing_copies,)] = num_parts
         return results
 
 
@@ -281,9 +294,9 @@ def missing_string(partition_count, missing_copies, copy_count):
         verb_string = 'were'
         partition_string = 'partitions'
 
-    copy_string = 'copy'
-    if missing_copies > 1:
-        copy_string = 'copies'
+    copy_string = 'copies'
+    if missing_copies == 1:
+        copy_string = 'copy'
 
     return '%sThere %s %d %s missing %s %s.' % (
         exclamations, verb_string, partition_count, partition_string,
@@ -291,7 +304,7 @@ def missing_string(partition_count, missing_copies, copy_count):
     )
 
 
-if __name__ == '__main__':
+def main():
     patcher.monkey_patch()
     hubs.get_hub().debug_exceptions = False
 
@@ -311,52 +324,110 @@ def missing_string(partition_count, missing_copies, copy_count):
                       help='Only run container report')
     parser.add_option('--object-only', action='store_true', default=False,
                       help='Only run object report')
-    options, args = parser.parse_args()
+    parser.add_option('--insecure', action='store_true', default=False,
+                      help='Allow accessing insecure keystone server. '
+                           'The keystone\'s certificate will not be verified.')
+    parser.add_option('-P', '--policy-name', dest='policy_name',
+                      help="Specify storage policy name")
 
+    options, args = parser.parse_args()
     if args:
         conffile = args.pop(0)
 
+    if options.debug:
+        global debug
+        debug = True
+
     c = ConfigParser()
     if not c.read(conffile):
         exit('Unable to read config file: %s' % conffile)
     conf = dict(c.items('dispersion'))
+
+    if options.dump_json:
+        conf['dump_json'] = 'yes'
+    if options.object_only:
+        conf['container_report'] = 'no'
+    if options.container_only:
+        conf['object_report'] = 'no'
+    if options.insecure:
+        conf['keystone_api_insecure'] = 'yes'
+    if options.partitions:
+        conf['partitions'] = 'yes'
+
+    output = generate_report(conf, options.policy_name)
+
+    if json_output:
+        print(json.dumps(output))
+
+
+def generate_report(conf, policy_name=None):
+    try:
+        # Delay importing so urllib3 will import monkey-patched modules
+        from swiftclient import get_auth
+    except ImportError:
+        from swift.common.internal_client import get_auth
+    global json_output
+    json_output = config_true_value(conf.get('dump_json', 'no'))
+    if policy_name is None:
+        policy = POLICIES.default
+    else:
+        policy = POLICIES.get_by_name(policy_name)
+        if policy is None:
+            exit('Unable to find policy: %s' % policy_name)
+    if not json_output:
+        print('Using storage policy: %s ' % policy.name)
+
     swift_dir = conf.get('swift_dir', '/etc/swift')
-    dispersion_coverage = int(conf.get('dispersion_coverage', 1))
     retries = int(conf.get('retries', 5))
     concurrency = int(conf.get('concurrency', 25))
-    if options.dump_json or config_true_value(conf.get('dump_json', 'no')):
-        json_output = True
-    container_report = config_true_value(conf.get('container_report', 'yes')) \
-        and not options.object_only
-    object_report = config_true_value(conf.get('object_report', 'yes')) \
-        and not options.container_only
+    endpoint_type = str(conf.get('endpoint_type', 'publicURL'))
+    region_name = str(conf.get('region_name', ''))
+    container_report = config_true_value(conf.get('container_report', 'yes'))
+    object_report = config_true_value(conf.get('object_report', 'yes'))
     if not (object_report or container_report):
         exit("Neither container or object report is set to run")
-    if options.debug:
-        debug = True
+    user_domain_name = str(conf.get('user_domain_name', ''))
+    project_domain_name = str(conf.get('project_domain_name', ''))
+    project_name = str(conf.get('project_name', ''))
+    insecure = config_true_value(conf.get('keystone_api_insecure', 'no'))
 
     coropool = GreenPool(size=concurrency)
 
+    os_options = {'endpoint_type': endpoint_type}
+    if user_domain_name:
+        os_options['user_domain_name'] = user_domain_name
+    if project_domain_name:
+        os_options['project_domain_name'] = project_domain_name
+    if project_name:
+        os_options['project_name'] = project_name
+    if region_name:
+        os_options['region_name'] = region_name
+
     url, token = get_auth(conf['auth_url'], conf['auth_user'],
                           conf['auth_key'],
-                          auth_version=conf.get('auth_version', '1.0'))
+                          auth_version=conf.get('auth_version', '1.0'),
+                          os_options=os_options,
+                          insecure=insecure)
     account = url.rsplit('/', 1)[1]
     connpool = Pool(max_size=concurrency)
-    connpool.create = lambda: Connection(
-        conf['auth_url'], conf['auth_user'], conf['auth_key'], retries=retries,
-        preauthurl=url, preauthtoken=token)
+    connpool.create = lambda: SimpleClient(
+        url=url, token=token, retries=retries)
 
     container_ring = Ring(swift_dir, ring_name='container')
-    object_ring = Ring(swift_dir, ring_name='object')
+    object_ring = Ring(swift_dir, ring_name=policy.ring_name)
 
     output = {}
     if container_report:
         output['container'] = container_dispersion_report(
             coropool, connpool, account, container_ring, retries,
-            options.partitions)
+            conf.get('partitions'), policy)
     if object_report:
         output['object'] = object_dispersion_report(
             coropool, connpool, account, object_ring, retries,
-            options.partitions)
-    if json_output:
-        print json.dumps(output)
+            conf.get('partitions'), policy)
+
+    return output
+
+
+if __name__ == '__main__':
+    main()
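
Both report functions above tally, per distinct partition, how many replicas answered the direct HEADs, then derive a found percentage plus per-missing-copy counts. A compact sketch of that aggregation, assuming the per-partition results have already been collected (`results_per_partition` is a made-up stand-in, not an object from the patch):

from collections import defaultdict

def summarize(results_per_partition):
    # results_per_partition: partition -> (copies_found, copies_expected),
    # a stand-in for the per-partition direct-HEAD tallies gathered above.
    copies_found = copies_expected = 0
    missing = defaultdict(int)  # number of missing copies -> partition count
    for found, expected in results_per_partition.values():
        copies_found += found
        copies_expected += expected
        missing[expected - found] += 1
    missing.pop(0, None)  # fully replicated partitions are not reported
    summary = {'pct_found': 100.0 * copies_found / copies_expected}
    for n_missing, num_parts in missing.items():
        summary['missing_%d' % n_missing] = num_parts
    return summary

# e.g. summarize({0: (3, 3), 1: (2, 3)}) -> {'pct_found': 83.33..., 'missing_1': 1}
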
diff --git a/swift/cli/drive_audit.py b/swift/cli/drive_audit.py
new file mode 100755
index 0000000000..fa8367895c
--- /dev/null
+++ b/swift/cli/drive_audit.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+# Copyright (c) 2010-2012 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import glob
+import locale
+import os
+import os.path
+import re
+import subprocess
+import sys
+
+
+from configparser import ConfigParser
+
+from swift.common.utils import backward, get_logger, dump_recon_cache, \
+    config_true_value
+
+
+def get_devices(device_dir, logger):
+    devices = []
+    majmin_devices = {}
+
+    # List /dev/block
+    # Using os.scandir on recent versions of python, else os.listdir
+    if 'scandir' in dir(os):
+        with os.scandir("/dev/block") as it:
+            for ent in it:
+                if ent.is_symlink():
+                    dev_name = os.path.basename(os.readlink(ent.path))
+                    majmin = os.path.basename(ent.path).split(':')
+                    majmin_devices[dev_name] = {'major': majmin[0],
+                                                'minor': majmin[1]}
+    else:
+        for ent in os.listdir("/dev/block"):
+            ent_path = os.path.join("/dev/block", ent)
+            if os.path.islink(ent_path):
+                dev_name = os.path.basename(os.readlink(ent_path))
+                majmin = os.path.basename(ent_path).split(':')
+                majmin_devices[dev_name] = {'major': majmin[0],
+                                            'minor': majmin[1]}
+
+    for line in open('/proc/mounts').readlines():
+        data = line.strip().split()
+        block_device = data[0]
+        mount_point = data[1]
+        if mount_point.startswith(device_dir):
+            device = {}
+            device['mount_point'] = mount_point
+            device['block_device'] = block_device
+            dev_name = os.path.basename(block_device)
+            if dev_name in majmin_devices:
+                # If symlink is in /dev/block
+                device['major'] = majmin_devices[dev_name]['major']
+                device['minor'] = majmin_devices[dev_name]['minor']
+            else:
+                # Else we try to stat block_device
+                try:
+                    device_num = os.stat(block_device).st_rdev
+                except OSError:
+                    # If we can't stat the device,
+                    # then something weird is going on
+                    logger.error(
+                        'Could not determine major:minor numbers for %s '
+                        '(mounted at %s)! Skipping...',
+                        block_device, mount_point)
+                    continue
+                device['major'] = str(os.major(device_num))
+                device['minor'] = str(os.minor(device_num))
+            devices.append(device)
+    for line in open('/proc/partitions').readlines()[2:]:
+        major, minor, blocks, kernel_device = line.strip().split()
+        device = [d for d in devices
+                  if d['major'] == major and d['minor'] == minor]
+        if device:
+            device[0]['kernel_device'] = kernel_device
+    return devices
+
+
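
When a device has no /dev/block symlink, get_devices above falls back to stat()ing the block device and deriving the major:minor pair from st_rdev. A small illustrative sketch of that fallback (the device path is hypothetical):

import os

# '/dev/sda1' is only an example path; nothing is printed if it is absent.
dev = '/dev/sda1'
if os.path.exists(dev):
    rdev = os.stat(dev).st_rdev
    print('%s -> %s:%s' % (dev, os.major(rdev), os.minor(rdev)))
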
+def get_errors(error_re, log_file_pattern, minutes, logger,
+               log_file_encoding):
+    # Assuming log rotation is being used, we need to examine
+    # recently rotated files in case the rotation occurred
+    # just before the script is being run - the data we are
+    # looking for may have rotated.
+    #
+    # The globbing used previously did not work with every distro's
+    # out-of-box logrotate and syslog setup, so the pattern was moved
+    # to the config where the desired globbing can be set.
+    log_files = glob.glob(log_file_pattern)
+    try:
+        log_files.sort(key=lambda f: os.stat(f).st_mtime, reverse=True)
+    except (IOError, OSError) as exc:
+        logger.error(exc)
+        print(exc)
+        sys.exit(1)
+
+    now_time = datetime.datetime.now()
+    end_time = now_time - datetime.timedelta(minutes=minutes)
+    # kern.log does not contain the year so we need to keep
+    # track of the year and month in case the year recently
+    # ticked over
+    year = now_time.year
+    prev_ent_month = now_time.strftime('%b')
+    errors = {}
+
+    reached_old_logs = False
+    for path in log_files:
+        try:
+            f = open(path, 'rb')
+        except IOError:
+            logger.error("Error: Unable to open " + path)
+            print("Unable to open " + path)
+            sys.exit(1)
+        for line in backward(f):
+            line = line.decode(log_file_encoding, 'surrogateescape')
+            if '[    0.000000]' in line \
+                or 'KERNEL supported cpus:' in line \
+                    or 'BIOS-provided physical RAM map:' in line:
+                # Ignore anything before the last boot
+                reached_old_logs = True
+                break
+            # Solves the problem with year change - kern.log does not
+            # keep track of the year.
+            log_time_ent = line.split()[:3]
+            if log_time_ent[0] == 'Dec' and prev_ent_month == 'Jan':
+                year -= 1
+            prev_ent_month = log_time_ent[0]
+            log_time_string = '%d %s' % (year, ' '.join(log_time_ent))
+            try:
+                log_time = datetime.datetime.strptime(
+                    log_time_string, '%Y %b %d %H:%M:%S')
+            except ValueError:
+                # Some versions use ISO timestamps instead
+                try:
+                    log_time = datetime.datetime.strptime(
+                        line[0:19], '%Y-%m-%dT%H:%M:%S')
+                except ValueError:
+                    continue
+            if log_time > end_time:
+                for err in error_re:
+                    for device in err.findall(line):
+                        errors[device] = errors.get(device, 0) + 1
+            else:
+                reached_old_logs = True
+                break
+        if reached_old_logs:
+            break
+    return errors
+
+
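
Because kern.log timestamps carry no year, get_errors above decrements the year whenever a backwards scan crosses from January into December. A tiny sketch of that parse with made-up values:

import datetime

# kern.log entries carry no year, e.g. "Dec 31 23:59:58 host kernel: ..."
now = datetime.datetime(2024, 1, 1, 0, 5, 0)   # pretend "now" is early January
year = now.year
prev_ent_month = now.strftime('%b')            # 'Jan'

log_time_ent = ['Dec', '31', '23:59:58']       # fields seen while scanning backwards
if log_time_ent[0] == 'Dec' and prev_ent_month == 'Jan':
    year -= 1                                  # the entry belongs to the previous year
log_time = datetime.datetime.strptime(
    '%d %s' % (year, ' '.join(log_time_ent)), '%Y %b %d %H:%M:%S')
print(log_time)                                # 2023-12-31 23:59:58
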
+def comment_fstab(mount_point):
+    with open('/etc/fstab', 'r') as fstab:
+        with open('/etc/fstab.new', 'w') as new_fstab:
+            for line in fstab:
+                parts = line.split()
+                if len(parts) > 2 \
+                    and parts[1] == mount_point \
+                        and not line.startswith('#'):
+                    new_fstab.write('#' + line)
+                else:
+                    new_fstab.write(line)
+    os.rename('/etc/fstab.new', '/etc/fstab')
+
+
+def main():
+    c = ConfigParser()
+    try:
+        conf_path = sys.argv[1]
+    except Exception:
+        print("Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1])
+        sys.exit(1)
+    if not c.read(conf_path):
+        print("Unable to read config file %s" % conf_path)
+        sys.exit(1)
+    conf = dict(c.items('drive-audit'))
+    device_dir = conf.get('device_dir', '/srv/node')
+    minutes = int(conf.get('minutes', 60))
+    error_limit = int(conf.get('error_limit', 1))
+    recon_cache_path = conf.get('recon_cache_path', "/var/cache/swift")
+    log_file_pattern = conf.get('log_file_pattern',
+                                '/var/log/kern.*[!.][!g][!z]')
+    log_file_encoding = conf.get('log_file_encoding', 'auto')
+    if log_file_encoding == 'auto':
+        log_file_encoding = locale.getpreferredencoding()
+    log_to_console = config_true_value(conf.get('log_to_console', False))
+    error_re = []
+    for conf_key in conf:
+        if conf_key.startswith('regex_pattern_'):
+            error_pattern = conf[conf_key]
+            try:
+                r = re.compile(error_pattern)
+            except re.error:
+                sys.exit('Error: unable to compile regex pattern "%s"' %
+                         error_pattern)
+            error_re.append(r)
+    if not error_re:
+        error_re = [
+            re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
+            re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
+        ]
+    conf['log_name'] = conf.get('log_name', 'drive-audit')
+    logger = get_logger(conf, log_to_console=log_to_console,
+                        log_route='drive-audit')
+    devices = get_devices(device_dir, logger)
+    logger.debug("Devices found: %s" % str(devices))
+    if not devices:
+        logger.error("Error: No devices found!")
+    recon_errors = {}
+    total_errors = 0
+    for device in devices:
+        recon_errors[device['mount_point']] = 0
+    errors = get_errors(error_re, log_file_pattern, minutes, logger,
+                        log_file_encoding)
+    logger.debug("Errors found: %s" % str(errors))
+    unmounts = 0
+    for kernel_device, count in errors.items():
+        if count >= error_limit:
+            device = \
+                [d for d in devices if d['kernel_device'] == kernel_device]
+            if device:
+                mount_point = device[0]['mount_point']
+                if mount_point.startswith(device_dir):
+                    if config_true_value(conf.get('unmount_failed_device',
+                                                  True)):
+                        logger.info("Unmounting %s with %d errors" %
+                                    (mount_point, count))
+                        subprocess.call(['umount', '-fl', mount_point])
+                        logger.info("Commenting out %s from /etc/fstab" %
+                                    (mount_point))
+                        comment_fstab(mount_point)
+                        unmounts += 1
+                    else:
+                        logger.info("Detected %s with %d errors "
+                                    "(Device not unmounted)" %
+                                    (mount_point, count))
+                    recon_errors[mount_point] = count
+                    total_errors += count
+    recon_file = recon_cache_path + "/drive.recon"
+    dump_recon_cache(recon_errors, recon_file, logger)
+    dump_recon_cache({'drive_audit_errors': total_errors}, recon_file, logger,
+                     set_owner=conf.get("user", "swift"))
+
+    if unmounts == 0:
+        logger.info("No drives were unmounted")
+    elif os.path.isdir("/run/systemd/system"):
+        logger.debug("fstab updated, calling systemctl daemon-reload")
+        subprocess.call(["/usr/bin/systemctl", "daemon-reload"])
+
+
+if __name__ == '__main__':
+    main()
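
Putting the default regexes to work, here is a short sketch of how drive-audit counts errors per device from kernel log lines; the patterns are copied from main() above, while the sample lines are invented (real kern.log formats vary):

import re

error_re = [
    re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
    re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
]
lines = [
    'Mar  3 12:00:01 host kernel: I/O error, dev sdb, sector 1234',
    'Mar  3 12:00:02 host kernel: sdb1: read error on sector 5678',
]
errors = {}
for line in lines:
    for err in error_re:
        for device in err.findall(line):
            errors[device] = errors.get(device, 0) + 1
print(errors)   # {'sdb': 1, 'sdb1': 1}
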
diff --git a/swift/cli/form_signature.py b/swift/cli/form_signature.py
new file mode 100644
index 0000000000..7a8bff7250
--- /dev/null
+++ b/swift/cli/form_signature.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2010-2012 OpenStack Foundation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Script for generating a form signature for use with FormPost middleware.
+"""
+import hmac
+from hashlib import sha1
+from os.path import basename
+from time import time
+
+
+def main(argv):
+    if len(argv) != 7:
+        prog = basename(argv[0])
+        print('Syntax: %s <path> <redirect> <max_file_size> '
+              '<max_file_count> <seconds> <key>' % prog)
+        print()
+        print('Where:')
+        print('  <path>            The prefix to use for form uploaded')
+        print('                    objects. For example:')
+        print('                    /v1/account/container/object_prefix_ would')
+        print('                    ensure all form uploads have that path')
+        print('                    prepended to the browser-given file name.')
+        print('  <redirect>        The URL to redirect the browser to after')
+        print('                    the uploads have completed.')
+        print('  <max_file_size>   The maximum file size per file uploaded.')
+        print('  <max_file_count>  The maximum number of uploaded files')
+        print('                    allowed.')
+        print('  <seconds>         The number of seconds from now to allow')
+        print('                    the form post to begin.')
+        print('  <key>             The X-Account-Meta-Temp-URL-Key for the')
+        print('                    account.')
+        print()
+        print('Example output:')
+        print('    Expires: 1323842228')
+        print('  Signature: 18de97e47345a82c4dbfb3b06a640dbb')
+        print()
+        print('Sample form:')
+        print()
+        print('NOTE: the <form> tag\'s "action" attribute does not contain '
+              'the Swift cluster\'s hostname.')
+        print('You should manually add it before using the form.')
+        print()
+        print('')
+        print(' ')
+        print(' ... more HTML ...')
+        print(' ')
+        print('')
+        return 1
+    path, redirect, max_file_size, max_file_count, seconds, key = argv[1:]
+    try:
+        max_file_size = int(max_file_size)
+    except ValueError:
+        max_file_size = -1
+    if max_file_size < 0:
+        print('Please use a value greater than or equal to 0.')
+        return 1
+    try:
+        max_file_count = int(max_file_count)
+    except ValueError:
+        max_file_count = 0
+    if max_file_count < 1:
+        print('Please use a positive value.')
+        return 1
+    try:
+        expires = int(time() + int(seconds))
+    except ValueError:
+        expires = 0
+    if expires < 1:
+        print('Please use a positive value.')
+        return 1
+    parts = path.split('/', 4)
+    # Must be four parts, ['', 'v1', 'a', 'c'], must be a v1 request, have
+    # account and container values, and optionally have an object prefix.
+    if len(parts) < 4 or parts[0] or parts[1] != 'v1' or not parts[2] or \
+            not parts[3]:
+        print('<path> must point to a container at least.')
+        print('For example: /v1/account/container')
+        print('         Or: /v1/account/container/object_prefix')
+        return 1
+    data = '%s\n%s\n%s\n%s\n%s' % (path, redirect, max_file_size,
+                                   max_file_count, expires)
+    data = data.encode('utf8')
+    key = key if isinstance(key, bytes) else \
+        key.encode('utf8')
+    sig = hmac.new(key, data,
+                   sha1).hexdigest()
+    print('  Expires:', expires)
+    print('Signature:', sig)
+    print('')
+
+    print('Sample form:\n')
+
+    print('NOTE: the 
tag\'s "action" attribute does not ' + 'contain the Swift cluster\'s hostname.') + print('You should manually add it before using the form.\n') + + print('' + % path) + if redirect: + print(' ' + % redirect) + print(' ' + % max_file_size) + print(' ' + % max_file_count) + print(' ' % expires) + print(' ' % sig) + print(' ' + % max_file_count) + print(' ') + print(' ') + for i in range(max_file_count): + print(' ' % i) + print('
') + print(' ') + print('
') + return 0 diff --git a/swift/cli/get_nodes.py b/swift/cli/get_nodes.py new file mode 100755 index 0000000000..7586cd3b46 --- /dev/null +++ b/swift/cli/get_nodes.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# Copyright (c) 2010-2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from optparse import OptionParser +from os.path import basename + +from swift.common.ring import Ring +from swift.common.storage_policy import reload_storage_policies +from swift.common.utils import set_swift_dir +from swift.cli.info import (parse_get_node_args, print_item_locations, + InfoSystemExit) + + +def main(): + + usage = ''' + Shows the nodes responsible for the item specified. + Usage: %prog [-a] [ []] + Or: %prog [-a] -p partition + Or: %prog [-a] -P policy_name [ []] + Or: %prog [-a] -P policy_name -p partition + Note: account, container, object can also be a single arg separated by / + Example: + $ %prog -a /etc/swift/account.ring.gz MyAccount + Partition 5743883 + Hash 96ae332a60b58910784e4417a03e1ad0 + 10.1.1.7:8000 sdd1 + 10.1.9.2:8000 sdb1 + 10.1.5.5:8000 sdf1 + 10.1.5.9:8000 sdt1 # [Handoff] + ''' + parser = OptionParser(usage) + parser.add_option('-a', '--all', action='store_true', + help='Show all handoff nodes') + parser.add_option('-p', '--partition', metavar='PARTITION', + help='Show nodes for a given partition') + parser.add_option('-P', '--policy-name', dest='policy_name', + help='Specify which policy to use') + parser.add_option('-d', '--swift-dir', default='/etc/swift', + dest='swift_dir', help='Path to swift directory') + parser.add_option('-Q', '--quoted', action='store_true', + help='Assume swift paths are quoted') + options, args = parser.parse_args() + + if set_swift_dir(options.swift_dir): + reload_storage_policies() + + try: + ring_path, args = parse_get_node_args(options, args) + except InfoSystemExit as e: + parser.print_help() + sys.exit('ERROR: %s' % e) + + ring = ring_name = None + if ring_path: + ring_name = basename(ring_path)[:-len('.ring.gz')] + ring = Ring(ring_path) + + try: + print_item_locations(ring, ring_name, *args, **vars(options)) + except InfoSystemExit: + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/swift/cli/info.py b/swift/cli/info.py new file mode 100644 index 0000000000..ffaaee253a --- /dev/null +++ b/swift/cli/info.py @@ -0,0 +1,813 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ + +import codecs +import itertools +import json +from optparse import OptionParser +import os +import sqlite3 +import sys +from collections import defaultdict + +import urllib + +from swift.common.exceptions import LockTimeout +from swift.common.utils import hash_path, storage_directory, \ + Timestamp, is_valid_ipv6 +from swift.common.ring import Ring +from swift.common.request_helpers import is_sys_meta, is_user_meta, \ + strip_sys_meta_prefix, strip_user_meta_prefix, \ + is_object_transient_sysmeta, strip_object_transient_sysmeta_prefix +from swift.account.backend import AccountBroker, DATADIR as ABDATADIR +from swift.container.backend import ContainerBroker, DATADIR as CBDATADIR +from swift.obj.diskfile import get_data_dir, read_metadata, DATADIR_BASE, \ + extract_policy +from swift.common.storage_policy import POLICIES, reload_storage_policies +from swift.common.swob import wsgi_to_str +from swift.common.middleware.crypto.crypto_utils import load_crypto_meta +from swift.common.utils import md5, set_swift_dir + + +class InfoSystemExit(Exception): + """ + Indicates to the caller that a sys.exit(1) should be performed. + """ + pass + + +def parse_get_node_args(options, args): + """ + Parse the get_nodes commandline args + + :returns: a tuple, (ring_path, args) + """ + ring_path = None + + if options.policy_name: + if POLICIES.get_by_name(options.policy_name) is None: + raise InfoSystemExit('No policy named %r' % options.policy_name) + elif args and args[0].endswith('.ring.gz'): + if os.path.exists(args[0]): + ring_path = args.pop(0) + else: + raise InfoSystemExit('Ring file does not exist') + + if options.quoted: + args = [urllib.parse.unquote(arg) for arg in args] + if len(args) == 1: + args = args[0].strip('/').split('/', 2) + + if not ring_path and not options.policy_name: + raise InfoSystemExit('Need to specify policy_name or ') + + if not (args or options.partition): + raise InfoSystemExit('No target specified') + + if len(args) > 3: + raise InfoSystemExit('Invalid arguments') + + return ring_path, args + + +def curl_head_command(ip, port, device, part, target, policy_index): + """ + Provide a string that is a well formatted curl command to HEAD an object + on a storage node. + + :param ip: the ip of the node + :param port: the port of the node + :param device: the device of the node + :param target: the path of the target resource + :param policy_index: the policy_index of the target resource (can be None) + + :returns: a string, a well formatted curl command + """ + if is_valid_ipv6(ip): + formatted_ip = '[%s]' % ip + else: + formatted_ip = ip + + cmd = 'curl -g -I -XHEAD "http://%s:%s/%s/%s/%s"' % ( + formatted_ip, port, device, part, urllib.parse.quote(target)) + if policy_index is not None: + cmd += ' -H "%s: %s"' % ('X-Backend-Storage-Policy-Index', + policy_index) + cmd += ' --path-as-is' + return cmd + + +def print_ring_locations(ring, datadir, account, container=None, obj=None, + tpart=None, all_nodes=False, policy_index=None): + """ + print out ring locations of specified type + + :param ring: ring instance + :param datadir: name of directory where things are stored. Usually one of + "accounts", "containers", "objects", or "objects-N". + :param account: account name + :param container: container name + :param obj: object name + :param tpart: target partition in ring + :param all_nodes: include all handoff nodes. If false, only the N primary + nodes and first N handoffs will be printed. 
+ :param policy_index: include policy_index in curl headers + """ + if not ring: + raise ValueError("No ring specified") + if not datadir: + raise ValueError("No datadir specified") + if tpart is None and not account: + raise ValueError("No partition or account/container/object specified") + if not account and (container or obj): + raise ValueError("Container/object specified without account") + if obj and not container: + raise ValueError('Object specified without container') + + if obj: + target = '%s/%s/%s' % (account, container, obj) + elif container: + target = '%s/%s' % (account, container) + else: + target = '%s' % (account) + + if tpart: + part = int(tpart) + else: + part = ring.get_part(account, container, obj) + + primary_nodes = ring.get_part_nodes(part) + handoff_nodes = ring.get_more_nodes(part) + if not all_nodes: + handoff_nodes = itertools.islice(handoff_nodes, len(primary_nodes)) + handoff_nodes = list(handoff_nodes) + + if account and not tpart: + path_hash = hash_path(account, container, obj) + else: + path_hash = None + print('Partition\t%s' % part) + print('Hash \t%s\n' % path_hash) + + for node in primary_nodes: + print('Server:Port Device\t%s:%s %s' % (node['ip'], node['port'], + node['device'])) + for node in handoff_nodes: + print('Server:Port Device\t%s:%s %s\t [Handoff]' % ( + node['ip'], node['port'], node['device'])) + + print("\n") + + for node in primary_nodes: + cmd = curl_head_command(node['ip'], node['port'], node['device'], + part, target, policy_index) + print(cmd) + for node in handoff_nodes: + cmd = curl_head_command(node['ip'], node['port'], node['device'], + part, target, policy_index) + cmd += ' # [Handoff]' + print(cmd) + + print("\n\nUse your own device location of servers:") + print("such as \"export DEVICE=/srv/node\"") + if path_hash: + for node in primary_nodes: + print('ssh %s "ls -lah ${DEVICE:-/srv/node*}/%s/%s"' % + (node['ip'], node['device'], + storage_directory(datadir, part, path_hash))) + for node in handoff_nodes: + print('ssh %s "ls -lah ${DEVICE:-/srv/node*}/%s/%s" # [Handoff]' % + (node['ip'], node['device'], + storage_directory(datadir, part, path_hash))) + else: + for node in primary_nodes: + print('ssh %s "ls -lah ${DEVICE:-/srv/node*}/%s/%s/%d"' % + (node['ip'], node['device'], datadir, part)) + for node in handoff_nodes: + print('ssh %s "ls -lah ${DEVICE:-/srv/node*}/%s/%s/%d"' + ' # [Handoff]' % + (node['ip'], node['device'], datadir, part)) + + print('\nnote: `/srv/node*` is used as default value of `devices`, the ' + 'real value is set in the config file on each storage node.') + + +def get_max_len_sync_item(syncs, item, title): + def map_func(element): + return str(element[item]) + return max(list(map(len, map(map_func, syncs))) + [len(title)]) + + +def print_db_syncs(incoming, syncs): + max_sync_point_len = get_max_len_sync_item(syncs, 'sync_point', + "Sync Point") + max_remote_len = get_max_len_sync_item(syncs, 'remote_id', "Remote ID") + print('%s Syncs:' % ('Incoming' if incoming else 'Outgoing')) + print(' %s\t%s\t%s' % ("Sync Point".ljust(max_sync_point_len), + "Remote ID".ljust(max_remote_len), + "Updated At")) + for sync in syncs: + print(' %s\t%s\t%s (%s)' % ( + str(sync['sync_point']).ljust(max_sync_point_len), + sync['remote_id'].ljust(max_remote_len), + Timestamp(sync['updated_at']).isoformat, + sync['updated_at'])) + + +def print_db_info_metadata(db_type, info, metadata, drop_prefixes=False, + verbose=False): + """ + print out data base info/metadata based on its type + + :param db_type: database type, 
account or container + :param info: dict of data base info + :param metadata: dict of data base metadata + :param drop_prefixes: if True, strip "X-Account-Meta-", + "X-Container-Meta-", "X-Account-Sysmeta-", and + "X-Container-Sysmeta-" when displaying + User Metadata and System Metadata dicts + """ + if info is None: + raise ValueError('DB info is None') + + if db_type not in ['container', 'account']: + raise ValueError('Wrong DB type') + + try: + account = info['account'] + container = None + + if db_type == 'container': + container = info['container'] + path = '/%s/%s' % (account, container) + else: + path = '/%s' % account + + print('Path: %s' % path) + print(' Account: %s' % account) + + if db_type == 'container': + print(' Container: %s' % container) + + print(' Deleted: %s' % info['is_deleted']) + path_hash = hash_path(account, container) + if db_type == 'container': + print(' Container Hash: %s' % path_hash) + else: + print(' Account Hash: %s' % path_hash) + + print('Metadata:') + print(' Created at: %s (%s)' % + (Timestamp(info['created_at']).isoformat, + info['created_at'])) + print(' Put Timestamp: %s (%s)' % + (Timestamp(info['put_timestamp']).isoformat, + info['put_timestamp'])) + print(' Delete Timestamp: %s (%s)' % + (Timestamp(info['delete_timestamp']).isoformat, + info['delete_timestamp'])) + print(' Status Timestamp: %s (%s)' % + (Timestamp(info['status_changed_at']).isoformat, + info['status_changed_at'])) + if db_type == 'account': + print(' Container Count: %s' % info['container_count']) + print(' Object Count: %s' % info['object_count']) + print(' Bytes Used: %s' % info['bytes_used']) + if db_type == 'container': + try: + policy_name = POLICIES[info['storage_policy_index']].name + except KeyError: + policy_name = 'Unknown' + print(' Storage Policy: %s (%s)' % ( + policy_name, info['storage_policy_index'])) + print(' Reported Put Timestamp: %s (%s)' % + (Timestamp(info['reported_put_timestamp']).isoformat, + info['reported_put_timestamp'])) + print(' Reported Delete Timestamp: %s (%s)' % + (Timestamp(info['reported_delete_timestamp']).isoformat, + info['reported_delete_timestamp'])) + print(' Reported Object Count: %s' % + info['reported_object_count']) + print(' Reported Bytes Used: %s' % info['reported_bytes_used']) + print(' Chexor: %s' % info['hash']) + print(' UUID: %s' % info['id']) + except KeyError as e: + raise ValueError('Info is incomplete: %s' % e) + + meta_prefix = 'x_' + db_type + '_' + for key, value in info.items(): + if key.lower().startswith(meta_prefix): + title = key.replace('_', '-').title() + print(' %s: %s' % (title, value)) + user_metadata = {} + sys_metadata = {} + for key, (value, timestamp) in metadata.items(): + if is_user_meta(db_type, key): + if drop_prefixes: + key = strip_user_meta_prefix(db_type, key) + user_metadata[key] = value + elif is_sys_meta(db_type, key): + if drop_prefixes: + key = strip_sys_meta_prefix(db_type, key) + sys_metadata[key] = value + else: + title = key.replace('_', '-').title() + print(' %s: %s' % (title, value)) + if sys_metadata: + print(' System Metadata:') + for key, value in sys_metadata.items(): + print(' %s: %s' % (key, value)) + else: + print('No system metadata found in db file') + + if user_metadata: + print(' User Metadata:') + for key, value in user_metadata.items(): + print(' %s: %s' % (key, value)) + else: + print('No user metadata found in db file') + + if db_type == 'container': + print('Sharding Metadata:') + shard_type = 'root' if info['is_root'] else 'shard' + print(' Type: %s' % shard_type) + 
print(' State: %s' % info['db_state']) + if info.get('shard_ranges'): + num_shards = len(info['shard_ranges']) + print('Shard Ranges (%d):' % num_shards) + count_by_state = defaultdict(int) + for srange in info['shard_ranges']: + count_by_state[(srange.state, srange.state_text)] += 1 + print(' States:') + for key_state, count in sorted(count_by_state.items()): + key, state = key_state + print(' %9s: %s' % (state, count)) + if verbose: + for srange in info['shard_ranges']: + srange = dict(srange, state_text=srange.state_text) + print(' Name: %(name)s' % srange) + print(' lower: %(lower)r, upper: %(upper)r' % srange) + print(' Object Count: %(object_count)d, Bytes Used: ' + '%(bytes_used)d, State: %(state_text)s (%(state)d)' + % srange) + print(' Created at: %s (%s)' + % (Timestamp(srange['timestamp']).isoformat, + srange['timestamp'])) + print(' Meta Timestamp: %s (%s)' + % (Timestamp(srange['meta_timestamp']).isoformat, + srange['meta_timestamp'])) + else: + print('(Use -v/--verbose to show more Shard Ranges details)') + + +def print_obj_metadata(metadata, drop_prefixes=False): + """ + Print out basic info and metadata from object, as returned from + :func:`swift.obj.diskfile.read_metadata`. + + Metadata should include the keys: name, Content-Type, and + X-Timestamp. + + Additional metadata is displayed unmodified. + + :param metadata: dict of object metadata + :param drop_prefixes: if True, strip "X-Object-Meta-", "X-Object-Sysmeta-", + and "X-Object-Transient-Sysmeta-" when displaying + User Metadata, System Metadata, and Transient + System Metadata entries + + :raises ValueError: + """ + user_metadata = {} + sys_metadata = {} + transient_sys_metadata = {} + other_metadata = {} + + if not metadata: + raise ValueError('Metadata is None') + path = metadata.pop('name', '') + content_type = metadata.pop('Content-Type', '') + ts = Timestamp(metadata.pop('X-Timestamp', Timestamp.zero())) + account = container = obj = obj_hash = None + if path: + try: + account, container, obj = path.split('/', 3)[1:] + except ValueError: + raise ValueError('Path is invalid for object %r' % path) + else: + obj_hash = hash_path(account, container, obj) + print('Path: %s' % path) + print(' Account: %s' % account) + print(' Container: %s' % container) + print(' Object: %s' % obj) + print(' Object hash: %s' % obj_hash) + else: + print('Path: Not found in metadata') + if content_type: + print('Content-Type: %s' % content_type) + else: + print('Content-Type: Not found in metadata') + if ts: + print('Timestamp: %s (%s)' % (ts.isoformat, ts.internal)) + else: + print('Timestamp: Not found in metadata') + + for key, value in metadata.items(): + if is_user_meta('Object', key): + if drop_prefixes: + key = strip_user_meta_prefix('Object', key) + user_metadata[key] = value + elif is_sys_meta('Object', key): + if drop_prefixes: + key = strip_sys_meta_prefix('Object', key) + sys_metadata[key] = value + elif is_object_transient_sysmeta(key): + if drop_prefixes: + key = strip_object_transient_sysmeta_prefix(key) + transient_sys_metadata[key] = value + else: + other_metadata[key] = value + + def print_metadata(title, items): + print(title) + if items: + for key, value in sorted(items.items()): + print(' %s: %s' % (key, value)) + else: + print(' No metadata found') + + print_metadata('System Metadata:', sys_metadata) + print_metadata('Transient System Metadata:', transient_sys_metadata) + print_metadata('User Metadata:', user_metadata) + print_metadata('Other Metadata:', other_metadata) + for label, meta in [ + ('Data crypto 
details', + sys_metadata.get('X-Object-Sysmeta-Crypto-Body-Meta')), + ('Metadata crypto details', + transient_sys_metadata.get('X-Object-Transient-Sysmeta-Crypto-Meta')), + ]: + if meta is None: + continue + print('%s: %s' % ( + label, + json.dumps(load_crypto_meta(meta, b64decode=False), indent=2, + sort_keys=True, separators=(',', ': ')))) + + +def print_info(db_type, db_file, swift_dir='/etc/swift', stale_reads_ok=False, + drop_prefixes=False, verbose=False, sync=False): + if db_type not in ('account', 'container'): + print("Unrecognized DB type: internal error") + raise InfoSystemExit() + if not os.path.exists(db_file) or not db_file.endswith('.db'): + print("DB file doesn't exist") + raise InfoSystemExit() + if not db_file.startswith(('/', './')): + db_file = './' + db_file # don't break if the bare db file is given + if db_type == 'account': + broker = AccountBroker(db_file, stale_reads_ok=stale_reads_ok) + datadir = ABDATADIR + else: + broker = ContainerBroker(db_file, stale_reads_ok=stale_reads_ok) + datadir = CBDATADIR + try: + info = broker.get_info() + except sqlite3.OperationalError as err: + if 'no such table' in str(err): + print("Does not appear to be a DB of type \"%s\": %s" + % (db_type, db_file)) + raise InfoSystemExit() + raise + account = info['account'] + container = None + info['is_deleted'] = broker.is_deleted() + if db_type == 'container': + container = info['container'] + info['is_root'] = broker.is_root_container() + sranges = broker.get_shard_ranges() + if sranges: + info['shard_ranges'] = sranges + print_db_info_metadata( + db_type, info, broker.metadata, drop_prefixes, verbose) + if sync: + # Print incoming / outgoing sync tables. + for incoming in (True, False): + print_db_syncs(incoming, broker.get_syncs(incoming, + include_timestamp=True)) + try: + ring = Ring(swift_dir, ring_name=db_type) + except Exception: + ring = None + else: + print_ring_locations(ring, datadir, account, container) + + +def print_obj(datafile, check_etag=True, swift_dir='/etc/swift', + policy_name='', drop_prefixes=False): + """ + Display information about an object read from the datafile. + Optionally verify the datafile content matches the ETag metadata. + + :param datafile: path on disk to object file + :param check_etag: boolean, will read datafile content and verify + computed checksum matches value stored in + metadata. 
+ :param swift_dir: the path on disk to rings + :param policy_name: optionally the name to use when finding the ring + :param drop_prefixes: if True, strip "X-Object-Meta-", "X-Object-Sysmeta-", + and "X-Object-Transient-Sysmeta-" when displaying + User Metadata, System Metadata, and Transient + System Metadata entries + """ + if not os.path.exists(datafile): + print("Data file doesn't exist") + raise InfoSystemExit() + if not datafile.startswith(('/', './')): + datafile = './' + datafile + + policy_index = None + ring = None + datadir = DATADIR_BASE + + # try to extract policy index from datafile disk path + fullpath = os.path.abspath(datafile) + policy_index = int(extract_policy(fullpath) or POLICIES.legacy) + + try: + if policy_index: + datadir += '-' + str(policy_index) + ring = Ring(swift_dir, ring_name='object-' + str(policy_index)) + elif policy_index == 0: + ring = Ring(swift_dir, ring_name='object') + except IOError: + # no such ring + pass + + if policy_name: + policy = POLICIES.get_by_name(policy_name) + if policy: + policy_index_for_name = policy.idx + if (policy_index is not None and + policy_index_for_name is not None and + policy_index != policy_index_for_name): + print('WARNING: Ring does not match policy!') + print('Double check your policy name!') + if not ring and policy_index_for_name: + ring = POLICIES.get_object_ring(policy_index_for_name, + swift_dir) + datadir = get_data_dir(policy_index_for_name) + + with open(datafile, 'rb') as fp: + try: + metadata = read_metadata(fp) + except EOFError: + print("Invalid metadata") + raise InfoSystemExit() + metadata = {wsgi_to_str(k): v if k == 'name' else wsgi_to_str(v) + for k, v in metadata.items()} + + etag = metadata.pop('ETag', '') + length = metadata.pop('Content-Length', '') + path = metadata.get('name', '') + print_obj_metadata(metadata, drop_prefixes) + + # Optional integrity check; it's useful, but slow. + file_len = None + if check_etag: + h = md5(usedforsecurity=False) + file_len = 0 + while True: + data = fp.read(64 * 1024) + if not data: + break + h.update(data) + file_len += len(data) + h = h.hexdigest() + if etag: + if h == etag: + print('ETag: %s (valid)' % etag) + else: + print("ETag: %s doesn't match file hash of %s!" % + (etag, h)) + else: + print('ETag: Not found in metadata') + else: + print('ETag: %s (not checked)' % etag) + file_len = os.fstat(fp.fileno()).st_size + + if length: + if file_len == int(length): + print('Content-Length: %s (valid)' % length) + else: + print("Content-Length: %s doesn't match file length of %s" + % (length, file_len)) + else: + print('Content-Length: Not found in metadata') + + account, container, obj = path.split('/', 3)[1:] + if ring: + print_ring_locations(ring, datadir, account, container, obj, + policy_index=policy_index) + + +def print_item_locations(ring, ring_name=None, account=None, container=None, + obj=None, **kwargs): + """ + Display placement information for an item based on ring lookup. + + If a ring is provided it always takes precedence, but warnings will be + emitted if it doesn't match other optional arguments like the policy_name + or ring_name. + + If no ring is provided the ring_name and/or policy_name will be used to + lookup the ring. 
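+
+    For example, ``print_item_locations(None, account='AUTH_test',
+    container='c1')`` would load the container ring from ``swift_dir`` and
+    print the container DB placement for that account/container path (the
+    account and container names here are only illustrative).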
+ + :param ring: a ring instance + :param ring_name: server type, or storage policy ring name if object ring + :param account: account name + :param container: container name + :param obj: object name + :param partition: part number for non path lookups + :param policy_name: name of storage policy to use to lookup the ring + :param all_nodes: include all handoff nodes. If false, only the N primary + nodes and first N handoffs will be printed. + """ + + policy_name = kwargs.get('policy_name', None) + part = kwargs.get('partition', None) + all_nodes = kwargs.get('all', False) + swift_dir = kwargs.get('swift_dir', '/etc/swift') + + if ring and policy_name: + policy = POLICIES.get_by_name(policy_name) + if policy: + if ring_name != policy.ring_name: + print('WARNING: mismatch between ring and policy name!') + else: + print('WARNING: Policy %s is not valid' % policy_name) + + policy_index = None + if ring is None and (obj or part): + if not policy_name: + print('Need a ring or policy') + raise InfoSystemExit() + policy = POLICIES.get_by_name(policy_name) + if not policy: + print('No policy named %r' % policy_name) + raise InfoSystemExit() + policy_index = int(policy) + ring = POLICIES.get_object_ring(policy_index, swift_dir) + ring_name = (POLICIES.get_by_name(policy_name)).ring_name + + if (container or obj) and not account: + print('No account specified') + raise InfoSystemExit() + + if obj and not container: + print('No container specified') + raise InfoSystemExit() + + if not account and not part: + print('No target specified') + raise InfoSystemExit() + + loc = '' + if part and ring_name: + if '-' in ring_name and ring_name.startswith('object'): + loc = 'objects-' + ring_name.split('-', 1)[1] + else: + loc = ring_name + 's' + if account and container and obj: + loc = 'objects' + if '-' in ring_name and ring_name.startswith('object'): + policy_index = int(ring_name.rsplit('-', 1)[1]) + loc = 'objects-%d' % policy_index + if account and container and not obj: + loc = 'containers' + if not any([ring, ring_name]): + ring = Ring(swift_dir, ring_name='container') + else: + if ring_name != 'container': + print('WARNING: account/container specified ' + + 'but ring not named "container"') + if account and not container and not obj: + loc = 'accounts' + if not any([ring, ring_name]): + ring = Ring(swift_dir, ring_name='account') + else: + if ring_name != 'account': + print('WARNING: account specified ' + + 'but ring not named "account"') + + if account: + print('\nAccount \t%s' % urllib.parse.quote(account)) + if container: + print('Container\t%s' % urllib.parse.quote(container)) + if obj: + print('Object \t%s\n\n' % urllib.parse.quote(obj)) + print_ring_locations(ring, loc, account, container, obj, part, all_nodes, + policy_index=policy_index) + + +def obj_main(): + # Make stdout able to write escaped bytes + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout.detach(), errors='surrogateescape') + + parser = OptionParser('%prog [options] OBJECT_FILE') + parser.add_option( + '-n', '--no-check-etag', default=True, + action="store_false", dest="check_etag", + help="Don't verify file contents against stored etag") + parser.add_option( + '-d', '--swift-dir', default='/etc/swift', dest='swift_dir', + help="Pass location of swift directory") + parser.add_option( + '--drop-prefixes', default=False, action="store_true", + help="When outputting metadata, drop the per-section common prefixes") + parser.add_option( + '-P', '--policy-name', dest='policy_name', + help="Specify storage policy name") + + options, 
args = parser.parse_args() + + if len(args) != 1: + sys.exit(parser.print_help()) + + if set_swift_dir(options.swift_dir): + reload_storage_policies() + + try: + print_obj(*args, **vars(options)) + except InfoSystemExit: + sys.exit(1) + + +def run_print_info(db_type, args, opts): + try: + print_info(db_type, *args, **opts) + except InfoSystemExit: + sys.exit(1) + except (sqlite3.OperationalError, LockTimeout) as e: + if not opts.get('stale_reads_ok'): + opts['stale_reads_ok'] = True + print('WARNING: Possibly Stale Data') + run_print_info(db_type, args, opts) + sys.exit(2) + else: + print('%s info failed: %s' % (db_type.title(), e)) + sys.exit(1) + + +def container_main(): + parser = OptionParser('%prog [options] CONTAINER_DB_FILE') + parser.add_option( + '-d', '--swift-dir', default='/etc/swift', + help="Pass location of swift directory") + parser.add_option( + '--drop-prefixes', default=False, action="store_true", + help="When outputting metadata, drop the per-section common prefixes") + parser.add_option( + '-v', '--verbose', default=False, action="store_true", + help="Show all shard ranges. By default, only the number of shard " + "ranges is displayed if there are many shards.") + parser.add_option( + '--sync', '-s', default=False, action="store_true", + help="Output the contents of the incoming/outging sync tables") + + options, args = parser.parse_args() + + if len(args) != 1: + sys.exit(parser.print_help()) + + run_print_info('container', args, vars(options)) + + +def account_main(): + parser = OptionParser('%prog [options] ACCOUNT_DB_FILE') + parser.add_option( + '-d', '--swift-dir', default='/etc/swift', + help="Pass location of swift directory") + parser.add_option( + '--drop-prefixes', default=False, action="store_true", + help="When outputting metadata, drop the per-section common prefixes") + parser.add_option( + '--sync', '-s', default=False, action="store_true", + help="Output the contents of the incoming/outging sync tables") + + options, args = parser.parse_args() + + if len(args) != 1: + sys.exit(parser.print_help()) + + run_print_info('account', args, vars(options)) diff --git a/swift/cli/manage_shard_ranges.py b/swift/cli/manage_shard_ranges.py new file mode 100644 index 0000000000..8bfe01a9ae --- /dev/null +++ b/swift/cli/manage_shard_ranges.py @@ -0,0 +1,1215 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +""" +The ``swift-manage-shard-ranges`` tool provides commands for initiating +sharding of a container. ``swift-manage-shard-ranges`` operates directly on a +container database file. + +.. note:: + + ``swift-manage-shard-ranges`` must only be used on one replica of a + container database to avoid inconsistent results. The modifications made by + ``swift-manage-shard-ranges`` will be automatically copied to other + replicas of the container database via normal replication processes. + +There are three steps in the process of initiating sharding, each of which may +be performed in isolation or, as shown below, using a single command. + +#. 
The ``find`` sub-command scans the container database to identify how many + shard containers will be required and which objects they will manage. Each + shard container manages a range of the object namespace defined by a + ``lower`` and ``upper`` bound. The maximum number of objects to be allocated + to each shard container is specified on the command line. For example:: + + $ swift-manage-shard-ranges find 500000 + Loaded db broker for AUTH_test/c1. + [ + { + "index": 0, + "lower": "", + "object_count": 500000, + "upper": "o_01086834" + }, + { + "index": 1, + "lower": "o_01086834", + "object_count": 500000, + "upper": "o_01586834" + }, + { + "index": 2, + "lower": "o_01586834", + "object_count": 500000, + "upper": "o_02087570" + }, + { + "index": 3, + "lower": "o_02087570", + "object_count": 500000, + "upper": "o_02587572" + }, + { + "index": 4, + "lower": "o_02587572", + "object_count": 500000, + "upper": "o_03087572" + }, + { + "index": 5, + "lower": "o_03087572", + "object_count": 500000, + "upper": "o_03587572" + }, + { + "index": 6, + "lower": "o_03587572", + "object_count": 349194, + "upper": "" + } + ] + Found 7 ranges in 4.37222s (total object count 3349194) + + This command returns a list of shard ranges each of which describes the + namespace to be managed by a shard container. No other action is taken by + this command and the container database is unchanged. The output may be + redirected to a file for subsequent retrieval by the ``replace`` command. + For example:: + + $ swift-manage-shard-ranges find 500000 > my_shard_ranges + Loaded db broker for AUTH_test/c1. + Found 7 ranges in 2.448s (total object count 3349194) + +#. The ``replace`` sub-command deletes any shard ranges that might already be + in the container database and inserts shard ranges from a given file. The + file contents should be in the format generated by the ``find`` sub-command. + For example:: + + $ swift-manage-shard-ranges replace my_shard_ranges + Loaded db broker for AUTH_test/c1. + No shard ranges found to delete. + Injected 7 shard ranges. + Run container-replicator to replicate them to other nodes. + Use the enable sub-command to enable sharding. + + The container database is modified to store the shard ranges, but the + container will not start sharding until sharding is enabled. The ``info`` + sub-command may be used to inspect the state of the container database at + any point, and the ``show`` sub-command may be used to display the inserted + shard ranges. + + Shard ranges stored in the container database may be replaced using the + ``replace`` sub-command. This will first delete all existing shard ranges + before storing new shard ranges. Shard ranges may also be deleted from the + container database using the ``delete`` sub-command. + + Shard ranges should not be replaced or deleted using + ``swift-manage-shard-ranges`` once the next step of enabling sharding has + been taken. + +#. The ``enable`` sub-command enables the container for sharding. The sharder + daemon and/or container replicator daemon will replicate shard ranges to + other replicas of the container DB and the sharder daemon will proceed to + shard the container. This process may take some time depending on the size + of the container, the number of shard ranges and the underlying hardware. + + .. note:: + + Once the ``enable`` sub-command has been used there is no supported + mechanism to revert sharding. Do not use ``swift-manage-shard-ranges`` + to make any further changes to the shard ranges in the container DB. 
+ + For example:: + + $ swift-manage-shard-ranges enable + Loaded db broker for AUTH_test/c1. + Container moved to state 'sharding' with epoch 1525345093.22908. + Run container-sharder on all nodes to shard the container. + + This does not shard the container - sharding is performed by the + :ref:`sharder_daemon` - but sets the necessary state in the database for the + daemon to subsequently start the sharding process. + + The ``epoch`` value displayed in the output is the time at which sharding + was enabled. When the :ref:`sharder_daemon` starts sharding this container + it creates a new container database file using the epoch in the filename to + distinguish it from the retiring DB that is being sharded. + +All three steps may be performed with one sub-command:: + + $ swift-manage-shard-ranges find_and_replace 500000 --enable \ +--force + Loaded db broker for AUTH_test/c1. + No shard ranges found to delete. + Injected 7 shard ranges. + Run container-replicator to replicate them to other nodes. + Container moved to state 'sharding' with epoch 1525345669.46153. + Run container-sharder on all nodes to shard the container. + +""" +import argparse +import json +import os.path +import sys +import time +from contextlib import contextmanager + +from swift.common.utils import Timestamp, get_logger, ShardRange, readconf, \ + ShardRangeList, non_negative_int, config_positive_int_value +from swift.container.backend import ContainerBroker, UNSHARDED +from swift.container.sharder import make_shard_ranges, sharding_enabled, \ + CleavingContext, process_compactible_shard_sequences, \ + find_compactible_shard_sequences, find_overlapping_ranges, \ + find_paths, rank_paths, finalize_shrinking, DEFAULT_SHARDER_CONF, \ + ContainerSharderConf, find_paths_with_gaps, combine_shard_ranges, \ + update_own_shard_range_stats + +EXIT_SUCCESS = 0 +EXIT_ERROR = 1 +EXIT_INVALID_ARGS = 2 # consistent with argparse exit code for invalid args +EXIT_USER_QUIT = 3 + +MIN_SHARD_RANGE_AGE_FOR_REPAIR = 4 * 3600 + +# Some CLI options derive their default values from DEFAULT_SHARDER_CONF if +# they have not been set. It is therefore important that the CLI parser +# provides None as a default so that we can detect that no value was set on the +# command line. We use this alias to act as a reminder. +USE_SHARDER_DEFAULT = object() + + +class ManageShardRangesException(Exception): + pass + + +class GapsFoundException(ManageShardRangesException): + pass + + +class InvalidStateException(ManageShardRangesException): + pass + + +class InvalidSolutionException(ManageShardRangesException): + def __init__(self, msg, acceptor_path, overlapping_donors): + super(InvalidSolutionException, self).__init__(msg) + self.acceptor_path = acceptor_path + self.overlapping_donors = overlapping_donors + + +def wrap_for_argparse(func, msg=None): + """ + Wrap the given ``func`` to catch any ``ValueError`` and raise an + ``argparse.ArgumentTypeError`` instead. + + :param func: a function. + :param msg: an optional message to use with any exception that is used; if + not given then the string representation of the ValueError will be + used. + :return: a function wrapper. + """ + def wrapped_func(*args, **kwargs): + try: + return func(*args, **kwargs) + except ValueError as err: + raise argparse.ArgumentTypeError(str(err) if msg is None else msg) + return wrapped_func + + +def _proceed(args): + if args.dry_run: + choice = 'no' + elif args.yes: + choice = 'yes' + else: + try: + choice = input('Do you want to apply these changes to the ' + 'container DB? 
[yes/N]') + except (EOFError, KeyboardInterrupt): + choice = 'no' + if choice != 'yes': + print('No changes applied') + + return choice == 'yes' + + +def _print_shard_range(sr, level=0): + indent = ' ' * level + print(indent + '%r' % sr.name) + print(indent + ' objects: %9d, tombstones: %9d, lower: %r' + % (sr.object_count, sr.tombstones, sr.lower_str)) + print(indent + ' state: %9s, deleted: %d upper: %r' + % (sr.state_text, sr.deleted, sr.upper_str)) + + +@contextmanager +def _open_input(args): + if args.input == '-': + args.input = '' + yield sys.stdin + else: + with open(args.input, 'r') as fd: + yield fd + + +def _load_and_validate_shard_data(args, require_index=True): + required_keys = ['lower', 'upper', 'object_count'] + if require_index: + required_keys.append('index') + try: + with _open_input(args) as fd: + try: + data = json.load(fd) + if not isinstance(data, list): + raise ValueError('Shard data must be a list of dicts') + for k in required_keys: + for shard in data: + shard[k] # trigger KeyError for missing required key + return data + except (TypeError, ValueError, KeyError) as err: + print('Failed to load valid shard range data: %r' % err, + file=sys.stderr) + exit(2) + except IOError as err: + print('Failed to open file %s: %s' % (args.input, err), + file=sys.stderr) + exit(2) + + +def _check_shard_ranges(own_shard_range, shard_ranges): + reasons = [] + + def reason(x, y): + if x != y: + reasons.append('%s != %s' % (x, y)) + + if not shard_ranges: + reasons.append('No shard ranges.') + else: + reason(own_shard_range.lower, shard_ranges[0].lower) + reason(own_shard_range.upper, shard_ranges[-1].upper) + for x, y in zip(shard_ranges, shard_ranges[1:]): + reason(x.upper, y.lower) + + if reasons: + print('WARNING: invalid shard ranges: %s.' % reasons) + print('Aborting.') + exit(EXIT_ERROR) + + +def _check_own_shard_range(broker, args): + # TODO: this check is weak - if the shards prefix changes then we may not + # identify a shard container. The goal is to not inadvertently create an + # entire namespace default shard range for a shard container. + is_shard = broker.account.startswith(args.shards_account_prefix) + own_shard_range = broker.get_own_shard_range(no_default=is_shard) + if not own_shard_range: + print('WARNING: shard container missing own shard range.') + print('Aborting.') + exit(2) + return own_shard_range + + +def _find_ranges(broker, args, status_file=None): + start = last_report = time.time() + limit = 5 if status_file else -1 + shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, limit=limit, + minimum_shard_size=args.minimum_shard_size) + if shard_data: + while not last_found: + if last_report + 10 < time.time(): + print('Found %d ranges in %gs; looking for more...' 
% ( + len(shard_data), time.time() - start), file=status_file) + last_report = time.time() + # prefix doesn't matter since we aren't persisting it + found_ranges = make_shard_ranges(broker, shard_data, '.shards_') + more_shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, existing_ranges=found_ranges, limit=5, + minimum_shard_size=args.minimum_shard_size) + shard_data.extend(more_shard_data) + return shard_data, time.time() - start + + +def find_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stderr) + print(json.dumps(shard_data, sort_keys=True, indent=2)) + print('Found %d ranges in %gs (total object count %s)' % + (len(shard_data), delta_t, + sum(r['object_count'] for r in shard_data)), + file=sys.stderr) + return EXIT_SUCCESS + + +def show_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges( + includes=getattr(args, 'includes', None), + include_deleted=getattr(args, 'include_deleted', False)) + shard_data = [dict(sr, state=sr.state_text) + for sr in shard_ranges] + + if not shard_data: + print("No shard data found.", file=sys.stderr) + elif getattr(args, 'brief', False): + print("Existing shard ranges:", file=sys.stderr) + print(json.dumps([(sd['lower'], sd['upper']) for sd in shard_data], + sort_keys=True, indent=2)) + else: + print("Existing shard ranges:", file=sys.stderr) + print(json.dumps(shard_data, sort_keys=True, indent=2)) + return EXIT_SUCCESS + + +def db_info(broker, args): + print('Sharding enabled = %s' % sharding_enabled(broker)) + own_sr = broker.get_own_shard_range(no_default=True) + print('Own shard range: %s' % + (json.dumps(dict(own_sr, state=own_sr.state_text), + sort_keys=True, indent=2) + if own_sr else None)) + db_state = broker.get_db_state() + print('db_state = %s' % db_state) + info = broker.get_info() + print('object_count = %d' % info['object_count']) + print('bytes_used = %d' % info['bytes_used']) + if db_state == 'sharding': + print('Retiring db id: %s' % broker.get_brokers()[0].get_info()['id']) + print('Cleaving context: %s' % + json.dumps(dict(CleavingContext.load(broker)), + sort_keys=True, indent=2)) + print('Metadata:') + for k, (v, t) in broker.metadata.items(): + print(' %s = %s' % (k, v)) + return EXIT_SUCCESS + + +def delete_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges() + if not shard_ranges: + print("No shard ranges found to delete.") + return EXIT_SUCCESS + + while not args.force: + print('This will delete existing %d shard ranges.' % len(shard_ranges)) + if broker.get_db_state() != UNSHARDED: + print('WARNING: Be very cautious about deleting existing shard ' + 'ranges. Deleting all ranges in this db does not guarantee ' + 'deletion of all ranges on all replicas of the db.') + print(' - this db is in state %s' % broker.get_db_state()) + print(' - %d existing shard ranges have started sharding' % + [sr.state != ShardRange.FOUND + for sr in shard_ranges].count(True)) + try: + choice = input('Do you want to show the existing ranges [s], ' + 'delete the existing ranges [yes] ' + 'or quit without deleting [q]? ') + except (EOFError, KeyboardInterrupt): + choice = 'q' + + if choice == 's': + show_shard_ranges(broker, args) + continue + elif choice == 'q': + return EXIT_USER_QUIT + elif choice == 'yes': + break + else: + print('Please make a valid choice.') + print() + + now = Timestamp.now() + for sr in shard_ranges: + sr.deleted = 1 + sr.timestamp = now + broker.merge_shard_ranges(shard_ranges) + print('Deleted %s existing shard ranges.' 
% len(shard_ranges)) + return EXIT_SUCCESS + + +def merge_shard_ranges(broker, args): + _check_own_shard_range(broker, args) + shard_data = _load_and_validate_shard_data(args, require_index=False) + new_shard_ranges = ShardRangeList([ShardRange.from_dict(sr) + for sr in shard_data]) + new_shard_ranges.sort(key=ShardRange.sort_key) + + # do some checks before merging... + existing_shard_ranges = ShardRangeList( + broker.get_shard_ranges(include_deleted=True)) + outcome = combine_shard_ranges(new_shard_ranges, existing_shard_ranges) + if args.verbose: + print('This change will result in the following shard ranges in the ' + 'affected namespace:') + print(json.dumps([dict(sr) for sr in outcome], indent=2)) + overlaps = find_overlapping_ranges(outcome) + if overlaps: + print('WARNING: this change will result in shard ranges overlaps!') + paths_with_gaps = find_paths_with_gaps(outcome) + gaps = [gap for start_path, gap, end_path in paths_with_gaps + if existing_shard_ranges.includes(gap)] + if gaps: + print('WARNING: this change will result in shard ranges gaps!') + + if not _proceed(args): + return EXIT_USER_QUIT + + with broker.updated_timeout(args.replace_timeout): + broker.merge_shard_ranges(new_shard_ranges) + print('Injected %d shard ranges.' % len(new_shard_ranges)) + print('Run container-replicator to replicate them to other nodes.') + return EXIT_SUCCESS + + +def _replace_shard_ranges(broker, args, shard_data, timeout=0): + own_shard_range = _check_own_shard_range(broker, args) + shard_ranges = make_shard_ranges( + broker, shard_data, args.shards_account_prefix) + _check_shard_ranges(own_shard_range, shard_ranges) + + if args.verbose > 0: + print('New shard ranges to be injected:') + print(json.dumps([dict(sr) for sr in shard_ranges], + sort_keys=True, indent=2)) + + # Crank up the timeout in an effort to *make sure* this succeeds + with broker.updated_timeout(max(timeout, args.replace_timeout)): + delete_status = delete_shard_ranges(broker, args) + if delete_status != EXIT_SUCCESS: + return delete_status + broker.merge_shard_ranges(shard_ranges) + + print('Injected %d shard ranges.' % len(shard_ranges)) + print('Run container-replicator to replicate them to other nodes.') + if args.enable: + return enable_sharding(broker, args) + else: + print('Use the enable sub-command to enable sharding.') + return EXIT_SUCCESS + + +def replace_shard_ranges(broker, args): + shard_data = _load_and_validate_shard_data(args) + return _replace_shard_ranges(broker, args, shard_data) + + +def find_replace_shard_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stdout) + # Since we're trying to one-shot this, and the previous step probably + # took a while, make the timeout for writing *at least* that long + return _replace_shard_ranges(broker, args, shard_data, timeout=delta_t) + + +def _enable_sharding(broker, own_shard_range, args): + if own_shard_range.update_state(ShardRange.SHARDING): + own_shard_range.epoch = Timestamp.now() + own_shard_range.state_timestamp = own_shard_range.epoch + # initialise own_shard_range with current broker object stats... 
+ update_own_shard_range_stats(broker, own_shard_range) + + with broker.updated_timeout(args.enable_timeout): + broker.merge_shard_ranges([own_shard_range]) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('True', Timestamp.now().normal)}) + return own_shard_range + + +def enable_sharding(broker, args): + own_shard_range = _check_own_shard_range(broker, args) + _check_shard_ranges(own_shard_range, broker.get_shard_ranges()) + + if own_shard_range.state == ShardRange.ACTIVE: + own_shard_range = _enable_sharding(broker, own_shard_range, args) + print('Container moved to state %r with epoch %s.' % + (own_shard_range.state_text, own_shard_range.epoch.internal)) + elif own_shard_range.state == ShardRange.SHARDING: + if own_shard_range.epoch: + print('Container already in state %r with epoch %s.' % + (own_shard_range.state_text, own_shard_range.epoch.internal)) + print('No action required.') + else: + print('Container already in state %r but missing epoch.' % + own_shard_range.state_text) + own_shard_range = _enable_sharding(broker, own_shard_range, args) + print('Container in state %r given epoch %s.' % + (own_shard_range.state_text, own_shard_range.epoch.internal)) + else: + print('WARNING: container in state %s (should be active or sharding).' + % own_shard_range.state_text) + print('Aborting.') + return EXIT_ERROR + + print('Run container-sharder on all nodes to shard the container.') + return EXIT_SUCCESS + + +def compact_shard_ranges(broker, args): + if not broker.is_root_container(): + print('WARNING: Shard containers cannot be compacted.') + print('This command should be used on a root container.') + return EXIT_ERROR + + if not broker.is_sharded(): + print('WARNING: Container is not yet sharded so cannot be compacted.') + return EXIT_ERROR + + shard_ranges = broker.get_shard_ranges() + if find_overlapping_ranges([sr for sr in shard_ranges if + sr.state != ShardRange.SHRINKING]): + print('WARNING: Container has overlapping shard ranges so cannot be ' + 'compacted.') + return EXIT_ERROR + + compactible = find_compactible_shard_sequences(broker, + args.shrink_threshold, + args.expansion_limit, + args.max_shrinking, + args.max_expanding) + if not compactible: + print('No shards identified for compaction.') + return EXIT_SUCCESS + + for sequence in compactible: + if sequence[-1].state not in (ShardRange.ACTIVE, ShardRange.SHARDED): + print('ERROR: acceptor not in correct state: %s' % sequence[-1], + file=sys.stderr) + return EXIT_ERROR + + for sequence in compactible: + acceptor = sequence[-1] + donors = sequence[:-1] + print('Donor shard range(s) with total of %d rows:' + % donors.row_count) + for donor in donors: + _print_shard_range(donor, level=1) + print('can be compacted into acceptor shard range:') + _print_shard_range(acceptor, level=1) + print('Total of %d shard sequences identified for compaction.' + % len(compactible)) + print('Once applied to the broker these changes will result in shard ' + 'range compaction the next time the sharder runs.') + + if not _proceed(args): + return EXIT_USER_QUIT + + process_compactible_shard_sequences(broker, compactible) + print('Updated %s shard sequences for compaction.' 
% len(compactible)) + print('Run container-replicator to replicate the changes to other ' + 'nodes.') + print('Run container-sharder on all nodes to compact shards.') + return EXIT_SUCCESS + + +def _remove_illegal_overlapping_donors( + acceptor_path, overlapping_donors, args): + # Check parent-children relationship in overlaps between acceptors and + # donors, remove any overlapping parent or child shard range from donors. + # Note: we can use set() here, since shard range object is hashed by + # id and all shard ranges in overlapping_donors are unique already. + parent_child_donors = set() + for acceptor in acceptor_path: + parent_child_donors.update( + [donor for donor in overlapping_donors + if acceptor.is_child_of(donor) or donor.is_child_of(acceptor)]) + if parent_child_donors: + overlapping_donors = ShardRangeList( + [sr for sr in overlapping_donors + if sr not in parent_child_donors]) + print('%d donor shards ignored due to parent-child relationship ' + 'checks' % len(parent_child_donors)) + + # Check minimum age requirement in overlaps between acceptors and donors. + if args.min_shard_age == 0: + return acceptor_path, overlapping_donors + ts_now = Timestamp.now() + # Remove overlapping donor shard ranges who were created recently within + # 'min_shard_age' age limit. + qualified_donors = ShardRangeList( + [sr for sr in overlapping_donors + if float(sr.timestamp) + args.min_shard_age < float(ts_now)]) + young_donors = len(overlapping_donors) - len(qualified_donors) + if young_donors > 0: + print('%d overlapping donor shards ignored due to minimum age ' + 'limit' % young_donors) + if not qualified_donors: + return acceptor_path, None + # Remove those overlapping donors whose overlapping acceptors were created + # within age limit. + donors_with_young_overlap_acceptor = set() + for acceptor_sr in acceptor_path: + if float(acceptor_sr.timestamp) + args.min_shard_age < float(ts_now): + continue + donors_with_young_overlap_acceptor.update( + [sr for sr in qualified_donors if acceptor_sr.overlaps(sr)]) + if donors_with_young_overlap_acceptor: + qualified_donors = ShardRangeList( + [sr for sr in qualified_donors + if sr not in donors_with_young_overlap_acceptor]) + print('%d donor shards ignored due to existence of overlapping young ' + 'acceptors' % len(donors_with_young_overlap_acceptor)) + + return acceptor_path, qualified_donors + + +def _find_overlapping_donors(shard_ranges, own_sr, args): + shard_ranges = ShardRangeList(shard_ranges) + if ShardRange.SHARDING in shard_ranges.states: + # This may be over-cautious, but for now we'll avoid dealing with + # SHARDING shards (which by design will temporarily overlap with their + # sub-shards) and require repair to be re-tried once sharding has + # completed. Note that once a shard ranges moves from SHARDING to + # SHARDED state and is deleted, some replicas of the shard may still be + # in the process of sharding but we cannot detect that at the root. + raise InvalidStateException('Found shard ranges in sharding state') + if ShardRange.SHRINKING in shard_ranges.states: + # Also stop now if there are SHRINKING shard ranges: we would need to + # ensure that these were not chosen as acceptors, but for now it is + # simpler to require repair to be re-tried once shrinking has + # completes. 
+ raise InvalidStateException('Found shard ranges in shrinking state') + + paths = find_paths(shard_ranges) + ranked_paths = rank_paths(paths, own_sr) + if not (ranked_paths and ranked_paths[0].includes(own_sr)): + # individual paths do not have gaps within them; if no path spans the + # entire namespace then there must be a gap in the shard_ranges + raise GapsFoundException + + # simple repair strategy: choose the highest ranked complete sequence and + # shrink all other shard ranges into it + acceptor_path = ranked_paths[0] + acceptor_names = set(sr.name for sr in acceptor_path) + overlapping_donors = ShardRangeList([sr for sr in shard_ranges + if sr.name not in acceptor_names]) + + # check that the solution makes sense: if the acceptor path has the most + # progressed continuous cleaving, which has reached cleaved_upper, then we + # don't expect any shard ranges beyond cleaved_upper to be in states + # CLEAVED or ACTIVE, otherwise there should have been a better acceptor + # path that reached them. + cleaved_states = {ShardRange.CLEAVED, ShardRange.ACTIVE} + cleaved_upper = acceptor_path.find_lower( + lambda sr: sr.state not in cleaved_states) + beyond_cleaved = acceptor_path.filter(marker=cleaved_upper) + if beyond_cleaved.states.intersection(cleaved_states): + raise InvalidSolutionException( + 'Isolated cleaved and/or active shard ranges in acceptor path', + acceptor_path, overlapping_donors) + beyond_cleaved = overlapping_donors.filter(marker=cleaved_upper) + if beyond_cleaved.states.intersection(cleaved_states): + raise InvalidSolutionException( + 'Isolated cleaved and/or active shard ranges in donor ranges', + acceptor_path, overlapping_donors) + + return _remove_illegal_overlapping_donors( + acceptor_path, overlapping_donors, args) + + +def _fix_gaps(broker, args, paths_with_gaps): + timestamp = Timestamp.now() + solutions = [] + print('Found %d gaps:' % len(paths_with_gaps)) + for start_path, gap_range, end_path in paths_with_gaps: + if end_path[0].state == ShardRange.ACTIVE: + expanding_range = end_path[0] + solutions.append((gap_range, expanding_range)) + elif start_path[-1].state == ShardRange.ACTIVE: + expanding_range = start_path[-1] + solutions.append((gap_range, expanding_range)) + else: + expanding_range = None + print(' gap: %r - %r' + % (gap_range.lower, gap_range.upper)) + print(' apparent gap contents:') + for sr in broker.get_shard_ranges(marker=gap_range.lower, + end_marker=gap_range.upper, + include_deleted=True): + _print_shard_range(sr, 3) + if expanding_range: + print(' gap can be fixed by expanding neighbor range:') + _print_shard_range(expanding_range, 3) + else: + print('WARNING: cannot fix gap: non-ACTIVE neighbors') + + if args.max_expanding >= 0: + solutions = solutions[:args.max_expanding] + + # it's possible that an expanding range is used twice, expanding both down + # and up; if so, we only want one copy of it in our merged shard ranges + expanding_ranges = {} + for gap_range, expanding_range in solutions: + expanding_range.expand([gap_range]) + expanding_range.timestamp = timestamp + expanding_ranges[expanding_range.name] = expanding_range + + print('') + print('Repairs necessary to fill gaps.') + print('The following expanded shard range(s) will be applied to the DB:') + for expanding_range in sorted(expanding_ranges.values(), + key=lambda s: s.lower): + _print_shard_range(expanding_range, 2) + print('') + print( + 'It is recommended that no other concurrent changes are made to the \n' + 'shard ranges while fixing gaps. 
If necessary, abort this change \n' + 'and stop any auto-sharding processes before repeating this command.' + ) + print('') + + if not _proceed(args): + return EXIT_USER_QUIT + + broker.merge_shard_ranges(list(expanding_ranges.values())) + print('Run container-replicator to replicate the changes to other nodes.') + print('Run container-sharder on all nodes to fill gaps.') + return EXIT_SUCCESS + + +def repair_gaps(broker, args): + shard_ranges = broker.get_shard_ranges() + paths_with_gaps = find_paths_with_gaps(shard_ranges) + if paths_with_gaps: + return _fix_gaps(broker, args, paths_with_gaps) + else: + print('Found one complete sequence of %d shard ranges with no gaps.' + % len(shard_ranges)) + print('No repairs necessary.') + return EXIT_SUCCESS + + +def print_repair_solution(acceptor_path, overlapping_donors): + print('Donors:') + for donor in sorted(overlapping_donors): + _print_shard_range(donor, level=1) + print('Acceptors:') + for acceptor in acceptor_path: + _print_shard_range(acceptor, level=1) + + +def find_repair_solution(shard_ranges, own_sr, args): + try: + acceptor_path, overlapping_donors = _find_overlapping_donors( + shard_ranges, own_sr, args) + except GapsFoundException: + print('Found no complete sequence of shard ranges.') + print('Repairs necessary to fill gaps.') + print('Gap filling not supported by this tool. No repairs performed.') + raise + except InvalidStateException as exc: + print('WARNING: %s' % exc) + print('No repairs performed.') + raise + except InvalidSolutionException as exc: + print('ERROR: %s' % exc) + print_repair_solution(exc.acceptor_path, exc.overlapping_donors) + print('No repairs performed.') + raise + + if not overlapping_donors: + print('Found one complete sequence of %d shard ranges and no ' + 'overlapping shard ranges.' % len(acceptor_path)) + print('No repairs necessary.') + return None, None + + print('Repairs necessary to remove overlapping shard ranges.') + print('Chosen a complete sequence of %d shard ranges with current total ' + 'of %d object records to accept object records from %d overlapping ' + 'donor shard ranges.' % + (len(acceptor_path), acceptor_path.object_count, + len(overlapping_donors))) + if args.verbose: + print_repair_solution(acceptor_path, overlapping_donors) + + print('Once applied to the broker these changes will result in:') + print(' %d shard ranges being removed.' % len(overlapping_donors)) + print(' %d object records being moved to the chosen shard ranges.' + % overlapping_donors.object_count) + + return acceptor_path, overlapping_donors + + +def repair_overlaps(broker, args): + shard_ranges = broker.get_shard_ranges() + if not shard_ranges: + print('No shards found, nothing to do.') + return EXIT_SUCCESS + + own_sr = broker.get_own_shard_range() + try: + acceptor_path, overlapping_donors = find_repair_solution( + shard_ranges, own_sr, args) + except ManageShardRangesException: + return EXIT_ERROR + + if not acceptor_path: + return EXIT_SUCCESS + + if not _proceed(args): + return EXIT_USER_QUIT + + # merge changes to the broker... + # note: acceptors do not need to be modified since they already span the + # complete range + ts_now = Timestamp.now() + finalize_shrinking(broker, [], overlapping_donors, ts_now) + print('Updated %s donor shard ranges.' 
% len(overlapping_donors)) + print('Run container-replicator to replicate the changes to other nodes.') + print('Run container-sharder on all nodes to repair shards.') + return EXIT_SUCCESS + + +def repair_shard_ranges(broker, args): + if not broker.is_root_container(): + print('WARNING: Shard containers cannot be repaired.') + print('This command should be used on a root container.') + return EXIT_ERROR + if args.gaps: + return repair_gaps(broker, args) + else: + return repair_overlaps(broker, args) + + +def analyze_shard_ranges(args): + shard_data = _load_and_validate_shard_data(args, require_index=False) + for data in shard_data: + # allow for incomplete shard range data that may have been scraped from + # swift-container-info output + data.setdefault('epoch', None) + shard_ranges = [ShardRange.from_dict(data) for data in shard_data] + whole_sr = ShardRange('whole/namespace', 0) + try: + find_repair_solution(shard_ranges, whole_sr, args) + except ManageShardRangesException: + return EXIT_ERROR + return EXIT_SUCCESS + + +def _add_find_args(parser): + parser.add_argument( + 'rows_per_shard', nargs='?', type=int, default=USE_SHARDER_DEFAULT, + help='Target number of rows for newly created shards. ' + 'Default is half of the shard_container_threshold value if that is ' + 'given in a conf file specified with --config, otherwise %s.' + % DEFAULT_SHARDER_CONF['rows_per_shard']) + parser.add_argument( + '--minimum-shard-size', + type=wrap_for_argparse(config_positive_int_value, 'must be > 0'), + default=USE_SHARDER_DEFAULT, + help='Minimum size of the final shard range. If this is greater than ' + 'one then the final shard range may be extended to more than ' + 'rows_per_shard in order to avoid a further shard range with less ' + 'than minimum-shard-size rows.') + + +def _add_account_prefix_arg(parser): + parser.add_argument( + '--shards_account_prefix', metavar='shards_account_prefix', type=str, + required=False, default='.shards_', + help="Prefix for shards account. The default is '.shards_'. This " + "should only be changed if the auto_create_account_prefix option " + "has been similarly changed in swift.conf.") + + +def _add_replace_args(parser): + _add_account_prefix_arg(parser) + parser.add_argument( + '--replace-timeout', type=int, default=600, + help='Minimum DB timeout to use when replacing shard ranges.') + parser.add_argument( + '--force', '-f', action='store_true', default=False, + help='Delete existing shard ranges; no questions asked.') + parser.add_argument( + '--enable', action='store_true', default=False, + help='Enable sharding after adding shard ranges.') + + +def _add_enable_args(parser): + parser.add_argument( + '--enable-timeout', type=int, default=300, + help='DB timeout to use when enabling sharding.') + + +def _add_prompt_args(parser): + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--yes', '-y', action='store_true', default=False, + help='Apply shard range changes to broker without prompting. ' + 'Cannot be used with --dry-run option.') + group.add_argument( + '--dry-run', '-n', action='store_true', default=False, + help='Do not apply any shard range changes to broker. ' + 'Cannot be used with --yes option.') + + +def _add_skip_or_force_commits_arg(parser): + """ + We merge in the pending file by default, this is always correct and + useful for probe tests where shard containers have unrealistically low + numbers of objects, of which a significant proportion may still be in the + pending file. 
If you have 10GB databases with 100M objects you can use + --skip-commits and the selected shard ranges probably won't be that + different. The --force-commits option is redundant and may be deprecated. + """ + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--skip-commits', action='store_true', dest='skip_commits', + default=False, + help='Skip commits for pending object updates. By default the broker' + ' will commit pending object updates.') + group.add_argument( + '--force-commits', action='store_false', dest='skip_commits', + default=argparse.SUPPRESS, help=argparse.SUPPRESS) + + +def _add_max_expanding_arg(parser): + parser.add_argument( + '--max-expanding', nargs='?', + type=wrap_for_argparse(config_positive_int_value, 'must be > 0'), + default=USE_SHARDER_DEFAULT, + help='Maximum number of shards that should be ' + 'expanded. Defaults to unlimited.') + + +def _make_parser(): + parser = argparse.ArgumentParser(description='Manage shard ranges') + parser.add_argument('path_to_file', + help='Path to a container DB file or, for the analyze ' + 'subcommand, a shard data file.') + parser.add_argument('--config', dest='conf_file', required=False, + help='Path to config file with [container-sharder] ' + 'section. The following subcommand options will ' + 'be loaded from a config file if they are not ' + 'given on the command line: ' + 'rows_per_shard, ' + 'max_shrinking, ' + 'max_expanding, ' + 'shrink_threshold, ' + 'expansion_limit') + parser.add_argument('--verbose', '-v', action='count', default=0, + help='Increase output verbosity') + _add_skip_or_force_commits_arg(parser) + + subparsers = parser.add_subparsers( + dest='subcommand', help='Sub-command help', title='Sub-commands') + + # find + find_parser = subparsers.add_parser( + 'find', help='Find and display shard ranges') + _add_find_args(find_parser) + find_parser.set_defaults(func=find_ranges) + + # delete + delete_parser = subparsers.add_parser( + 'delete', help='Delete all existing shard ranges from db') + delete_parser.add_argument( + '--force', '-f', action='store_true', default=False, + help='Delete existing shard ranges; no questions asked.') + delete_parser.set_defaults(func=delete_shard_ranges) + + # show + show_parser = subparsers.add_parser( + 'show', help='Print shard range data') + show_parser.add_argument( + '--include_deleted', '-d', action='store_true', default=False, + help='Include deleted shard ranges in output.') + show_parser.add_argument( + '--brief', '-b', action='store_true', default=False, + help='Show only shard range bounds in output.') + show_parser.add_argument('--includes', + help='limit shard ranges to include key') + show_parser.set_defaults(func=show_shard_ranges) + + # info + info_parser = subparsers.add_parser( + 'info', help='Print container db info') + info_parser.set_defaults(func=db_info) + + # merge + merge_parser = subparsers.add_parser( + 'merge', + help='Merge shard range(s) from file with existing shard ranges. This ' + 'subcommand should only be used if you are confident that you ' + 'know what you are doing. 
Shard ranges should not typically be ' + 'modified in this way.') + merge_parser.add_argument('input', metavar='input_file', + type=str, help='Name of file') + merge_parser.add_argument( + '--replace-timeout', type=int, default=600, + help='Minimum DB timeout to use when merging shard ranges.') + _add_account_prefix_arg(merge_parser) + _add_prompt_args(merge_parser) + merge_parser.set_defaults(func=merge_shard_ranges) + + # replace + replace_parser = subparsers.add_parser( + 'replace', + help='Replace existing shard ranges. User will be prompted before ' + 'deleting any existing shard ranges.') + replace_parser.add_argument('input', metavar='input_file', + type=str, help='Name of file') + _add_replace_args(replace_parser) + replace_parser.set_defaults(func=replace_shard_ranges) + + # find_and_replace + find_replace_parser = subparsers.add_parser( + 'find_and_replace', + help='Find new shard ranges and replace existing shard ranges. ' + 'User will be prompted before deleting any existing shard ranges.' + ) + _add_find_args(find_replace_parser) + _add_replace_args(find_replace_parser) + _add_enable_args(find_replace_parser) + find_replace_parser.set_defaults(func=find_replace_shard_ranges) + + # enable + enable_parser = subparsers.add_parser( + 'enable', help='Enable sharding and move db to sharding state.') + _add_enable_args(enable_parser) + enable_parser.set_defaults(func=enable_sharding) + _add_replace_args(enable_parser) + + # compact + compact_parser = subparsers.add_parser( + 'compact', + help='Compact shard ranges with less than the shrink-threshold number ' + 'of rows. This command only works on root containers.') + _add_prompt_args(compact_parser) + compact_parser.add_argument( + '--shrink-threshold', nargs='?', + type=wrap_for_argparse(config_positive_int_value, 'must be > 0'), + default=USE_SHARDER_DEFAULT, + help='The number of rows below which a shard can qualify for ' + 'shrinking. ' + 'Defaults to %d' % DEFAULT_SHARDER_CONF['shrink_threshold']) + compact_parser.add_argument( + '--expansion-limit', nargs='?', + type=wrap_for_argparse(config_positive_int_value, 'must be > 0'), + default=USE_SHARDER_DEFAULT, + help='Maximum number of rows for an expanding shard to have after ' + 'compaction has completed. ' + 'Defaults to %d' % DEFAULT_SHARDER_CONF['expansion_limit']) + # If just one donor shard is chosen to shrink to an acceptor then the + # expanded acceptor will handle object listings as soon as the donor shard + # has shrunk. If more than one donor shard are chosen to shrink to an + # acceptor then the acceptor may not handle object listings for some donor + # shards that have shrunk until *all* donors have shrunk, resulting in + # temporary gap(s) in object listings where the shrunk donors are missing. + compact_parser.add_argument( + '--max-shrinking', nargs='?', + type=wrap_for_argparse(config_positive_int_value, 'must be > 0'), + default=USE_SHARDER_DEFAULT, + help='Maximum number of shards that should be ' + 'shrunk into each expanding shard. ' + 'Defaults to 1. Using values greater ' + 'than 1 may result in temporary gaps in ' + 'object listings until all selected ' + 'shards have shrunk.') + _add_max_expanding_arg(compact_parser) + compact_parser.set_defaults(func=compact_shard_ranges) + + # repair + repair_parser = subparsers.add_parser( + 'repair', + help='Repair overlapping shard ranges. 
No action will be taken ' + 'without user confirmation unless the -y option is used.') + _add_prompt_args(repair_parser) + repair_parser.add_argument( + '--min-shard-age', nargs='?', + type=wrap_for_argparse(non_negative_int, 'must be >= 0'), + default=MIN_SHARD_RANGE_AGE_FOR_REPAIR, + help='Minimum age of a shard for it to be considered as an overlap ' + 'that is due for repair. Overlapping shards younger than this ' + 'age will be ignored. Value of 0 means no recent shards will be ' + 'ignored. Defaults to %d.' % MIN_SHARD_RANGE_AGE_FOR_REPAIR) + # TODO: maybe this should be a separate subcommand given that it needs + # some extra options vs repairing overlaps? + repair_parser.add_argument( + '--gaps', action='store_true', default=False, + help='Repair gaps in shard ranges.') + _add_max_expanding_arg(repair_parser) + repair_parser.set_defaults(func=repair_shard_ranges) + + # analyze + analyze_parser = subparsers.add_parser( + 'analyze', + help='Analyze shard range json data read from file. Use -v to see ' + 'more detailed analysis.') + analyze_parser.add_argument( + '--min-shard-age', nargs='?', + type=wrap_for_argparse(non_negative_int, 'must be >= 0'), + default=0, + help='Minimum age of a shard for it to be considered as an overlap ' + 'that is due for repair. Overlapping shards younger than this ' + 'age will be ignored. Value of 0 means no recent shards will be ' + 'ignored. Defaults to 0.') + analyze_parser.set_defaults(func=analyze_shard_ranges) + + return parser + + +def main(cli_args=None): + parser = _make_parser() + args = parser.parse_args(cli_args) + if not args.subcommand: + # On py2, subparsers are required; on py3 they are not; see + # https://bugs.python.org/issue9253. py37 added a `required` kwarg + # to let you control it, but prior to that, there was no choice in + # the matter. So, check whether the destination was set and bomb + # out if not. 
+ parser.print_help() + print('\nA sub-command is required.', file=sys.stderr) + return EXIT_INVALID_ARGS + + try: + conf = {} + if args.conf_file: + conf = readconf(args.conf_file, 'container-sharder') + conf.update(dict((k, v) for k, v in vars(args).items() + if v != USE_SHARDER_DEFAULT)) + conf_args = ContainerSharderConf(conf) + except (OSError, IOError) as exc: + print('Error opening config file %s: %s' % (args.conf_file, exc), + file=sys.stderr) + return EXIT_ERROR + except (TypeError, ValueError) as exc: + print('Error loading config: %s' % exc, file=sys.stderr) + return EXIT_INVALID_ARGS + + for k, v in vars(args).items(): + # set any un-set cli args from conf_args + if v is USE_SHARDER_DEFAULT: + setattr(args, k, getattr(conf_args, k)) + + try: + ContainerSharderConf.validate_conf(args) + except ValueError as err: + print('Invalid config: %s' % err, file=sys.stderr) + return EXIT_INVALID_ARGS + + if args.func in (analyze_shard_ranges,): + args.input = args.path_to_file + return args.func(args) or 0 + + logger = get_logger({}, name='ContainerBroker', log_to_console=True) + broker = ContainerBroker(os.path.realpath(args.path_to_file), + logger=logger, + skip_commits=args.skip_commits) + try: + broker.get_info() + except Exception as exc: + print('Error opening container DB %s: %s' % (args.path_to_file, exc), + file=sys.stderr) + return EXIT_ERROR + print('Loaded db broker for %s' % broker.path, file=sys.stderr) + return args.func(broker, args) + + +if __name__ == '__main__': + exit(main()) diff --git a/swift/cli/oldies.py b/swift/cli/oldies.py new file mode 100755 index 0000000000..7c6abe0bb2 --- /dev/null +++ b/swift/cli/oldies.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import optparse +import subprocess +import sys + + +def main(): + parser = optparse.OptionParser(usage='''%prog [options] + +Lists old Swift processes. 
+ '''.strip()) + parser.add_option('-a', '--age', dest='hours', type='int', default=720, + help='look for processes at least HOURS old; ' + 'default: 720 (30 days)') + parser.add_option('-p', '--pids', action='store_true', + help='only print the pids found; for example, to pipe ' + 'to xargs kill') + (options, args) = parser.parse_args() + + listing = [] + for line in subprocess.Popen( + ['ps', '-eo', 'etime,pid,args', '--no-headers'], + stdout=subprocess.PIPE).communicate()[0].split(b'\n'): + if not line: + continue + hours = 0 + try: + etime, pid, args = line.decode('ascii').split(None, 2) + except ValueError: + # This covers both decoding and not-enough-values-to-unpack errors + sys.exit('Could not process ps line %r' % line) + if not args.startswith(( + '/usr/bin/python /usr/bin/swift-', + '/usr/bin/python /usr/local/bin/swift-', + '/bin/python /usr/bin/swift-', + '/usr/bin/python3 /usr/bin/swift-', + '/usr/bin/python3 /usr/local/bin/swift-', + '/bin/python3 /usr/bin/swift-')): + continue + args = args.split('-', 1)[1] + etime = etime.split('-') + if len(etime) == 2: + hours = int(etime[0]) * 24 + etime = etime[1] + elif len(etime) == 1: + etime = etime[0] + else: + sys.exit('Could not process etime value from %r' % line) + etime = etime.split(':') + if len(etime) == 3: + hours += int(etime[0]) + elif len(etime) != 2: + sys.exit('Could not process etime value from %r' % line) + if hours >= options.hours: + listing.append((str(hours), pid, args)) + + if not listing: + sys.exit() + + if options.pids: + for hours, pid, args in listing: + print(pid) + else: + hours_len = len('Hours') + pid_len = len('PID') + args_len = len('Command') + for hours, pid, args in listing: + hours_len = max(hours_len, len(hours)) + pid_len = max(pid_len, len(pid)) + args_len = max(args_len, len(args)) + args_len = min(args_len, 78 - hours_len - pid_len) + + print('%*s %*s %s' % (hours_len, 'Hours', pid_len, 'PID', 'Command')) + for hours, pid, args in listing: + print('%*s %*s %s' % (hours_len, hours, pid_len, + pid, args[:args_len])) + + +if __name__ == '__main__': + main() diff --git a/bin/swift-orphans b/swift/cli/orphans.py similarity index 64% rename from bin/swift-orphans rename to swift/cli/orphans.py index 3eb799c0e5..d957afd7d4 100755 --- a/bin/swift-orphans +++ b/swift/cli/orphans.py @@ -1,13 +1,28 @@ #!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. import optparse import os +import re import signal import subprocess import sys +from swift.common.manager import RUN_DIR -if __name__ == '__main__': + +def main(): parser = optparse.OptionParser(usage='''%prog [options] Lists and optionally kills orphaned Swift processes. 
This is done by scanning @@ -28,36 +43,44 @@ parser.add_option('-w', '--wide', dest='wide', default=False, action='store_true', help="don't clip the listing at 80 characters") + parser.add_option('-r', '--run-dir', type="str", + dest="run_dir", default=RUN_DIR, + help="alternative directory to store running pid files " + "default: %s" % RUN_DIR) (options, args) = parser.parse_args() pids = [] - for root, directories, files in os.walk('/var/run/swift'): + + for root, directories, files in os.walk(options.run_dir): for name in files: - if name.endswith('.pid'): + if name.endswith(('.pid', '.pid.d')): pids.append(open(os.path.join(root, name)).read().strip()) pids.extend(subprocess.Popen( ['ps', '--ppid', pids[-1], '-o', 'pid', '--no-headers'], - stdout=subprocess.PIPE).communicate()[0].split()) + stdout=subprocess.PIPE).communicate()[0].decode().split()) listing = [] + swift_cmd_re = re.compile( + '^/usr/bin/python[23]? /usr(?:/local)?/bin/swift-') for line in subprocess.Popen( ['ps', '-eo', 'etime,pid,args', '--no-headers'], - stdout=subprocess.PIPE).communicate()[0].split('\n'): + stdout=subprocess.PIPE).communicate()[0].split(b'\n'): if not line: continue hours = 0 try: - etime, pid, args = line.split(None, 2) + etime, pid, args = line.decode('ascii').split(None, 2) except ValueError: sys.exit('Could not process ps line %r' % line) if pid in pids: continue - if (not args.startswith('/usr/bin/python /usr/bin/swift-') and - not args.startswith('/usr/bin/python /usr/local/bin/swift-')) or \ - 'swift-orphans' in args or \ - 'once' in args.split(): + if any([ + not swift_cmd_re.match(args), + 'swift-orphans' in args, + 'once' in args.split(), + ]): continue - args = args.split('-', 1)[1] + args = args.split('swift-', 1)[1] etime = etime.split('-') if len(etime) == 2: hours = int(etime[0]) * 24 @@ -75,7 +98,7 @@ listing.append((str(hours), pid, args)) if not listing: - exit() + sys.exit() hours_len = len('Hours') pid_len = len('PID') @@ -86,11 +109,11 @@ args_len = max(args_len, len(args)) args_len = min(args_len, 78 - hours_len - pid_len) - print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ - ('Hours', 'PID', 'Command') + print('%*s %*s %s' % + (hours_len, 'Hours', pid_len, 'PID', 'Command')) for hours, pid, args in listing: - print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ - (hours, pid, args[:args_len]) + print('%*s %*s %s' % + (hours_len, hours, pid_len, pid, args[:args_len])) if options.signal: try: @@ -102,7 +125,12 @@ if not signum: sys.exit('Could not translate %r to a signal number.' % options.signal) - print 'Sending processes %s (%d) signal...' % (options.signal, signum), + print('Sending processes %s (%d) signal...' % (options.signal, signum), + end='') for hours, pid, args in listing: os.kill(int(pid), signum) - print 'Done.' + print('Done.') + + +if __name__ == '__main__': + main() diff --git a/swift/cli/recon.py b/swift/cli/recon.py new file mode 100644 index 0000000000..d693be287a --- /dev/null +++ b/swift/cli/recon.py @@ -0,0 +1,1295 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" + cmdline utility to perform cluster reconnaissance +""" + + +from eventlet.green import socket +from urllib.parse import urlparse + +from swift.common.utils import ( + SWIFT_CONF_FILE, md5_hash_for_file, set_swift_dir) +from swift.common.ring import Ring +from swift.common.storage_policy import POLICIES, reload_storage_policies +import eventlet +import json +import optparse +import time +import sys +import os + +from eventlet.green.urllib import request as urllib_request + + +def seconds2timeunit(seconds): + elapsed = seconds + unit = 'seconds' + if elapsed >= 60: + elapsed = elapsed / 60.0 + unit = 'minutes' + if elapsed >= 60: + elapsed = elapsed / 60.0 + unit = 'hours' + if elapsed >= 24: + elapsed = elapsed / 24.0 + unit = 'days' + return elapsed, unit + + +def size_suffix(size): + suffixes = ['bytes', 'kB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'] + for suffix in suffixes: + if size < 1000: + return "%s %s" % (size, suffix) + size = size // 1000 + return "%s %s" % (size, suffix) + + +class Scout(object): + """ + Obtain swift recon information + """ + + def __init__(self, recon_type, verbose=False, suppress_errors=False, + timeout=5): + self.recon_type = recon_type + self.verbose = verbose + self.suppress_errors = suppress_errors + self.timeout = timeout + + def scout_host(self, base_url, recon_type): + """ + Perform the actual HTTP request to obtain swift recon telemetry. + + :param base_url: the base url of the host you wish to check. str of the + format 'http://127.0.0.1:6200/recon/' + :param recon_type: the swift recon check to request. + :returns: tuple of (recon url used, response body, and status) + """ + url = base_url + recon_type + try: + body = urllib_request.urlopen(url, timeout=self.timeout).read() + if isinstance(body, bytes): + body = body.decode('utf8') + content = json.loads(body) + if self.verbose: + print("-> %s: %s" % (url, content)) + status = 200 + except urllib_request.HTTPError as err: + if not self.suppress_errors or self.verbose: + print("-> %s: %s" % (url, err)) + content = err + status = err.code + except (urllib_request.URLError, socket.timeout) as err: + if not self.suppress_errors or self.verbose: + print("-> %s: %s" % (url, err)) + content = err + status = -1 + return url, content, status + + def scout(self, host): + """ + Obtain telemetry from a host running the swift recon middleware. + + :param host: host to check + :returns: tuple of (recon url used, response body, status, time start + and time end) + """ + base_url = "http://%s:%s/recon/" % (host[0], host[1]) + ts_start = time.time() + url, content, status = self.scout_host(base_url, self.recon_type) + ts_end = time.time() + return url, content, status, ts_start, ts_end + + def scout_server_type(self, host): + """ + Obtain Server header by calling OPTIONS. 
+ + :param host: host to check + :returns: Server type, status + """ + try: + url = "http://%s:%s/" % (host[0], host[1]) + req = urllib_request.Request(url) + req.get_method = lambda: 'OPTIONS' + conn = urllib_request.urlopen(req) + header = conn.info().get('Server') + server_header = header.split('/') + content = server_header[0] + status = 200 + except urllib_request.HTTPError as err: + if not self.suppress_errors or self.verbose: + print("-> %s: %s" % (url, err)) + content = err + status = err.code + except (urllib_request.URLError, socket.timeout) as err: + if not self.suppress_errors or self.verbose: + print("-> %s: %s" % (url, err)) + content = err + status = -1 + return url, content, status + + +class SwiftRecon(object): + """ + Retrieve and report cluster info from hosts running recon middleware. + """ + + def __init__(self): + self.verbose = False + self.suppress_errors = False + self.timeout = 5 + self.pool_size = 30 + self.pool = eventlet.GreenPool(self.pool_size) + self.check_types = ['account', 'container', 'object'] + self.server_type = 'object' + + def _gen_stats(self, stats, name=None): + """Compute various stats from a list of values.""" + cstats = [x for x in stats if x is not None] + if len(cstats) > 0: + ret_dict = {'low': min(cstats), 'high': max(cstats), + 'total': sum(cstats), 'reported': len(cstats), + 'number_none': len(stats) - len(cstats), 'name': name} + ret_dict['average'] = ret_dict['total'] / float(len(cstats)) + ret_dict['perc_none'] = \ + ret_dict['number_none'] * 100.0 / len(stats) + else: + ret_dict = {'reported': 0} + return ret_dict + + def _print_stats(self, stats): + """ + print out formatted stats to console + + :param stats: dict of stats generated by _gen_stats + """ + print('[%(name)s] low: %(low)d, high: %(high)d, avg: ' + '%(average).1f, total: %(total)d, ' + 'Failed: %(perc_none).1f%%, no_result: %(number_none)d, ' + 'reported: %(reported)d' % stats) + + def _ptime(self, timev=None): + """ + :param timev: a unix timestamp or None + :returns: a pretty string of the current time or provided time in UTC + """ + if timev: + return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(timev)) + else: + return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + + def get_hosts(self, region_filter, zone_filter, swift_dir, ring_names): + """ + Get a list of hosts in the rings. + + :param region_filter: Only list regions matching given filter + :param zone_filter: Only list zones matching given filter + :param swift_dir: Directory of swift config, usually /etc/swift + :param ring_names: Collection of ring names, such as + ['object', 'object-2'] + :returns: a set of tuples containing the ip and port of hosts + """ + rings = [Ring(swift_dir, ring_name=n) for n in ring_names] + devs = [d for r in rings for d in r.devs if d] + if region_filter is not None: + devs = [d for d in devs if d['region'] == region_filter] + if zone_filter is not None: + devs = [d for d in devs if d['zone'] == zone_filter] + return set((d['ip'], d['port']) for d in devs) + + def get_ringmd5(self, hosts, swift_dir): + """ + Compare ring md5sum's with those on remote host + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + :param swift_dir: The local directory with the ring files. 
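+
+ Illustrative example (a sketch, not part of the original change):
+ with server_type 'object' and a swift_dir holding object.ring.gz and
+ object-1.ring.gz, the local md5 of both ring files is compared against
+ each host's /recon/ringmd5 response.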
+ """ + matches = 0 + errors = 0 + ring_names = set() + if self.server_type == 'object': + for ring_name in os.listdir(swift_dir): + if ring_name.startswith('object') and \ + ring_name.endswith('.ring.gz'): + ring_names.add(ring_name) + else: + ring_name = '%s.ring.gz' % self.server_type + ring_names.add(ring_name) + rings = {} + for ring_name in ring_names: + rings[ring_name] = md5_hash_for_file( + os.path.join(swift_dir, ring_name)) + recon = Scout("ringmd5", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking ring md5sums" % self._ptime()) + if self.verbose: + for ring_file, ring_sum in rings.items(): + print("-> On disk %s md5sum: %s" % (ring_file, ring_sum)) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status != 200: + errors = errors + 1 + continue + success = True + for remote_ring_file, remote_ring_sum in response.items(): + remote_ring_name = os.path.basename(remote_ring_file) + if not remote_ring_name.startswith(self.server_type): + continue + ring_sum = rings.get(remote_ring_name, None) + if remote_ring_sum != ring_sum: + success = False + print("!! %s (%s => %s) doesn't match on disk md5sum" % ( + url, remote_ring_name, remote_ring_sum)) + if not success: + errors += 1 + continue + matches += 1 + if self.verbose: + print("-> %s matches." % url) + print("%s/%s hosts matched, %s error[s] while checking hosts." % ( + matches, len(hosts), errors)) + print("=" * 79) + + def get_swiftconfmd5(self, hosts, printfn=print): + """ + Compare swift.conf md5sum with that on remote hosts + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + :param printfn: function to print text; defaults to print() + """ + matches = 0 + errors = 0 + conf_sum = md5_hash_for_file(SWIFT_CONF_FILE) + recon = Scout("swiftconfmd5", self.verbose, self.suppress_errors, + self.timeout) + printfn("[%s] Checking swift.conf md5sum" % self._ptime()) + if self.verbose: + printfn("-> On disk swift.conf md5sum: %s" % (conf_sum,)) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + if response[SWIFT_CONF_FILE] != conf_sum: + printfn("!! %s (%s) doesn't match on disk md5sum" % + (url, response[SWIFT_CONF_FILE])) + else: + matches = matches + 1 + if self.verbose: + printfn("-> %s matches." % url) + else: + errors = errors + 1 + printfn("%s/%s hosts matched, %s error[s] while checking hosts." + % (matches, len(hosts), errors)) + printfn("=" * 79) + + def async_check(self, hosts): + """ + Obtain and print async pending statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + scan = {} + recon = Scout("async", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking async pendings" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + scan[url] = response['async_pending'] + stats = self._gen_stats(scan.values(), 'async_pending') + if stats['reported'] > 0: + self._print_stats(stats) + else: + print("[async_pending] - No hosts returned valid data.") + print("=" * 79) + + def driveaudit_check(self, hosts): + """ + Obtain and print drive audit error statistics + + :param hosts: set of hosts to check. 
in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)] + """ + scan = {} + recon = Scout("driveaudit", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking drive-audit errors" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + scan[url] = response['drive_audit_errors'] + stats = self._gen_stats(scan.values(), 'drive_audit_errors') + if stats['reported'] > 0: + self._print_stats(stats) + else: + print("[drive_audit_errors] - No hosts returned valid data.") + print("=" * 79) + + def umount_check(self, hosts): + """ + Check for and print unmounted drives + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + unmounted = {} + errors = {} + recon = Scout("unmounted", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Getting unmounted drives from %s hosts..." % + (self._ptime(), len(hosts))) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + unmounted[url] = [] + errors[url] = [] + for i in response: + if not isinstance(i['mounted'], bool): + errors[url].append(i['device']) + else: + unmounted[url].append(i['device']) + for host in unmounted: + node = urlparse(host).netloc + for entry in unmounted[host]: + print("Not mounted: %s on %s" % (entry, node)) + for host in errors: + node = urlparse(host).netloc + for entry in errors[host]: + print("Device errors: %s on %s" % (entry, node)) + print("=" * 79) + + def server_type_check(self, hosts): + """ + Check for server types on the ring + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + errors = {} + recon = Scout("server_type_check", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Validating server type '%s' on %s hosts..." % + (self._ptime(), self.server_type, len(hosts))) + for url, response, status in self.pool.imap( + recon.scout_server_type, hosts): + if status == 200: + if response != self.server_type + '-server': + errors[url] = response + print("%s/%s hosts ok, %s error[s] while checking hosts." % ( + len(hosts) - len(errors), len(hosts), len(errors))) + for host in errors: + print("Invalid: %s is %s" % (host, errors[host])) + print("=" * 79) + + def expirer_check(self, hosts): + """ + Obtain and print expirer statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + stats = {'object_expiration_pass': [], 'expired_last_pass': []} + recon = Scout("expirer/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on expirers" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats['object_expiration_pass'].append( + response.get('object_expiration_pass')) + stats['expired_last_pass'].append( + response.get('expired_last_pass')) + for k in stats: + if stats[k]: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[%s] - No hosts returned valid data." % k) + else: + print("[%s] - No hosts returned valid data." % k) + print("=" * 79) + + def _calculate_least_and_most_recent(self, url_time_data): + """calulate and print the least and most recent urls + + Given a list of url and time tuples calulate the most and least + recent timings and print it out. 
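+
+ Illustrative example (a sketch, not part of the original change): given
+ [('http://h1:6200/recon/replication/object', 100.0),
+ ('http://h2:6200/recon/replication/object', 200.0)], the oldest
+ completion is attributed to h1:6200 and the most recent to h2:6200.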
+ :param url_time_data: list of url and time tuples: [(url, time_), ..] + """ + least_recent_time = 9999999999 + least_recent_url = None + most_recent_time = 0 + most_recent_url = None + + for url, last in url_time_data: + if last is None: + continue + if last < least_recent_time: + least_recent_time = last + least_recent_url = url + if last > most_recent_time: + most_recent_time = last + most_recent_url = url + + if least_recent_url is not None: + host = urlparse(least_recent_url).netloc + if not least_recent_time: + print('Oldest completion was NEVER by %s.' % host) + else: + elapsed = time.time() - least_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print('Oldest completion was %s (%d %s ago) by %s.' % ( + self._ptime(least_recent_time), + elapsed, elapsed_unit, host)) + if most_recent_url is not None: + host = urlparse(most_recent_url).netloc + elapsed = time.time() - most_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print('Most recent completion was %s (%d %s ago) by %s.' % ( + self._ptime(most_recent_time), + elapsed, elapsed_unit, host)) + + def reconstruction_check(self, hosts): + """ + Obtain and print reconstructon statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + stats = [] + last_stats = [] + recon = Scout("reconstruction/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on reconstructors" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats.append(response.get('object_reconstruction_time')) + last = response.get('object_reconstruction_last', 0) + last_stats.append((url, last)) + if stats: + computed = self._gen_stats(stats, + name='object_reconstruction_time') + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[object_reconstruction_time] - No hosts returned " + "valid data.") + else: + print("[object_reconstruction_time] - No hosts returned " + "valid data.") + self._calculate_least_and_most_recent(last_stats) + print("=" * 79) + + def replication_check(self, hosts): + """ + Obtain and print replication statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + stats = {'replication_time': [], 'failure': [], 'success': [], + 'attempted': []} + last_stats = [] + recon = Scout("replication/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on replication" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats['replication_time'].append( + response.get('replication_time', + response.get('object_replication_time', 0))) + repl_stats = response.get('replication_stats') + if repl_stats: + for stat_key in ['attempted', 'failure', 'success']: + stats[stat_key].append(repl_stats.get(stat_key)) + last = response.get('replication_last', + response.get('object_replication_last', 0)) + last_stats.append((url, last)) + for k in stats: + if stats[k]: + if k != 'replication_time': + computed = self._gen_stats(stats[k], + name='replication_%s' % k) + else: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[%s] - No hosts returned valid data." % k) + else: + print("[%s] - No hosts returned valid data." 
% k) + self._calculate_least_and_most_recent(last_stats) + print("=" * 79) + + def updater_check(self, hosts): + """ + Obtain and print updater statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + stats = [] + recon = Scout("updater/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking updater times" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + if response['%s_updater_sweep' % self.server_type]: + stats.append(response['%s_updater_sweep' % + self.server_type]) + if len(stats) > 0: + computed = self._gen_stats(stats, name='updater_last_sweep') + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[updater_last_sweep] - No hosts returned valid data.") + else: + print("[updater_last_sweep] - No hosts returned valid data.") + print("=" * 79) + + def auditor_check(self, hosts): + """ + Obtain and print obj auditor statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + scan = {} + adone = '%s_auditor_pass_completed' % self.server_type + afail = '%s_audits_failed' % self.server_type + apass = '%s_audits_passed' % self.server_type + asince = '%s_audits_since' % self.server_type + recon = Scout("auditor/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking auditor stats" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + scan[url] = response + if len(scan) < 1: + print("Error: No hosts available") + return + stats = {} + stats[adone] = [scan[i][adone] for i in scan + if scan[i][adone] is not None] + stats[afail] = [scan[i][afail] for i in scan + if scan[i][afail] is not None] + stats[apass] = [scan[i][apass] for i in scan + if scan[i][apass] is not None] + stats[asince] = [scan[i][asince] for i in scan + if scan[i][asince] is not None] + for k in stats: + if len(stats[k]) < 1: + print("[%s] - No hosts returned valid data." % k) + else: + if k != asince: + computed = self._gen_stats(stats[k], k) + if computed['reported'] > 0: + self._print_stats(computed) + if len(stats[asince]) >= 1: + low = min(stats[asince]) + high = max(stats[asince]) + total = sum(stats[asince]) + average = total / len(stats[asince]) + print('[last_pass] oldest: %s, newest: %s, avg: %s' % + (self._ptime(low), self._ptime(high), self._ptime(average))) + print("=" * 79) + + def nested_get_value(self, key, recon_entry): + """ + Generator that yields all values for given key in a recon cache entry. + This is for use with object auditor recon cache entries. If the + object auditor has run in parallel, the recon cache will have entries + of the form: {'object_auditor_stats_ALL': { 'disk1': {..}, + 'disk2': {..}, + 'disk3': {..}, + ...}} + If the object auditor hasn't run in parallel, the recon cache will have + entries of the form: {'object_auditor_stats_ALL': {...}}. + The ZBF auditor doesn't run in parallel. However, if a subset of + devices is selected for auditing, the recon cache will have an entry + of the form: {'object_auditor_stats_ZBF': { 'disk1disk2..diskN': {}} + We use this generator to find all instances of a particular key in + these multi-level dictionaries. 
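+
+ For example (an illustrative sketch, not part of the original change),
+ given {'object_auditor_stats_ALL': {'disk1': {'errors': 1},
+ 'disk2': {'errors': 2}}}, nested_get_value('errors', entry) yields
+ 1 and then 2, so callers can take sum(...) to get a per-host total of 3.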
+ """ + for k, v in recon_entry.items(): + if isinstance(v, dict): + for value in self.nested_get_value(key, v): + yield value + if k == key: + yield v + + def object_auditor_check(self, hosts): + """ + Obtain and print obj auditor statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + all_scan = {} + zbf_scan = {} + atime = 'audit_time' + bprocessed = 'bytes_processed' + passes = 'passes' + errors = 'errors' + quarantined = 'quarantined' + recon = Scout("auditor/object", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking auditor stats " % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + if response['object_auditor_stats_ALL']: + all_scan[url] = response['object_auditor_stats_ALL'] + if response['object_auditor_stats_ZBF']: + zbf_scan[url] = response['object_auditor_stats_ZBF'] + if len(all_scan) > 0: + stats = {} + stats[atime] = [sum(self.nested_get_value(atime, all_scan[i])) + for i in all_scan] + stats[bprocessed] = [sum(self.nested_get_value(bprocessed, + all_scan[i])) for i in all_scan] + stats[passes] = [sum(self.nested_get_value(passes, all_scan[i])) + for i in all_scan] + stats[errors] = [sum(self.nested_get_value(errors, all_scan[i])) + for i in all_scan] + stats[quarantined] = [sum(self.nested_get_value(quarantined, + all_scan[i])) for i in all_scan] + for k in stats: + if None in stats[k]: + stats[k] = [x for x in stats[k] if x is not None] + if len(stats[k]) < 1: + print("[Auditor %s] - No hosts returned valid data." % k) + else: + computed = self._gen_stats(stats[k], + name='ALL_%s_last_path' % k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[ALL_auditor] - No hosts returned valid data.") + else: + print("[ALL_auditor] - No hosts returned valid data.") + if len(zbf_scan) > 0: + stats = {} + stats[atime] = [sum(self.nested_get_value(atime, zbf_scan[i])) + for i in zbf_scan] + stats[bprocessed] = [sum(self.nested_get_value(bprocessed, + zbf_scan[i])) for i in zbf_scan] + stats[errors] = [sum(self.nested_get_value(errors, zbf_scan[i])) + for i in zbf_scan] + stats[quarantined] = [sum(self.nested_get_value(quarantined, + zbf_scan[i])) for i in zbf_scan] + for k in stats: + if None in stats[k]: + stats[k] = [x for x in stats[k] if x is not None] + if len(stats[k]) < 1: + print("[Auditor %s] - No hosts returned valid data." % k) + else: + computed = self._gen_stats(stats[k], + name='ZBF_%s_last_path' % k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[ZBF_auditor] - No hosts returned valid data.") + else: + print("[ZBF_auditor] - No hosts returned valid data.") + print("=" * 79) + + def sharding_check(self, hosts): + """ + Obtain and print sharding statistics + + :param hosts: set of hosts to check. 
in the format of: + set([('127.0.0.1', 6221), ('127.0.0.2', 6231)]) + """ + stats = {'sharding_time': [], + 'attempted': [], 'failure': [], 'success': []} + recon = Scout("sharding", self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on sharders" % self._ptime()) + least_recent_time = 9999999999 + least_recent_url = None + most_recent_time = 0 + most_recent_url = None + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats['sharding_time'].append(response.get('sharding_time', 0)) + shard_stats = response.get('sharding_stats') + if shard_stats: + # Sharding has a ton more stats, like "no_change". + # Not sure if we need them at all, or maybe for -v. + for stat_key in ['attempted', 'failure', 'success']: + stats[stat_key].append(shard_stats.get(stat_key)) + last = response.get('sharding_last', 0) + if last is None: + continue + if last < least_recent_time: + least_recent_time = last + least_recent_url = url + if last > most_recent_time: + most_recent_time = last + most_recent_url = url + for k in stats: + if stats[k]: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[%s] - No hosts returned valid data." % k) + else: + print("[%s] - No hosts returned valid data." % k) + if least_recent_url is not None: + host = urlparse(least_recent_url).netloc + if not least_recent_time: + print('Oldest completion was NEVER by %s.' % host) + else: + elapsed = time.time() - least_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print('Oldest completion was %s (%d %s ago) by %s.' % ( + self._ptime(least_recent_time), + elapsed, elapsed_unit, host)) + if most_recent_url is not None: + host = urlparse(most_recent_url).netloc + elapsed = time.time() - most_recent_time + elapsed, elapsed_unit = seconds2timeunit(elapsed) + print('Most recent completion was %s (%d %s ago) by %s.' % ( + self._ptime(most_recent_time), + elapsed, elapsed_unit, host)) + print("=" * 79) + + def load_check(self, hosts): + """ + Obtain and print load average statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + load1 = {} + load5 = {} + load15 = {} + recon = Scout("load", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking load averages" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + load1[url] = response['1m'] + load5[url] = response['5m'] + load15[url] = response['15m'] + stats = {"1m": load1, "5m": load5, "15m": load15} + for item in stats: + if len(stats[item]) > 0: + computed = self._gen_stats(stats[item].values(), + name='%s_load_avg' % item) + self._print_stats(computed) + else: + print("[%s_load_avg] - No hosts returned valid data." % item) + print("=" * 79) + + def quarantine_check(self, hosts): + """ + Obtain and print quarantine statistics + + :param hosts: set of hosts to check. 
in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + objq = {} + conq = {} + acctq = {} + stats = {} + recon = Scout("quarantined", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking quarantine" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + objq[url] = response['objects'] + conq[url] = response['containers'] + acctq[url] = response['accounts'] + for key in response.get('policies', {}): + pkey = "objects_%s" % key + stats.setdefault(pkey, {}) + stats[pkey][url] = response['policies'][key]['objects'] + stats.update({"objects": objq, "containers": conq, "accounts": acctq}) + for item in stats: + if len(stats[item]) > 0: + computed = self._gen_stats(stats[item].values(), + name='quarantined_%s' % item) + self._print_stats(computed) + else: + print("No hosts returned valid data.") + print("=" * 79) + + def socket_usage(self, hosts): + """ + Obtain and print /proc/net/sockstat statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + inuse4 = {} + mem = {} + inuse6 = {} + timewait = {} + orphan = {} + recon = Scout("sockstat", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking socket usage" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + inuse4[url] = response['tcp_in_use'] + mem[url] = response['tcp_mem_allocated_bytes'] + inuse6[url] = response.get('tcp6_in_use', 0) + timewait[url] = response['time_wait'] + orphan[url] = response['orphan'] + stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, + "tcp6_in_use": inuse6, "time_wait": timewait, + "orphan": orphan} + for item in stats: + if len(stats[item]) > 0: + computed = self._gen_stats(stats[item].values(), item) + self._print_stats(computed) + else: + print("No hosts returned valid data.") + print("=" * 79) + + def disk_usage(self, hosts, top=0, lowest=0, human_readable=False): + """ + Obtain and print disk usage statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + stats = {} + highs = [] + lows = [] + raw_total_used = [] + raw_total_avail = [] + percents = {} + top_percents = [(None, 0)] * top + low_percents = [(None, 100)] * lowest + recon = Scout("diskusage", self.verbose, self.suppress_errors, + self.timeout) + # We want to only query each host once, but we don't care + # which of the available ports we use. So we filter hosts by + # constructing a host->port dictionary, since the dict + # constructor ensures each key is unique, thus each host + # appears only once in filtered_hosts. 
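+ # Illustrative sketch (not part of the original change): for hosts
+ # {('10.0.0.1', 6200), ('10.0.0.1', 6210), ('10.0.0.2', 6200)},
+ # dict(hosts) keeps a single port per IP, so filtered_hosts contains
+ # exactly one ('10.0.0.1', <port>) tuple and one for 10.0.0.2.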
+ filtered_hosts = set(dict(hosts).items()) + print("[%s] Checking disk usage now" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, filtered_hosts): + if status == 200: + hostusage = [] + for entry in response: + if not isinstance(entry['mounted'], bool): + print("-> %s/%s: Error: %s" % (url, entry['device'], + entry['mounted'])) + elif entry['mounted']: + used = float(entry['used']) / float(entry['size']) \ + * 100.0 + raw_total_used.append(entry['used']) + raw_total_avail.append(entry['avail']) + hostusage.append(round(used, 2)) + for ident, oused in top_percents: + if oused < used: + top_percents.append( + (url + ' ' + entry['device'], used)) + top_percents.sort(key=lambda x: -x[1]) + top_percents.pop() + break + for ident, oused in low_percents: + if oused > used: + low_percents.append( + (url + ' ' + entry['device'], used)) + low_percents.sort(key=lambda x: x[1]) + low_percents.pop() + break + stats[url] = hostusage + + for url in stats: + if len(stats[url]) > 0: + # get per host hi/los for another day + low = min(stats[url]) + high = max(stats[url]) + highs.append(high) + lows.append(low) + for percent in stats[url]: + percents[int(percent)] = percents.get(int(percent), 0) + 1 + else: + print("-> %s: Error. No drive info available." % url) + + if len(lows) > 0: + low = min(lows) + high = max(highs) + # dist graph shamelessly stolen from https://github.com/gholt/tcod + print("Distribution Graph:") + mul = 69.0 / max(percents.values()) + for percent in sorted(percents): + print('% 3d%%%5d %s' % (percent, percents[percent], + '*' * int(percents[percent] * mul))) + raw_used = sum(raw_total_used) + raw_avail = sum(raw_total_avail) + raw_total = raw_used + raw_avail + avg_used = 100.0 * raw_used / raw_total + if human_readable: + raw_used = size_suffix(raw_used) + raw_avail = size_suffix(raw_avail) + raw_total = size_suffix(raw_total) + print("Disk usage: space used: %s of %s" % (raw_used, raw_total)) + print("Disk usage: space free: %s of %s" % (raw_avail, raw_total)) + print("Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % + (low, high, avg_used)) + else: + print("No hosts returned valid data.") + print("=" * 79) + if top_percents: + print('TOP %s' % top) + for ident, used in top_percents: + if ident: + url, device = ident.split() + host = urlparse(url).netloc.split(':')[0] + print('%.02f%% %s' % (used, '%-15s %s' % (host, device))) + if low_percents: + print('LOWEST %s' % lowest) + for ident, used in low_percents: + if ident: + url, device = ident.split() + host = urlparse(url).netloc.split(':')[0] + print('%.02f%% %s' % (used, '%-15s %s' % (host, device))) + + def time_check(self, hosts, jitter=0.0): + """ + Check a time synchronization of hosts with current time + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + :param jitter: Maximal allowed time jitter + """ + + jitter = abs(jitter) + matches = 0 + errors = 0 + recon = Scout("time", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking time-sync" % self._ptime()) + for url, ts_remote, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status != 200: + errors = errors + 1 + continue + if (ts_remote + jitter < ts_start or ts_remote - jitter > ts_end): + diff = abs(ts_end - ts_remote) + ts_end_f = self._ptime(ts_end) + ts_remote_f = self._ptime(ts_remote) + + print("!! 
%s current time is %s, but remote is %s, " + "differs by %.4f sec" % ( + url, + ts_end_f, + ts_remote_f, + diff)) + continue + matches += 1 + if self.verbose: + print("-> %s matches." % url) + print("%s/%s hosts matched, %s error[s] while checking hosts." % ( + matches, len(hosts), errors)) + print("=" * 79) + + def version_check(self, hosts): + """ + Check OS Swift version of hosts. Inform if differs. + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + versions = set() + errors = 0 + print("[%s] Checking versions" % self._ptime()) + recon = Scout("version", self.verbose, self.suppress_errors, + self.timeout) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status != 200: + errors = errors + 1 + continue + versions.add(response['version']) + if self.verbose: + print("-> %s installed version %s" % ( + url, response['version'])) + + if not len(versions): + print("No hosts returned valid data.") + elif len(versions) == 1: + print("Versions matched (%s), " + "%s error[s] while checking hosts." % ( + versions.pop(), errors)) + else: + print("Versions not matched (%s), " + "%s error[s] while checking hosts." % ( + ", ".join(sorted(versions)), errors)) + + print("=" * 79) + + def _get_ring_names(self, policy=None): + """ + Retrieve name of ring files. + + If no policy is passed and the server type is object, + the ring names of all storage-policies are retrieved. + + :param policy: name or index of storage policy, only applicable + with server_type==object. + :returns: list of ring names. + """ + if self.server_type == 'object': + ring_names = [p.ring_name for p in POLICIES if ( + p.name == policy or not policy or ( + policy.isdigit() and int(policy) == int(p) or + (isinstance(policy, str) + and policy in p.aliases)))] + else: + ring_names = [self.server_type] + + return ring_names + + def main(self): + """ + Retrieve and report cluster info from hosts running recon middleware. + """ + print("=" * 79) + usage = ''' + usage: %prog [ []] + [-v] [--suppress] [-a] [-r] [-u] [-d] [-R] + [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + [--human-readable] + + \taccount|container|object + Defaults to object server. 
+ + ex: %prog container -l --auditor + ''' + args = optparse.OptionParser(usage) + args.add_option('--verbose', '-v', action="store_true", + help="Print verbose info") + args.add_option('--suppress', action="store_true", + help="Suppress most connection related errors") + args.add_option('--async', '-a', + action="store_true", dest="async_check", + help="Get async stats") + args.add_option('--replication', '-r', action="store_true", + help="Get replication stats") + args.add_option('--reconstruction', '-R', action="store_true", + help="Get reconstruction stats") + args.add_option('--auditor', action="store_true", + help="Get auditor stats") + args.add_option('--updater', action="store_true", + help="Get updater stats") + args.add_option('--expirer', action="store_true", + help="Get expirer stats") + args.add_option('--sharding', action="store_true", + help="Get sharding stats") + args.add_option('--unmounted', '-u', action="store_true", + help="Check cluster for unmounted devices") + args.add_option('--diskusage', '-d', action="store_true", + help="Get disk usage stats") + args.add_option('--human-readable', action="store_true", + help="Use human readable suffix for disk usage stats") + args.add_option('--loadstats', '-l', action="store_true", + help="Get cluster load average stats") + args.add_option('--quarantined', '-q', action="store_true", + help="Get cluster quarantine stats") + args.add_option('--validate-servers', action="store_true", + help="Validate servers on the ring") + args.add_option('--md5', action="store_true", + help="Get md5sum of servers ring and compare to " + "local copy") + args.add_option('--sockstat', action="store_true", + help="Get cluster socket usage stats") + args.add_option('--driveaudit', action="store_true", + help="Get drive audit error stats") + args.add_option('--time', '-T', action="store_true", + help="Check time synchronization") + args.add_option('--jitter', type="float", default=0.0, + help="Maximal allowed time jitter") + args.add_option('--swift-versions', action="store_true", + help="Check swift versions") + args.add_option('--top', type='int', metavar='COUNT', default=0, + help='Also show the top COUNT entries in rank order.') + args.add_option('--lowest', type='int', metavar='COUNT', default=0, + help='Also show the lowest COUNT entries in rank \ + order.') + args.add_option('--all', action="store_true", + help="Perform all checks. 
Equal to \t\t\t-arRudlqT " + "--md5 --sockstat --auditor --updater --expirer " + "--driveaudit --validate-servers --swift-versions") + args.add_option('--region', type="int", + help="Only query servers in specified region") + args.add_option('--zone', '-z', type="int", + help="Only query servers in specified zone") + args.add_option('--timeout', '-t', type="int", metavar="SECONDS", + help="Time to wait for a response from a server", + default=5) + args.add_option('--swiftdir', default="/etc/swift", + help="Default = /etc/swift") + args.add_option('--policy', '-p', + help='Only query object servers in specified ' + 'storage policy (specified as name or index).') + options, arguments = args.parse_args() + + if len(sys.argv) <= 1 or len(arguments) > len(self.check_types): + args.print_help() + sys.exit(0) + + if arguments: + arguments = set(arguments) + if arguments.issubset(self.check_types): + server_types = arguments + else: + print("Invalid Server Type") + args.print_help() + sys.exit(1) + else: # default + server_types = ['object'] + + swift_dir = options.swiftdir + if set_swift_dir(swift_dir): + reload_storage_policies() + + self.verbose = options.verbose + self.suppress_errors = options.suppress + self.timeout = options.timeout + + for server_type in server_types: + self.server_type = server_type + ring_names = self._get_ring_names(options.policy) + if not ring_names: + print('Invalid Storage Policy: %s' % options.policy) + args.print_help() + sys.exit(0) + hosts = self.get_hosts(options.region, options.zone, + swift_dir, ring_names) + print("--> Starting reconnaissance on %s hosts (%s)" % + (len(hosts), self.server_type)) + print("=" * 79) + if options.all: + if self.server_type == 'object': + self.async_check(hosts) + self.object_auditor_check(hosts) + self.updater_check(hosts) + self.expirer_check(hosts) + self.reconstruction_check(hosts) + elif self.server_type == 'container': + self.auditor_check(hosts) + self.updater_check(hosts) + self.sharding_check(hosts) + elif self.server_type == 'account': + self.auditor_check(hosts) + self.replication_check(hosts) + self.umount_check(hosts) + self.load_check(hosts) + self.disk_usage(hosts, options.top, options.lowest, + options.human_readable) + self.get_ringmd5(hosts, swift_dir) + self.get_swiftconfmd5(hosts) + self.quarantine_check(hosts) + self.socket_usage(hosts) + self.server_type_check(hosts) + self.driveaudit_check(hosts) + self.time_check(hosts, options.jitter) + self.version_check(hosts) + else: + if options.async_check: + if self.server_type == 'object': + self.async_check(hosts) + else: + print("Error: Can't check asyncs on non object " + "servers.") + print("=" * 79) + if options.unmounted: + self.umount_check(hosts) + if options.replication: + self.replication_check(hosts) + if options.auditor: + if self.server_type == 'object': + self.object_auditor_check(hosts) + else: + self.auditor_check(hosts) + if options.updater: + if self.server_type == 'account': + print("Error: Can't check updaters on account " + "servers.") + print("=" * 79) + else: + self.updater_check(hosts) + if options.expirer: + if self.server_type == 'object': + self.expirer_check(hosts) + else: + print("Error: Can't check expirer on non object " + "servers.") + print("=" * 79) + if options.sharding: + if self.server_type == 'container': + self.sharding_check(hosts) + else: + print("Error: Can't check sharding on non container " + "servers.") + print("=" * 79) + if options.reconstruction: + if self.server_type == 'object': + self.reconstruction_check(hosts) + 
else: + print("Error: Can't check reconstruction stats on " + "non object servers.") + print("=" * 79) + if options.validate_servers: + self.server_type_check(hosts) + if options.loadstats: + self.load_check(hosts) + if options.diskusage: + self.disk_usage(hosts, options.top, options.lowest, + options.human_readable) + if options.md5: + self.get_ringmd5(hosts, swift_dir) + self.get_swiftconfmd5(hosts) + if options.quarantined: + self.quarantine_check(hosts) + if options.sockstat: + self.socket_usage(hosts) + if options.driveaudit: + self.driveaudit_check(hosts) + if options.time: + self.time_check(hosts, options.jitter) + if options.swift_versions: + self.version_check(hosts) + + +def main(): + try: + reconnoiter = SwiftRecon() + reconnoiter.main() + except KeyboardInterrupt: + print('\n') diff --git a/swift/cli/recon_cron.py b/swift/cli/recon_cron.py new file mode 100644 index 0000000000..bd1dd22052 --- /dev/null +++ b/swift/cli/recon_cron.py @@ -0,0 +1,73 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time + +from eventlet import Timeout + +from swift.common.utils import get_logger, dump_recon_cache, readconf, \ + lock_path, listdir +from swift.common.recon import RECON_OBJECT_FILE, DEFAULT_RECON_CACHE_PATH +from swift.obj.diskfile import ASYNCDIR_BASE + + +def get_async_count(device_dir): + async_count = 0 + for i in listdir(device_dir): + device = os.path.join(device_dir, i) + if not os.path.isdir(device): + continue + for asyncdir in listdir(device): + # skip stuff like "accounts", "containers", etc. 
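+ # Illustrative sketch (not part of the original change): with
+ # ASYNCDIR_BASE == 'async_pending', this keeps 'async_pending' and
+ # per-policy variants such as 'async_pending-1', and skips
+ # 'objects', 'accounts', 'containers', 'tmp' and the like.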
+ if not (asyncdir == ASYNCDIR_BASE or + asyncdir.startswith(ASYNCDIR_BASE + '-')): + continue + async_pending = os.path.join(device, asyncdir) + + if os.path.isdir(async_pending): + for entry in listdir(async_pending): + if os.path.isdir(os.path.join(async_pending, entry)): + async_hdir = os.path.join(async_pending, entry) + async_count += len(listdir(async_hdir)) + return async_count + + +def main(): + try: + conf_path = sys.argv[1] + except Exception: + print("Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1]) + print("ex: swift-recon-cron /etc/swift/object-server.conf") + return 1 + conf = readconf(conf_path, 'filter:recon') + device_dir = conf.get('devices', '/srv/node') + recon_cache_path = conf.get('recon_cache_path', DEFAULT_RECON_CACHE_PATH) + recon_lock_path = conf.get('recon_lock_path', '/var/lock') + cache_file = os.path.join(recon_cache_path, RECON_OBJECT_FILE) + lock_dir = os.path.join(recon_lock_path, "swift-recon-object-cron") + conf['log_name'] = conf.get('log_name', 'recon-cron') + logger = get_logger(conf, log_route='recon-cron') + try: + with lock_path(lock_dir): + asyncs = get_async_count(device_dir) + dump_recon_cache({ + 'async_pending': asyncs, + 'async_pending_last': time.time(), + }, cache_file, logger) + except (Exception, Timeout) as err: + msg = 'Exception during recon-cron while accessing devices' + logger.exception(msg) + print('%s: %s' % (msg, err)) + return 1 diff --git a/swift/cli/reconciler_enqueue.py b/swift/cli/reconciler_enqueue.py new file mode 100644 index 0000000000..367b0b5e08 --- /dev/null +++ b/swift/cli/reconciler_enqueue.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from optparse import OptionParser + +import eventlet.debug + +from swift.common.ring import Ring +from swift.common.utils import split_path +from swift.common.storage_policy import POLICIES + +from swift.container.reconciler import add_to_reconciler_queue +""" +This tool is primarily for debugging and development but can be used an example +of how an operator could enqueue objects manually if a problem is discovered - +might be particularly useful if you need to hack a fix into the reconciler +and re-run it. +""" + +USAGE = """ +%prog [options] + +This script enqueues an object to be evaluated by the reconciler. + +Arguments: +policy_index: the policy the object is currently stored in. + /a/c/o: the full path of the object - utf-8 + timestamp: the timestamp of the datafile/tombstone. 
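+
+Illustrative example (a sketch, not part of the original change; it assumes
+the swift-reconciler-enqueue console script wraps this module):
+
+  swift-reconciler-enqueue 1 /AUTH_test/misplaced/obj 1618998528.12345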
+ +""".strip() + +parser = OptionParser(USAGE) +parser.add_option('-X', '--op', default='PUT', choices=('PUT', 'DELETE'), + help='the method of the misplaced operation') +parser.add_option('-f', '--force', action='store_true', + help='force an object to be re-enqueued') + + +def main(): + eventlet.debug.hub_exceptions(True) + options, args = parser.parse_args() + try: + policy_index, path, timestamp = args + except ValueError: + sys.exit(parser.print_help()) + container_ring = Ring('/etc/swift/container.ring.gz') + policy = POLICIES.get_by_index(policy_index) + if not policy: + return 'ERROR: invalid storage policy index: %s' % policy + try: + account, container, obj = split_path(path, 3, 3, True) + except ValueError as e: + return 'ERROR: %s' % e + container_name = add_to_reconciler_queue( + container_ring, account, container, obj, + policy.idx, timestamp, options.op, force=options.force) + if not container_name: + return 'ERROR: unable to enqueue!' + print(container_name) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/swift/cli/relinker.py b/swift/cli/relinker.py new file mode 100644 index 0000000000..38e82fa6cf --- /dev/null +++ b/swift/cli/relinker.py @@ -0,0 +1,953 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse +import datetime +import errno +import fcntl +import json +import logging +import os +import time +from collections import defaultdict + +from eventlet import hubs + +from swift.common.exceptions import LockTimeout +from swift.common.storage_policy import POLICIES +from swift.common.utils import replace_partition_in_path, config_true_value, \ + audit_location_generator, get_logger, readconf, drop_privileges, \ + RateLimitedIterator, distribute_evenly, get_prefixed_logger, \ + non_negative_float, non_negative_int, config_auto_int_value, \ + dump_recon_cache, get_partition_from_path, get_hub +from swift.common.utils.logs import SwiftLogAdapter +from swift.obj import diskfile +from swift.common.recon import RECON_RELINKER_FILE, DEFAULT_RECON_CACHE_PATH + + +LOCK_FILE = '.relink.{datadir}.lock' +STATE_FILE = 'relink.{datadir}.json' +STATE_TMP_FILE = '.relink.{datadir}.json.tmp' +STEP_RELINK = 'relink' +STEP_CLEANUP = 'cleanup' +EXIT_SUCCESS = 0 +EXIT_NO_APPLICABLE_POLICY = 2 +EXIT_ERROR = 1 +DEFAULT_STATS_INTERVAL = 300.0 + + +def recursive_defaultdict(): + return defaultdict(recursive_defaultdict) + + +def policy(policy_name_or_index): + value = POLICIES.get_by_name_or_index(policy_name_or_index) + if value is None: + raise ValueError + return value + + +def _aggregate_stats(base_stats, update_stats): + for key, value in update_stats.items(): + base_stats.setdefault(key, 0) + base_stats[key] += value + + return base_stats + + +def _aggregate_recon_stats(base_stats, updated_stats): + for k, v in updated_stats.items(): + if k == 'stats': + base_stats['stats'] = _aggregate_stats(base_stats['stats'], v) + elif k == "start_time": + base_stats[k] = min(base_stats.get(k, v), v) + elif k in ("timestamp", "total_time"): + base_stats[k] = max(base_stats.get(k, 0), v) + elif k in ('parts_done', 'total_parts'): + base_stats[k] += v + + return base_stats + + +def _zero_stats(): + return { + 'hash_dirs': 0, + 'files': 0, + 'linked': 0, + 'removed': 0, + 'errors': 0} + + +def _zero_collated_stats(): + return { + 'parts_done': 0, + 'total_parts': 0, + 'total_time': 0, + 'stats': _zero_stats()} + + +class Relinker(object): + def __init__(self, conf, logger, device_list=None, do_cleanup=False): + self.conf = conf + self.recon_cache = os.path.join(self.conf['recon_cache_path'], + RECON_RELINKER_FILE) + self.logger = logger + self.device_list = device_list or [] + self.do_cleanup = do_cleanup + self.root = self.conf['devices'] + if len(self.device_list) == 1: + self.root = os.path.join(self.root, list(self.device_list)[0]) + self.part_power = self.next_part_power = None + self.diskfile_mgr = None + self.dev_lock = None + self._last_recon_update = time.time() + self.stats_interval = float(conf.get( + 'stats_interval', DEFAULT_STATS_INTERVAL)) + self.diskfile_router = diskfile.DiskFileRouter(self.conf, self.logger) + self.stats = _zero_stats() + self.devices_data = recursive_defaultdict() + self.policy_count = 0 + self.pid = os.getpid() + self.linked_into_partitions = set() + + def _aggregate_dev_policy_stats(self): + for dev_data in self.devices_data.values(): + dev_data.update(_zero_collated_stats()) + for policy_data in dev_data.get('policies', {}).values(): + _aggregate_recon_stats(dev_data, policy_data) + + def _update_recon(self, device=None, force_dump=False): + if not force_dump and self._last_recon_update + self.stats_interval \ + > time.time(): + # not time yet! 
+ return + if device: + # dump recon stats for the device + num_parts_done = sum( + 1 for part_done in self.states["state"].values() + if part_done) + num_total_parts = len(self.states["state"]) + step = STEP_CLEANUP if self.do_cleanup else STEP_RELINK + policy_dev_progress = {'step': step, + 'parts_done': num_parts_done, + 'total_parts': num_total_parts, + 'timestamp': time.time()} + self.devices_data[device]['policies'][self.policy.idx].update( + policy_dev_progress) + + # aggregate device policy level values into device level + self._aggregate_dev_policy_stats() + + # We want to periodically update the worker recon timestamp so we know + # it's still running + recon_data = self._update_worker_stats(recon_dump=False) + + recon_data.update({'devices': self.devices_data}) + if device: + self.logger.debug("Updating recon for %s", device) + else: + self.logger.debug("Updating recon") + self._last_recon_update = time.time() + dump_recon_cache(recon_data, self.recon_cache, self.logger) + + @property + def total_errors(self): + # first make sure the policy data is aggregated down to the device + # level + self._aggregate_dev_policy_stats() + return sum([sum([ + dev.get('stats', {}).get('errors', 0), + dev.get('stats', {}).get('unmounted', 0), + dev.get('stats', {}).get('unlistable_partitions', 0)]) + for dev in self.devices_data.values()]) + + def devices_filter(self, _, devices): + if self.device_list: + devices = [d for d in devices if d in self.device_list] + + return set(devices) + + def hook_pre_device(self, device_path): + lock_file = os.path.join(device_path, + LOCK_FILE.format(datadir=self.datadir)) + + fd = os.open(lock_file, os.O_CREAT | os.O_WRONLY) + fcntl.flock(fd, fcntl.LOCK_EX) + self.dev_lock = fd + + state_file = os.path.join(device_path, + STATE_FILE.format(datadir=self.datadir)) + self.states["state"].clear() + try: + with open(state_file, 'rt') as f: + state_from_disk = json.load(f) + if state_from_disk["next_part_power"] != \ + self.states["next_part_power"]: + raise ValueError + on_disk_part_power = state_from_disk["part_power"] + if on_disk_part_power != self.states["part_power"]: + self.states["prev_part_power"] = on_disk_part_power + raise ValueError + self.states["state"].update(state_from_disk["state"]) + except (ValueError, TypeError, KeyError): + # Bad state file: remove the file to restart from scratch + os.unlink(state_file) + except IOError as err: + # Ignore file not found error + if err.errno != errno.ENOENT: + raise + + # initialise the device in recon. 
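+ # Illustrative sketch (not part of the original change): after this block,
+ # devices_data looks roughly like
+ #   {'sdb1': {'policies': {0: {'start_time': <ts>, 'stats': {...},
+ #                              'part_power': 10, 'next_part_power': 11}}}}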
+ device = os.path.basename(device_path) + self.devices_data[device]['policies'][self.policy.idx] = { + 'start_time': time.time(), 'stats': _zero_stats(), + 'part_power': self.states["part_power"], + 'next_part_power': self.states["next_part_power"]} + self.stats = \ + self.devices_data[device]['policies'][self.policy.idx]['stats'] + self._update_recon(device) + + def hook_post_device(self, device_path): + os.close(self.dev_lock) + self.dev_lock = None + device = os.path.basename(device_path) + pol_stats = self.devices_data[device]['policies'][self.policy.idx] + total_time = time.time() - pol_stats['start_time'] + pol_stats.update({'total_time': total_time, 'stats': self.stats}) + self._update_recon(device, force_dump=True) + + def partitions_filter(self, datadir_path, partitions): + # Remove all non partitions first (eg: auditor_status_ALL.json) + partitions = [p for p in partitions if p.isdigit()] + + relinking = (self.part_power != self.next_part_power) + if relinking: + # All partitions in the upper half are new partitions and there is + # nothing to relink there + partitions = [part for part in partitions + if int(part) < 2 ** self.part_power] + elif "prev_part_power" in self.states: + # All partitions in the upper half are new partitions and there is + # nothing to clean up there + partitions = [part for part in partitions + if int(part) < 2 ** self.states["prev_part_power"]] + + # Format: { 'part': processed } + if self.states["state"]: + missing = list(set(partitions) - set(self.states["state"].keys())) + if missing: + # All missing partitions were created after the first run of + # the relinker with this part_power/next_part_power pair. This + # is expected when relinking, where new partitions appear that + # are appropriate for the target part power. In such cases, + # there's nothing to be done. Err on the side of caution + # during cleanup, however. + for part in missing: + self.states["state"][part] = relinking + partitions = [ + str(part) for part, processed in self.states["state"].items() + if not processed] + else: + self.states["state"].update({ + str(part): False for part in partitions}) + + # Always scan the partitions in reverse order to minimize the amount + # of IO (it actually only matters for relink, not for cleanup). + # + # Initial situation: + # objects/0/000/00000000...00000000/12345.data + # -> relinked to objects/1/000/10000000...00000000/12345.data + # + # If the relinker then scan partition 1, it will listdir that object + # while it's unnecessary. By working in reverse order of partitions, + # this is avoided. + partitions = sorted(partitions, key=int, reverse=True) + + # do this last so that self.states, and thus the state file, has been + # initiated with *all* partitions before partitions are restricted for + # this particular run... 
+ conf_partitions = self.conf.get('partitions') + if conf_partitions: + partitions = [p for p in partitions if int(p) in conf_partitions] + + return partitions + + def hook_pre_partition(self, partition_path): + self.pre_partition_errors = self.total_errors + self.linked_into_partitions = set() + + def hook_post_partition(self, partition_path): + datadir_path, partition = os.path.split( + os.path.abspath(partition_path)) + device_path, datadir_name = os.path.split(datadir_path) + device = os.path.basename(device_path) + state_tmp_file = os.path.join( + device_path, STATE_TMP_FILE.format(datadir=datadir_name)) + state_file = os.path.join( + device_path, STATE_FILE.format(datadir=datadir_name)) + + # We started with a partition space like + # |0 N| + # |ABCDEFGHIJKLMNOP| + # + # After relinking, it will be more like + # |0 2N| + # |AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPP| + # + # We want to hold off on rehashing until after cleanup, since that is + # the point at which we've finished with filesystem manipulations. But + # there's a slight complication: we know the upper half has nothing to + # clean up, so the cleanup phase only looks at + # |0 2N| + # |AABBCCDDEEFFGGHH | + # + # To ensure that the upper half gets rehashed, too, do it as part of + # relinking; as we finish + # |0 N| + # | IJKLMNOP| + # shift to the new partition space and rehash + # |0 2N| + # | IIJJKKLLMMNNOOPP| + for dirty_partition in self.linked_into_partitions: + if self.do_cleanup or \ + dirty_partition >= 2 ** self.states['part_power']: + self.diskfile_mgr.get_hashes( + device, dirty_partition, [], self.policy) + + if self.do_cleanup: + try: + hashes = self.diskfile_mgr.get_hashes( + device, int(partition), [], self.policy) + except LockTimeout: + hashes = 1 # truthy, but invalid + # In any reasonably-large cluster, we'd expect all old + # partitions P to be empty after cleanup (i.e., it's unlikely + # that there's another partition Q := P//2 that also has data + # on this device). + # + # Try to clean up empty partitions now, so operators can use + # existing rebalance-complete metrics to monitor relinking + # progress (provided there are few/no handoffs when relinking + # starts and little data is written to handoffs during the + # increase). + if not hashes: + try: + with self.diskfile_mgr.replication_lock( + device, self.policy, partition), \ + self.diskfile_mgr.partition_lock( + device, self.policy, partition): + # Order here is somewhat important for crash-tolerance + for f in ('hashes.pkl', 'hashes.invalid', '.lock', + '.lock-replication'): + try: + os.unlink(os.path.join(partition_path, f)) + except OSError as e: + if e.errno != errno.ENOENT: + raise + # Note that as soon as we've deleted the lock files, some + # other process could come along and make new ones -- so + # this may well complain that the directory is not empty + os.rmdir(partition_path) + except (OSError, LockTimeout): + # Most likely, some data landed in here or we hit an error + # above. Let the replicator deal with things; it was worth + # a shot. + pass + + # If there were no errors, mark this partition as done. This is handy + # in case the process is interrupted and needs to resume, or there + # were errors and the relinker needs to run again. 
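        # State is persisted atomically: dump JSON to a temp file, fsync,
        # then rename over the previous state file so an interrupted run
        # never reads a half-written state. Illustrative contents (for a
        # part power 10 -> 11 increase):
        #   {"part_power": 10, "next_part_power": 11,
        #    "state": {"1023": true, "1022": false}}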
+ if self.pre_partition_errors == self.total_errors: + self.states["state"][partition] = True + with open(state_tmp_file, 'wt') as f: + json.dump(self.states, f) + os.fsync(f.fileno()) + os.rename(state_tmp_file, state_file) + num_parts_done = sum( + 1 for part in self.states["state"].values() + if part) + step = STEP_CLEANUP if self.do_cleanup else STEP_RELINK + num_total_parts = len(self.states["state"]) + self.logger.info( + "Step: %s Device: %s Policy: %s Partitions: %d/%d", + step, device, self.policy.name, num_parts_done, num_total_parts) + self._update_recon(device) + + def hashes_filter(self, suff_path, hashes): + hashes = list(hashes) + for hsh in hashes: + fname = os.path.join(suff_path, hsh) + if fname == replace_partition_in_path( + self.conf['devices'], fname, self.next_part_power): + hashes.remove(hsh) + return hashes + + def do_relink(self, device, hash_path, new_hash_path, filename, + already_quarantined=False): + """ + Attempt to relink a file from old location to new location. + + :param device: device name + :param hash_path: source hash directory path + :param new_hash_path: destination hash directory path + :param filename: filename to relink + :param already_quarantined: whether quarantine has already been + attempted + :returns: tuple of (success, created) where success is True if the link + is successfully verified, and created is True if a new link + needed to be created for successful verification (if created + is True for any new link in any hash_path some caller above + us should ideally invalidate the whole suffix) + """ + old_file = os.path.join(hash_path, filename) + new_file = os.path.join(new_hash_path, filename) + success = created = False + try: + created = diskfile.relink_paths(old_file, new_file) + success = True + except FileExistsError: + # we've detected a hardlink collision, so we need to handle it + # depending on what kind of file it is and our mode and + # configuration + if filename.endswith('.ts'): + # special case for duplicate tombstones, see: + # https://bugs.launchpad.net/swift/+bug/1921718 + # https://bugs.launchpad.net/swift/+bug/1934142 + self.logger.debug( + "Relinking%s: tolerating different inodes for " + "tombstone with same timestamp: %s to %s", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file) + success = True + elif self.conf['clobber_hardlink_collisions']: + if self.do_cleanup: + # At this point your clients are already *in* the new part + # dir, if the "better" data was in the old part dir you're + # already hurting and maybe flipped back to retry the + # relink phase again? If you're moving forward with the + # cleanup presumably you're ready for this circus to be + # over and doing extra io to quarantine the data you're + # currently using and replace it with old data seems less + # attractive than letting the un-referenced data get + # cleaned up. But there might be a case to argue that + # clobber_hardlink_collision should quarantine old_file + # here before returning success. + self.logger.debug( + "Relinking%s: tolerating hardlink collision: " + "%s to %s", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file) + success = True + elif already_quarantined: + # Already attempted quarantine, this is a failure, but user + # can retry (or already_quarantined becomes a counter?) + # N.B. 
this can exit non-zero w/o logging at "error" + self.logger.warning( + "Relinking%s: hardlink collision persists after " + "quarantine: %s to %s", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file) + else: + # During relink phase, quarantine and retry once + dev_path = os.path.join(self.diskfile_mgr.devices, device) + to_dir = diskfile.quarantine_renamer(dev_path, new_file) + self.logger.info( + "Relinking%s: clobbering hardlink collision: " + "%s moved to %s", + ' (cleanup)' if self.do_cleanup else '', + new_file, to_dir) + # retry with quarantine flag set + return self.do_relink( + device, hash_path, new_hash_path, filename, + already_quarantined=True) + else: + self.logger.error( + "Error relinking%s: hardlink collision: " + "%s to %s (consider enabling clobber_hardlink_collisions)", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file) + except Exception as exc: + # Depending on what kind of errors these are, it might be + # reasonable to consider them "warnings" if we expect re-running + # the relinker would be able to fix them (like if it's just a + # general file-system corruption error and your auditor is still + # running maybe it will quarantine bad paths to clear the way). + # But AFAIK all currently known/observed error conditions are + # enumerated above and any unknown error conditions may not be + # fixable by simply re-running the relinker: so we log them as + # error to match the expected non-zero return code. + self.logger.error( + "Error relinking%s: failed to relink %s to %s: %s", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file, exc) + if created: + self.logger.debug( + "Relinking%s created link: %s to %s", + ' (cleanup)' if self.do_cleanup else '', + old_file, new_file) + return success, created + + def process_location(self, device, hash_path, new_hash_path): + """ + Handle relink of all files in a hash_dir path. + + Compare the contents of each hash dir with contents of same hash + dir in its new partition to verify that the new location has the + most up to date set of files. The new location may have newer + files if it has been updated since relinked. + + If any new links are created the suffix will be invalidated. + In cleanup mode, the unwanted files in the old hash_path will be + removed as long as there are no errors. + + :param device: device name + :param hash_path: old hash directory path + :param new_hash_path: new hash directory path + """ + self.stats['hash_dirs'] += 1 + + # Get on disk data for new and old locations, cleaning up any + # reclaimable or obsolete files in each. The new location is + # cleaned up *before* the old location to prevent false negatives + # where the old still has a file that has been cleaned up in the + # new; cleaning up the new location first ensures that the old will + # always be 'cleaner' than the new. + new_df_data = self.diskfile_mgr.cleanup_ondisk_files(new_hash_path) + old_df_data = self.diskfile_mgr.cleanup_ondisk_files(hash_path) + # Now determine the most up to date set of on disk files would be + # given the content of old and new locations... 
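        # Of the combined old + new file sets, only names that still exist
        # in the old hash dir need a verified hard link in the new location;
        # anything classified as obsolete is dropped entirely.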
+ new_files = set(new_df_data['files']) + old_files = set(old_df_data['files']) + union_files = new_files.union(old_files) + union_data = self.diskfile_mgr.get_ondisk_files( + union_files, '', verify=False) + obsolete_files = set(info['filename'] + for info in union_data.get('obsolete', [])) + # drop 'obsolete' files but retain 'unexpected' files which might + # be misplaced diskfiles from another policy + required_files = union_files.difference(obsolete_files) + required_links = required_files.intersection(old_files) + + missing_links = 0 + created_links = 0 + unwanted_files = [] + for filename in required_links: + # Before removing old files, be sure that the corresponding + # required new files exist by calling relink_paths again. There + # are several possible outcomes: + # - The common case is that the new file exists, in which case + # relink_paths checks that the new file has the same inode + # as the old file. An exception is raised if the inode of + # the new file is not the same as the old file. + # - The new file may not exist because the relinker failed to + # create the link to the old file and has erroneously moved + # on to cleanup. In this case the relink_paths will create + # the link now or raise an exception if that fails. + # - The new file may not exist because some other process, + # such as an object server handling a request, has cleaned + # it up since we called cleanup_ondisk_files(new_hash_path). + # In this case a new link will be created to the old file. + # This is unnecessary but simpler than repeating the + # evaluation of what links are now required and safer than + # assuming that a non-existent file that *was* required is + # no longer required. The new file will eventually be + # cleaned up again. + self.stats['files'] += 1 + success, created = self.do_relink( + device, hash_path, new_hash_path, filename) + if success: + if created: + created_links += 1 + self.stats['linked'] += 1 + else: + self.stats['errors'] += 1 + missing_links += 1 + if created_links: + self.linked_into_partitions.add(get_partition_from_path( + self.conf['devices'], new_hash_path)) + try: + diskfile.invalidate_hash(os.path.dirname(new_hash_path)) + except (Exception, LockTimeout) as exc: + # at this point, the link's created. even if we counted it as + # an error, a subsequent run wouldn't find any work to do. so, + # don't bother; instead, wait for replication to be re-enabled + # so post-replication rehashing or periodic rehashing can + # eventually pick up the change + self.logger.warning( + 'Error invalidating suffix for %s: %r', + new_hash_path, exc) + + if self.do_cleanup and not missing_links: + # use the sorted list to help unit testing + unwanted_files = old_df_data['files'] + + # the new partition hash dir has the most up to date set of on + # disk files so it is safe to delete the old location... + rehash = False + for filename in unwanted_files: + old_file = os.path.join(hash_path, filename) + try: + os.remove(old_file) + except OSError as exc: + # N.B. 
if we want to allow old_file to get quarantined this + # should probably be robust to ENOENT + self.logger.warning('Error cleaning up %s: %r', old_file, exc) + self.stats['errors'] += 1 + else: + rehash = True + self.stats['removed'] += 1 + self.logger.debug("Removed %s", old_file) + + if rehash: + # Even though we're invalidating the suffix, don't update + # self.linked_into_partitions -- we only care about them for + # relinking into the new part-power space + try: + diskfile.invalidate_hash(os.path.dirname(hash_path)) + except (Exception, LockTimeout) as exc: + # note: not counted as an error + self.logger.warning( + 'Error invalidating suffix for %s: %r', + hash_path, exc) + + def place_policy_stat(self, dev, policy, stat, value): + stats = self.devices_data[dev]['policies'][policy.idx].setdefault( + "stats", _zero_stats()) + stats[stat] = stats.get(stat, 0) + value + + def process_policy(self, policy): + self.logger.info( + 'Processing files for policy %s under %s (cleanup=%s)', + policy.name, self.root, self.do_cleanup) + self.part_power = policy.object_ring.part_power + self.next_part_power = policy.object_ring.next_part_power + self.diskfile_mgr = self.diskfile_router[policy] + self.datadir = diskfile.get_data_dir(policy) + self.states = { + "part_power": self.part_power, + "next_part_power": self.next_part_power, + "state": {}, + } + audit_stats = {} + + locations = audit_location_generator( + self.conf['devices'], + self.datadir, + mount_check=self.conf['mount_check'], + devices_filter=self.devices_filter, + hook_pre_device=self.hook_pre_device, + hook_post_device=self.hook_post_device, + partitions_filter=self.partitions_filter, + hook_pre_partition=self.hook_pre_partition, + hook_post_partition=self.hook_post_partition, + hashes_filter=self.hashes_filter, + logger=self.logger, + error_counter=audit_stats, + yield_hash_dirs=True + ) + if self.conf['files_per_second'] > 0: + locations = RateLimitedIterator( + locations, self.conf['files_per_second']) + for hash_path, device, _part_num in locations: + # note, in cleanup step next_part_power == part_power + new_hash_path = replace_partition_in_path( + self.conf['devices'], hash_path, self.next_part_power) + if new_hash_path == hash_path: + continue + self.process_location(device, hash_path, new_hash_path) + + # any unmounted devices don't trigger the pre_device trigger. + # so we'll deal with them here. + for dev in audit_stats.get('unmounted', []): + self.place_policy_stat(dev, policy, 'unmounted', 1) + + # Further unlistable_partitions doesn't trigger the post_device, so + # we also need to deal with them here. 
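        # Entries are datadir paths beneath the affected device, so pull the
        # device name back out of each path before bumping its counter.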
+ for datadir in audit_stats.get('unlistable_partitions', []): + device_path, _ = os.path.split(datadir) + device = os.path.basename(device_path) + self.place_policy_stat(device, policy, 'unlistable_partitions', 1) + + def _update_worker_stats(self, recon_dump=True, return_code=None): + worker_stats = {'devices': self.device_list, + 'timestamp': time.time(), + 'return_code': return_code} + worker_data = {"workers": {str(self.pid): worker_stats}} + if recon_dump: + dump_recon_cache(worker_data, self.recon_cache, self.logger) + return worker_data + + def run(self): + num_policies = 0 + self._update_worker_stats() + for policy in self.conf['policies']: + self.policy = policy + policy.object_ring = None # Ensure it will be reloaded + policy.load_ring(self.conf['swift_dir']) + ring = policy.object_ring + if not ring.next_part_power: + continue + part_power_increased = ring.next_part_power == ring.part_power + if self.do_cleanup != part_power_increased: + continue + + num_policies += 1 + self.process_policy(policy) + + # Some stat collation happens during _update_recon and we want to force + # this to happen at the end of the run + self._update_recon(force_dump=True) + if not num_policies: + self.logger.warning( + "No policy found to increase the partition power.") + self._update_worker_stats(return_code=EXIT_NO_APPLICABLE_POLICY) + return EXIT_NO_APPLICABLE_POLICY + + if self.total_errors > 0: + log_method = self.logger.warning + # NB: audit_location_generator logs unmounted disks as warnings, + # but we want to treat them as errors + status = EXIT_ERROR + else: + log_method = self.logger.info + status = EXIT_SUCCESS + + stats = _zero_stats() + for dev_stats in self.devices_data.values(): + stats = _aggregate_stats(stats, dev_stats.get('stats', {})) + hash_dirs = stats.pop('hash_dirs') + files = stats.pop('files') + linked = stats.pop('linked') + removed = stats.pop('removed') + action_errors = stats.pop('errors') + unmounted = stats.pop('unmounted', 0) + if unmounted: + self.logger.warning('%d disks were unmounted', unmounted) + listdir_errors = stats.pop('unlistable_partitions', 0) + if listdir_errors: + self.logger.warning( + 'There were %d errors listing partition directories', + listdir_errors) + if stats: + self.logger.warning( + 'There were unexpected errors while enumerating disk ' + 'files: %r', stats) + + log_method( + '%d hash dirs processed (cleanup=%s) (%d files, %d linked, ' + '%d removed, %d errors)', hash_dirs, self.do_cleanup, files, + linked, removed, action_errors + listdir_errors) + + self._update_worker_stats(return_code=status) + return status + + +def _reset_recon(recon_cache, logger): + device_progress_recon = {'devices': {}, 'workers': {}} + dump_recon_cache(device_progress_recon, recon_cache, logger) + + +def parallel_process(do_cleanup, conf, logger, device_list=None): + """ + Fork Relinker workers based on config and wait for them to finish. 
+ + :param do_cleanup: boolean, if workers should perform cleanup step + :param conf: dict, config options + :param logger: SwiftLogAdapter instance + :kwarg device_list: list of strings, optionally limit to specific devices + + :returns: int, exit code; zero on success + """ + + # initialise recon dump for collection + # Lets start by always deleting last run's stats + recon_cache = os.path.join(conf['recon_cache_path'], RECON_RELINKER_FILE) + _reset_recon(recon_cache, logger) + + device_list = sorted(set(device_list or os.listdir(conf['devices']))) + workers = conf['workers'] + if workers == 'auto': + workers = len(device_list) + else: + workers = min(workers, len(device_list)) + + start = time.time() + logger.info('Starting relinker (cleanup=%s) using %d workers: %s' % + (do_cleanup, workers, + time.strftime('%X %x %Z', time.gmtime(start)))) + if workers == 0 or len(device_list) in (0, 1): + ret = Relinker( + conf, logger, device_list, do_cleanup=do_cleanup).run() + logger.info('Finished relinker (cleanup=%s): %s (%s elapsed)' % + (do_cleanup, time.strftime('%X %x %Z', time.gmtime()), + datetime.timedelta(seconds=time.time() - start))) + return ret + + children = {} + for worker_devs in distribute_evenly(device_list, workers): + pid = os.fork() + if pid == 0: + logger = get_prefixed_logger(logger, '[pid=%s, devs=%s] ' % ( + os.getpid(), ','.join(worker_devs))) + os._exit(Relinker( + conf, logger, worker_devs, do_cleanup=do_cleanup).run()) + else: + children[pid] = worker_devs + + final_status = EXIT_SUCCESS + final_messages = [] + while children: + pid, status = os.wait() + sig = status & 0xff + status = status >> 8 + time_delta = time.time() - start + devs = children.pop(pid, ['unknown device']) + worker_desc = '(pid=%s, devs=%s)' % (pid, ','.join(devs)) + if sig != 0: + final_status = EXIT_ERROR + final_messages.append( + 'Worker %s exited in %.1fs after receiving signal: %s' + % (worker_desc, time_delta, sig)) + continue + + if status == EXIT_SUCCESS: + continue + + if status == EXIT_NO_APPLICABLE_POLICY: + if final_status == EXIT_SUCCESS: + final_status = status + continue + + final_status = EXIT_ERROR + if status == EXIT_ERROR: + final_messages.append( + 'Worker %s completed in %.1fs with errors' + % (worker_desc, time_delta)) + else: + final_messages.append( + 'Worker %s exited in %.1fs with unexpected status %s' + % (worker_desc, time_delta, status)) + + for msg in final_messages: + logger.warning(msg) + logger.info('Finished relinker (cleanup=%s): %s (%s elapsed)' % + (do_cleanup, time.strftime('%X %x %Z', time.gmtime()), + datetime.timedelta(seconds=time.time() - start))) + return final_status + + +def auto_or_int(value): + return config_auto_int_value(value, default='auto') + + +def main(args=None): + parser = argparse.ArgumentParser( + description='Relink and cleanup objects to increase partition power') + parser.add_argument('action', choices=['relink', 'cleanup']) + parser.add_argument('conf_file', nargs='?', help=( + 'Path to config file with [object-relinker] section')) + parser.add_argument('--swift-dir', default=None, + dest='swift_dir', help='Path to swift directory') + parser.add_argument( + '--policy', default=[], dest='policies', + action='append', type=policy, + help='Policy to relink; may specify multiple (default: all)') + parser.add_argument('--devices', default=None, + dest='devices', help='Path to swift device directory') + parser.add_argument('--user', default=None, dest='user', + help='Drop privileges to this user before relinking') + 
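    # The filters below narrow a run to particular devices and/or
    # partitions; with the defaults, every local device and partition
    # is processed.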
parser.add_argument('--device', + default=[], dest='device_list', action='append', + help='Device name to relink (default: all)') + parser.add_argument('--partition', '-p', default=[], dest='partitions', + type=non_negative_int, action='append', + help='Partition to relink (default: all)') + parser.add_argument('--skip-mount-check', default=False, + help='Don\'t test if disk is mounted', + action="store_true", dest='skip_mount_check') + parser.add_argument('--files-per-second', default=None, + type=non_negative_float, dest='files_per_second', + help='Used to limit I/O. Zero implies no limit ' + '(default: no limit).') + parser.add_argument('--stats-interval', default=None, + type=non_negative_float, dest='stats_interval', + help='Emit stats to recon roughly every N seconds. ' + '(default: %d).' % DEFAULT_STATS_INTERVAL) + parser.add_argument( + '--workers', default=None, type=auto_or_int, help=( + 'Process devices across N workers ' + '(default: one worker per device)')) + parser.add_argument('--logfile', default=None, dest='logfile', + help='Set log file name. Ignored if using conf_file.') + parser.add_argument('--debug', default=False, action='store_true', + help='Enable debug mode') + parser.add_argument('--clobber-hardlink-collisions', action='store_true', + help='Tolerate hard link collisions when relinking' + 'object files. If the action is relink then the ' + 'file in the new target part dir is quarantined ' + 'and the relink is retried. If the action is ' + 'cleanup then the file in the new target dir is ' + 'retained and the file in the old target dir is ' + 'removed. (default: false)') + + args = parser.parse_args(args) + hubs.use_hub(get_hub()) + if args.conf_file: + conf = readconf(args.conf_file, 'object-relinker') + if args.debug: + conf['log_level'] = 'DEBUG' + user = args.user or conf.get('user') + if user: + drop_privileges(user) + logger = get_logger(conf) + else: + level = 'DEBUG' if args.debug else 'INFO' + conf = {'log_level': level} + if args.user: + # Drop privs before creating log file + drop_privileges(args.user) + conf['user'] = args.user + logging.basicConfig( + format='%(message)s', + level=getattr(logging, level), + filename=args.logfile) + logger = SwiftLogAdapter(logging.getLogger(), server='relinker') + + conf.update({ + 'swift_dir': args.swift_dir or conf.get('swift_dir', '/etc/swift'), + 'devices': args.devices or conf.get('devices', '/srv/node'), + 'mount_check': (config_true_value(conf.get('mount_check', 'true')) + and not args.skip_mount_check), + 'files_per_second': ( + args.files_per_second if args.files_per_second is not None + else non_negative_float(conf.get('files_per_second', '0'))), + 'policies': set(args.policies) or POLICIES, + 'partitions': set(args.partitions), + 'workers': config_auto_int_value( + conf.get('workers') if args.workers is None else args.workers, + 'auto'), + 'recon_cache_path': conf.get('recon_cache_path', + DEFAULT_RECON_CACHE_PATH), + 'stats_interval': non_negative_float( + args.stats_interval or conf.get('stats_interval', + DEFAULT_STATS_INTERVAL)), + 'clobber_hardlink_collisions': ( + args.clobber_hardlink_collisions or + config_true_value(conf.get('clobber_hardlink_collisions', + 'false'))), + }) + return parallel_process( + args.action == 'cleanup', conf, logger, args.device_list) diff --git a/swift/cli/reload.py b/swift/cli/reload.py new file mode 100755 index 0000000000..9d84e39c71 --- /dev/null +++ b/swift/cli/reload.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 NVIDIA +# +# Licensed under the Apache License, Version 2.0 
(the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Safely reload WSGI servers while minimizing client downtime and errors by + + * validating that the process is a Swift WSGI server manager, + * checking that the configuration file used is valid, + * sending the "seamless reload" signal, and + * waiting for the reload to complete. +""" + +import argparse +import errno +import os +import os.path +import signal +import socket +import subprocess +import sys + +from swift.common.utils import NotificationServer + + +EXIT_BAD_PID = 2 # similar to argparse exiting 2 on an unknown arg +EXIT_RELOAD_FAILED = 1 +EXIT_RELOAD_TIMEOUT = 128 + errno.ETIMEDOUT + + +def validate_manager_pid(pid): + try: + with open('/proc/%d/cmdline' % pid, 'r') as fp: + cmd = fp.read().strip('\x00').split('\x00') + sid = os.getsid(pid) + except (IOError, OSError): + print("Failed to get process information for %s" % pid, + file=sys.stderr) + exit(EXIT_BAD_PID) + + scripts = [os.path.basename(c) for c in cmd + if '/bin/' in c and '/bin/python' not in c] + + if len(scripts) != 1 or not scripts[0].startswith("swift-"): + print("Non-swift process: %r" % ' '.join(cmd), file=sys.stderr) + exit(EXIT_BAD_PID) + + if scripts[0] not in {"swift-proxy-server", "swift-account-server", + "swift-container-server", "swift-object-server"}: + print("Process does not support config checks: %s" % scripts[0], + file=sys.stderr) + exit(EXIT_BAD_PID) + + if sid != pid: + print("Process appears to be a %s worker, not a manager. " + "Did you mean %s?" 
% (scripts[0], sid), file=sys.stderr) + exit(EXIT_BAD_PID) + + return cmd, scripts[0] + + +def main(args=None): + parser = argparse.ArgumentParser(__doc__) + parser.add_argument("pid", type=int, + help="server PID which should be reloaded") + wait_group = parser.add_mutually_exclusive_group() + wait_group.add_argument("-t", "--timeout", type=float, default=300.0, + help="max time to wait for reload to complete") + wait_group.add_argument("-w", "--no-wait", + action="store_false", dest="wait", + help="skip waiting for reload to complete") + parser.add_argument("-v", "--verbose", action="store_true", + help="display more information as the process reloads") + args = parser.parse_args(args) + + cmd, script = validate_manager_pid(args.pid) + + if args.verbose: + print("Checking config for %s" % script) + try: + subprocess.check_call(cmd + ["--test-config"]) + except subprocess.CalledProcessError: + print("Failed to validate config", file=sys.stderr) + exit(EXIT_RELOAD_FAILED) + + if args.wait: + try: + with NotificationServer(args.pid, args.timeout) as notifications: + if args.verbose: + print("Sending USR1 signal") + os.kill(args.pid, signal.SIGUSR1) + + try: + ready = False + while not ready: + data = notifications.receive() + for data in data.split(b"\n"): + if args.verbose: + if data in (b"READY=1", b"RELOADING=1", + b"STOPPING=1"): + print("Process is %s" % + data.decode("ascii")[:-2]) + else: + print("Received notification %r" % data) + + if data == b"READY=1": + ready = True + except socket.timeout: + print("Timed out reloading %s" % script, file=sys.stderr) + exit(EXIT_RELOAD_TIMEOUT) + except OSError as e: + print("Could not bind notification socket: %s" % e, + file=sys.stderr) + exit(EXIT_RELOAD_FAILED) + else: # --no-wait + if args.verbose: + print("Sending USR1 signal") + os.kill(args.pid, signal.SIGUSR1) + + print("Reloaded %s" % script) + + +if __name__ == "__main__": + main() diff --git a/swift/cli/ring_builder_analyzer.py b/swift/cli/ring_builder_analyzer.py new file mode 100644 index 0000000000..c385643389 --- /dev/null +++ b/swift/cli/ring_builder_analyzer.py @@ -0,0 +1,349 @@ +# Copyright (c) 2015 Samuel Merritt +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This is a tool for analyzing how well the ring builder performs its job +in a particular scenario. It is intended to help developers quantify any +improvements or regressions in the ring builder; it is probably not useful +to others. + +The ring builder analyzer takes a scenario file containing some initial +parameters for a ring builder plus a certain number of rounds. In each +round, some modifications are made to the builder, e.g. add a device, remove +a device, change a device's weight. Then, the builder is repeatedly +rebalanced until it settles down. Data about that round is printed, and the +next round begins. + +Scenarios are specified in JSON. 
Example scenario for a gradual device +addition:: + + { + "part_power": 12, + "replicas": 3, + "overload": 0.1, + "random_seed": 203488, + + "rounds": [ + [ + ["add", "r1z2-10.20.30.40:6200/sda", 8000], + ["add", "r1z2-10.20.30.40:6200/sdb", 8000], + ["add", "r1z2-10.20.30.40:6200/sdc", 8000], + ["add", "r1z2-10.20.30.40:6200/sdd", 8000], + + ["add", "r1z2-10.20.30.41:6200/sda", 8000], + ["add", "r1z2-10.20.30.41:6200/sdb", 8000], + ["add", "r1z2-10.20.30.41:6200/sdc", 8000], + ["add", "r1z2-10.20.30.41:6200/sdd", 8000], + + ["add", "r1z2-10.20.30.43:6200/sda", 8000], + ["add", "r1z2-10.20.30.43:6200/sdb", 8000], + ["add", "r1z2-10.20.30.43:6200/sdc", 8000], + ["add", "r1z2-10.20.30.43:6200/sdd", 8000], + + ["add", "r1z2-10.20.30.44:6200/sda", 8000], + ["add", "r1z2-10.20.30.44:6200/sdb", 8000], + ["add", "r1z2-10.20.30.44:6200/sdc", 8000] + ], [ + ["add", "r1z2-10.20.30.44:6200/sdd", 1000] + ], [ + ["set_weight", 15, 2000] + ], [ + ["remove", 3], + ["set_weight", 15, 3000] + ], [ + ["set_weight", 15, 4000] + ], [ + ["set_weight", 15, 5000] + ], [ + ["set_weight", 15, 6000] + ], [ + ["set_weight", 15, 7000] + ], [ + ["set_weight", 15, 8000] + ]] + } + +""" + +import argparse +import json +import sys + +from swift.common.ring import builder +from swift.common.ring.utils import parse_add_value + + +ARG_PARSER = argparse.ArgumentParser( + description='Put the ring builder through its paces') +ARG_PARSER.add_argument( + '--check', '-c', action='store_true', + help="Just check the scenario, don't execute it.") +ARG_PARSER.add_argument( + 'scenario_path', + help="Path to the scenario file") + + +class ParseCommandError(ValueError): + + def __init__(self, name, round_index, command_index, msg): + msg = "Invalid %s (round %s, command %s): %s" % ( + name, round_index, command_index, msg) + super(ParseCommandError, self).__init__(msg) + + +def _parse_weight(round_index, command_index, weight_str): + try: + weight = float(weight_str) + except ValueError as err: + raise ParseCommandError('weight', round_index, command_index, err) + if weight < 0: + raise ParseCommandError('weight', round_index, command_index, + 'cannot be negative') + return weight + + +def _parse_add_command(round_index, command_index, command): + if len(command) != 3: + raise ParseCommandError( + 'add command', round_index, command_index, + 'expected array of length 3, but got %r' % command) + + dev_str = command[1] + weight_str = command[2] + + try: + dev = parse_add_value(dev_str) + except ValueError as err: + raise ParseCommandError('device specifier', round_index, + command_index, err) + + dev['weight'] = _parse_weight(round_index, command_index, weight_str) + + if dev['region'] is None: + dev['region'] = 1 + + default_key_map = { + 'replication_ip': 'ip', + 'replication_port': 'port', + } + for empty_key, default_key in default_key_map.items(): + if dev[empty_key] is None: + dev[empty_key] = dev[default_key] + + return ['add', dev] + + +def _parse_remove_command(round_index, command_index, command): + if len(command) != 2: + raise ParseCommandError('remove commnd', round_index, command_index, + "expected array of length 2, but got %r" % + (command,)) + + dev_str = command[1] + + try: + dev_id = int(dev_str) + except ValueError as err: + raise ParseCommandError('device ID in remove', + round_index, command_index, err) + + return ['remove', dev_id] + + +def _parse_set_weight_command(round_index, command_index, command): + if len(command) != 3: + raise ParseCommandError('remove command', round_index, command_index, + "expected 
array of length 3, but got %r" % + (command,)) + + dev_str = command[1] + weight_str = command[2] + + try: + dev_id = int(dev_str) + except ValueError as err: + raise ParseCommandError('device ID in set_weight', + round_index, command_index, err) + + weight = _parse_weight(round_index, command_index, weight_str) + return ['set_weight', dev_id, weight] + + +def _parse_save_command(round_index, command_index, command): + if len(command) != 2: + raise ParseCommandError( + command, round_index, command_index, + "expected array of length 2 but got %r" % (command,)) + return ['save', command[1]] + + +def parse_scenario(scenario_data): + """ + Takes a serialized scenario and turns it into a data structure suitable + for feeding to run_scenario(). + + :returns: scenario + :raises ValueError: on invalid scenario + """ + + parsed_scenario = {} + + try: + raw_scenario = json.loads(scenario_data) + except ValueError as err: + raise ValueError("Invalid JSON in scenario file: %s" % err) + + if not isinstance(raw_scenario, dict): + raise ValueError("Scenario must be a JSON object, not array or string") + + if 'part_power' not in raw_scenario: + raise ValueError("part_power missing") + try: + parsed_scenario['part_power'] = int(raw_scenario['part_power']) + except ValueError as err: + raise ValueError("part_power not an integer: %s" % err) + if not 1 <= parsed_scenario['part_power'] <= 32: + raise ValueError("part_power must be between 1 and 32, but was %d" + % raw_scenario['part_power']) + + if 'replicas' not in raw_scenario: + raise ValueError("replicas missing") + try: + parsed_scenario['replicas'] = float(raw_scenario['replicas']) + except ValueError as err: + raise ValueError("replicas not a float: %s" % err) + if parsed_scenario['replicas'] < 1: + raise ValueError("replicas must be at least 1, but is %f" + % parsed_scenario['replicas']) + + if 'overload' not in raw_scenario: + raise ValueError("overload missing") + try: + parsed_scenario['overload'] = float(raw_scenario['overload']) + except ValueError as err: + raise ValueError("overload not a float: %s" % err) + if parsed_scenario['overload'] < 0: + raise ValueError("overload must be non-negative, but is %f" + % parsed_scenario['overload']) + + if 'random_seed' not in raw_scenario: + raise ValueError("random_seed missing") + try: + parsed_scenario['random_seed'] = int(raw_scenario['random_seed']) + except ValueError as err: + raise ValueError("replicas not an integer: %s" % err) + + if 'rounds' not in raw_scenario: + raise ValueError("rounds missing") + if not isinstance(raw_scenario['rounds'], list): + raise ValueError("rounds must be an array") + + parser_for_command = { + 'add': _parse_add_command, + 'remove': _parse_remove_command, + 'set_weight': _parse_set_weight_command, + 'save': _parse_save_command, + } + + parsed_scenario['rounds'] = [] + for round_index, raw_round in enumerate(raw_scenario['rounds']): + if not isinstance(raw_round, list): + raise ValueError("round %d not an array" % round_index) + + parsed_round = [] + for command_index, command in enumerate(raw_round): + if command[0] not in parser_for_command: + raise ValueError( + "Unknown command (round %d, command %d): " + "'%s' should be one of %s" % + (round_index, command_index, command[0], + parser_for_command.keys())) + parsed_round.append( + parser_for_command[command[0]]( + round_index, command_index, command)) + parsed_scenario['rounds'].append(parsed_round) + return parsed_scenario + + +def run_scenario(scenario): + """ + Takes a parsed scenario (like from 
parse_scenario()) and runs it. + """ + seed = scenario['random_seed'] + + rb = builder.RingBuilder(scenario['part_power'], scenario['replicas'], 1) + rb.set_overload(scenario['overload']) + + command_map = { + 'add': rb.add_dev, + 'remove': rb.remove_dev, + 'set_weight': rb.set_dev_weight, + 'save': rb.save, + } + + for round_index, commands in enumerate(scenario['rounds']): + print("Round %d" % (round_index + 1)) + + for command in commands: + key = command.pop(0) + try: + command_f = command_map[key] + except KeyError: + raise ValueError("unknown command %r" % key) + command_f(*command) + + rebalance_number = 1 + parts_moved, old_balance, removed_devs = rb.rebalance(seed=seed) + rb.pretend_min_part_hours_passed() + print("\tRebalance 1: moved %d parts, balance is %.6f, %d removed " + "devs" % (parts_moved, old_balance, removed_devs)) + + while True: + rebalance_number += 1 + parts_moved, new_balance, removed_devs = rb.rebalance(seed=seed) + rb.pretend_min_part_hours_passed() + print("\tRebalance %d: moved %d parts, balance is %.6f, " + "%d removed devs" % (rebalance_number, parts_moved, + new_balance, removed_devs)) + if parts_moved == 0 and removed_devs == 0: + break + if abs(new_balance - old_balance) < 1 and not ( + old_balance == builder.MAX_BALANCE and + new_balance == builder.MAX_BALANCE): + break + old_balance = new_balance + + +def main(argv=None): + args = ARG_PARSER.parse_args(argv) + + try: + with open(args.scenario_path) as sfh: + scenario_data = sfh.read() + except OSError as err: + sys.stderr.write("Error opening scenario %s: %s\n" % + (args.scenario_path, err)) + return 1 + + try: + scenario = parse_scenario(scenario_data) + except ValueError as err: + sys.stderr.write("Invalid scenario %s: %s\n" % + (args.scenario_path, err)) + return 1 + + if not args.check: + run_scenario(scenario) + return 0 diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py new file mode 100644 index 0000000000..2f609ad018 --- /dev/null +++ b/swift/cli/ringbuilder.py @@ -0,0 +1,1779 @@ +# Copyright (c) 2010-2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging + +from collections import defaultdict +from errno import EEXIST +from itertools import islice +from operator import itemgetter +from os import mkdir +from os.path import basename, abspath, dirname, exists, join as pathjoin +import sys +from sys import argv as sys_argv, exit, stdout +from textwrap import wrap +from time import time +import traceback +from datetime import timedelta +import optparse +import math + +from swift.common import exceptions +from swift.common.ring import RingBuilder, Ring, RingData +from swift.common.ring.builder import MAX_BALANCE +from swift.common.ring.composite_builder import CompositeRingBuilder +from swift.common.ring.ring import RING_CODECS, DEFAULT_RING_FORMAT_VERSION +from swift.common.ring.utils import validate_args, \ + validate_and_normalize_ip, build_dev_from_opts, \ + parse_builder_ring_filename_args, parse_search_value, \ + parse_search_values_from_opts, parse_change_values_from_opts, \ + dispersion_report, parse_add_value +from swift.common.utils import lock_parent_directory, is_valid_ipv6 + +MAJOR_VERSION = 1 +MINOR_VERSION = 3 +EXIT_SUCCESS = 0 +EXIT_WARNING = 1 +EXIT_ERROR = 2 + +FORMAT_CHOICES = [str(v) for v in RING_CODECS] + +global argv, backup_dir, builder, builder_file, ring_file +argv = backup_dir = builder = builder_file = ring_file = None + + +def format_device(dev): + """ + Format a device for display. + """ + copy_dev = dev.copy() + for key in ('ip', 'replication_ip'): + if ':' in copy_dev[key]: + copy_dev[key] = '[' + copy_dev[key] + ']' + return ('d%(id)sr%(region)sz%(zone)s-%(ip)s:%(port)sR' + '%(replication_ip)s:%(replication_port)s/%(device)s_' + '"%(meta)s"' % copy_dev) + + +def _parse_search_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. + try: + search_values = {} + if len(args) > 0: + if new_cmd_format or len(args) != 1: + print(Commands.search.__doc__.strip()) + exit(EXIT_ERROR) + search_values = parse_search_value(args[0]) + else: + search_values = parse_search_values_from_opts(opts) + return search_values + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _find_parts(devs): + devs = [d['id'] for d in devs] + if not devs or not builder._replica2part2dev: + return None + + partition_count = {} + for replica in builder._replica2part2dev: + for partition, device in enumerate(replica): + if device in devs: + if partition not in partition_count: + partition_count[partition] = 0 + partition_count[partition] += 1 + + # Sort by number of found replicas to keep the output format + sorted_partition_count = sorted( + partition_count.items(), key=itemgetter(1), reverse=True) + + return sorted_partition_count + + +def _parse_list_parts_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. + try: + devs = [] + if len(args) > 0: + if new_cmd_format: + print(Commands.list_parts.__doc__.strip()) + exit(EXIT_ERROR) + + for arg in args: + devs.extend( + builder.search_devs(parse_search_value(arg)) or []) + else: + devs.extend(builder.search_devs( + parse_search_values_from_opts(opts)) or []) + + return devs + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _parse_add_values(argvish): + """ + Parse devices to add as specified on the command line. 
+ + Will exit on error and spew warnings. + + :returns: array of device dicts + """ + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. + parsed_devs = [] + if len(args) > 0: + if new_cmd_format or len(args) % 2 != 0: + print(Commands.add.__doc__.strip()) + exit(EXIT_ERROR) + + devs_and_weights = zip(islice(args, 0, len(args), 2), + islice(args, 1, len(args), 2)) + + for devstr, weightstr in devs_and_weights: + dev_dict = parse_add_value(devstr) + + if dev_dict['region'] is None: + print('WARNING: No region specified for %s. ' + 'Defaulting to region 1.\n' % devstr, file=sys.stderr) + dev_dict['region'] = 1 + + if dev_dict['replication_ip'] is None: + dev_dict['replication_ip'] = dev_dict['ip'] + + if dev_dict['replication_port'] is None: + dev_dict['replication_port'] = dev_dict['port'] + + weight = float(weightstr) + if weight < 0: + raise ValueError('Invalid weight value: %s' % devstr) + dev_dict['weight'] = weight + + parsed_devs.append(dev_dict) + else: + parsed_devs.append(build_dev_from_opts(opts)) + + return parsed_devs + + +def check_devs(devs, input_question, opts, abort_msg): + + if not devs: + print('Search value matched 0 devices.\n' + 'The on-disk ring builder is unchanged.') + exit(EXIT_ERROR) + + if len(devs) > 1: + print('Matched more than one device:') + for dev in devs: + print(' %s' % format_device(dev)) + try: + abort = not opts.yes and input(input_question) != 'y' + except (EOFError, KeyboardInterrupt): + abort = True + if abort: + print(abort_msg) + exit(EXIT_ERROR) + + +def _set_weight_values(devs, weight, opts): + + input_question = 'Are you sure you want to update the weight for these ' \ + '%s devices? (y/N) ' % len(devs) + abort_msg = 'Aborting device modifications' + check_devs(devs, input_question, opts, abort_msg) + + for dev in devs: + builder.set_dev_weight(dev['id'], weight) + print('%s weight set to %s' % (format_device(dev), + dev['weight'])) + + +def _set_region_values(devs, region, opts): + + input_question = 'Are you sure you want to update the region for these ' \ + '%s devices? (y/N) ' % len(devs) + abort_msg = 'Aborting device modifications' + check_devs(devs, input_question, opts, abort_msg) + + for dev in devs: + builder.set_dev_region(dev['id'], region) + print('%s region set to %s' % (format_device(dev), + dev['region'])) + + +def _set_zone_values(devs, zone, opts): + + input_question = 'Are you sure you want to update the zone for these ' \ + '%s devices? (y/N) ' % len(devs) + abort_msg = 'Aborting device modifications' + check_devs(devs, input_question, opts, abort_msg) + + for dev in devs: + builder.set_dev_zone(dev['id'], zone) + print('%s zone set to %s' % (format_device(dev), + dev['zone'])) + + +def _parse_set_weight_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. 
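    # Old-style usage pairs each <search-value> with a new <weight>;
    # new-style passes the search terms as --options plus a single
    # weight argument.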
+ try: + if not new_cmd_format: + if len(args) % 2 != 0: + print(Commands.set_weight.__doc__.strip()) + exit(EXIT_ERROR) + + devs_and_weights = zip(islice(argvish, 0, len(argvish), 2), + islice(argvish, 1, len(argvish), 2)) + for devstr, weightstr in devs_and_weights: + devs = (builder.search_devs( + parse_search_value(devstr)) or []) + weight = float(weightstr) + _set_weight_values(devs, weight, opts) + else: + if len(args) != 1: + print(Commands.set_weight.__doc__.strip()) + exit(EXIT_ERROR) + + devs = (builder.search_devs( + parse_search_values_from_opts(opts)) or []) + weight = float(args[0]) + _set_weight_values(devs, weight, opts) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _set_info_values(devs, change, opts): + + input_question = 'Are you sure you want to update the info for these ' \ + '%s devices? (y/N) ' % len(devs) + abort_msg = 'Aborting device modifications' + check_devs(devs, input_question, opts, abort_msg) + + for dev in devs: + orig_dev_string = format_device(dev) + test_dev = dict(dev) + for key in change: + test_dev[key] = change[key] + for check_dev in builder.devs: + if not check_dev or check_dev['id'] == test_dev['id']: + continue + if check_dev['ip'] == test_dev['ip'] and \ + check_dev['port'] == test_dev['port'] and \ + check_dev['device'] == test_dev['device']: + print('Device %d already uses %s:%d/%s.' % + (check_dev['id'], check_dev['ip'], + check_dev['port'], check_dev['device'])) + exit(EXIT_ERROR) + for key in change: + dev[key] = change[key] + print('Device %s is now %s' % (orig_dev_string, + format_device(dev))) + + +def calculate_change_value(change_value, change, v_name, v_name_port): + ip = '' + if change_value and change_value[0].isdigit(): + i = 1 + while (i < len(change_value) and + change_value[i] in '0123456789.'): + i += 1 + ip = change_value[:i] + change_value = change_value[i:] + elif change_value and change_value.startswith('['): + i = 1 + while i < len(change_value) and change_value[i] != ']': + i += 1 + i += 1 + ip = change_value[:i].lstrip('[').rstrip(']') + change_value = change_value[i:] + if ip: + change[v_name] = validate_and_normalize_ip(ip) + if change_value.startswith(':'): + i = 1 + while i < len(change_value) and change_value[i].isdigit(): + i += 1 + change[v_name_port] = int(change_value[1:i]) + change_value = change_value[i:] + return change_value + + +def _parse_set_region_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. + try: + devs = [] + if not new_cmd_format: + if len(args) % 2 != 0: + print(Commands.set_region.__doc__.strip()) + exit(EXIT_ERROR) + + devs_and_regions = zip(islice(argvish, 0, len(argvish), 2), + islice(argvish, 1, len(argvish), 2)) + for devstr, regionstr in devs_and_regions: + devs.extend(builder.search_devs( + parse_search_value(devstr)) or []) + region = int(regionstr) + _set_region_values(devs, region, opts) + else: + if len(args) != 1: + print(Commands.set_region.__doc__.strip()) + exit(EXIT_ERROR) + + devs.extend(builder.search_devs( + parse_search_values_from_opts(opts)) or []) + region = int(args[0]) + _set_region_values(devs, region, opts) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _parse_set_zone_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. 
If both are specified, raise an error. + try: + devs = [] + if not new_cmd_format: + if len(args) % 2 != 0: + print(Commands.set_zone.__doc__.strip()) + exit(EXIT_ERROR) + + devs_and_zones = zip(islice(argvish, 0, len(argvish), 2), + islice(argvish, 1, len(argvish), 2)) + for devstr, zonestr in devs_and_zones: + devs.extend(builder.search_devs( + parse_search_value(devstr)) or []) + zone = int(zonestr) + _set_zone_values(devs, zone, opts) + else: + if len(args) != 1: + print(Commands.set_zone.__doc__.strip()) + exit(EXIT_ERROR) + + devs.extend(builder.search_devs( + parse_search_values_from_opts(opts)) or []) + zone = int(args[0]) + _set_zone_values(devs, zone, opts) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _parse_set_info_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. + if not new_cmd_format: + if len(args) % 2 != 0: + print(Commands.search.__doc__.strip()) + exit(EXIT_ERROR) + + searches_and_changes = zip(islice(argvish, 0, len(argvish), 2), + islice(argvish, 1, len(argvish), 2)) + + for search_value, change_value in searches_and_changes: + devs = builder.search_devs(parse_search_value(search_value)) + change = {} + + change_value = calculate_change_value(change_value, change, + 'ip', 'port') + + if change_value.startswith('R'): + change_value = change_value[1:] + change_value = calculate_change_value(change_value, change, + 'replication_ip', + 'replication_port') + if change_value.startswith('/'): + i = 1 + while i < len(change_value) and change_value[i] != '_': + i += 1 + change['device'] = change_value[1:i] + change_value = change_value[i:] + if change_value.startswith('_'): + change['meta'] = change_value[1:] + change_value = '' + if change_value or not change: + raise ValueError('Invalid set info change value: %s' % + repr(argvish[1])) + _set_info_values(devs, change, opts) + else: + devs = builder.search_devs(parse_search_values_from_opts(opts)) + change = parse_change_values_from_opts(opts) + _set_info_values(devs, change, opts) + + +def _parse_remove_values(argvish): + + new_cmd_format, opts, args = validate_args(argvish) + + # We'll either parse the all-in-one-string format or the + # --options format, + # but not both. If both are specified, raise an error. 
+ try: + devs = [] + if len(args) > 0: + if new_cmd_format: + print(Commands.remove.__doc__.strip()) + exit(EXIT_ERROR) + + for arg in args: + devs.extend(builder.search_devs( + parse_search_value(arg)) or []) + else: + devs.extend(builder.search_devs( + parse_search_values_from_opts(opts))) + + return (devs, opts) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + + +def _make_display_device_table(builder): + ip_width = 10 + port_width = 4 + rep_ip_width = 14 + rep_port_width = 4 + ip_ipv6 = rep_ipv6 = False + weight_width = 6 + for dev in builder._iter_devs(): + if is_valid_ipv6(dev['ip']): + ip_ipv6 = True + if is_valid_ipv6(dev['replication_ip']): + rep_ipv6 = True + ip_width = max(len(dev['ip']), ip_width) + rep_ip_width = max(len(dev['replication_ip']), rep_ip_width) + port_width = max(len(str(dev['port'])), port_width) + rep_port_width = max(len(str(dev['replication_port'])), + rep_port_width) + weight_width = max(len('%6.02f' % dev['weight']), + weight_width) + if ip_ipv6: + ip_width += 2 + if rep_ipv6: + rep_ip_width += 2 + header_line = ('Devices:%5s %6s %4s %' + str(ip_width) + + 's:%-' + str(port_width) + 's %' + + str(rep_ip_width) + 's:%-' + str(rep_port_width) + + 's %5s %' + str(weight_width) + 's %10s %7s %5s %s') % ( + 'id', 'region', 'zone', 'ip address', + 'port', 'replication ip', 'port', 'name', + 'weight', 'partitions', 'balance', 'flags', + 'meta') + + def print_dev_f(dev, balance_per_dev=0.00, flags=''): + def get_formated_ip(key): + value = dev[key] + if ':' in value: + value = '[%s]' % value + return value + dev_ip = get_formated_ip('ip') + dev_replication_ip = get_formated_ip('replication_ip') + format_string = ''.join(['%13d %6d %4d ', + '%', str(ip_width), 's:%-', + str(port_width), 'd ', '%', + str(rep_ip_width), 's', ':%-', + str(rep_port_width), 'd %5s %', + str(weight_width), '.02f' + ' %10s %7.02f %5s %s']) + args = (dev['id'], dev['region'], dev['zone'], dev_ip, dev['port'], + dev_replication_ip, dev['replication_port'], dev['device'], + dev['weight'], dev['parts'], balance_per_dev, flags, + dev['meta']) + print(format_string % args) + + return header_line, print_dev_f + + +class Commands(object): + @staticmethod + def unknown(): + print('Unknown command: %s' % argv[2]) + exit(EXIT_ERROR) + + @staticmethod + def create(): + """ +swift-ring-builder create + + Creates with 2^ partitions and . + is number of hours to restrict moving a partition more + than once. + """ + if len(argv) < 6: + print(Commands.create.__doc__.strip()) + exit(EXIT_ERROR) + try: + builder = RingBuilder(int(argv[3]), float(argv[4]), int(argv[5])) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + backup_dir = pathjoin(dirname(builder_file), 'backups') + try: + mkdir(backup_dir) + except OSError as err: + if err.errno != EEXIST: + raise + builder.save(pathjoin(backup_dir, + '%d.' % time() + basename(builder_file))) + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def default(): + """ +swift-ring-builder + Shows information about the ring and the devices within. Output + includes a table that describes the report parameters (id, region, + port, flags, etc). + flags: possible values are 'DEL' and '' + DEL - indicates that the device is marked for removal from + ring and will be removed in next rebalance. 
+ """ + try: + builder_id = builder.id + except AttributeError: + builder_id = "(not assigned)" + print('%s, build version %d, id %s' % + (builder_file, builder.version, builder_id)) + balance = 0 + ring_empty_error = None + regions = len(set(d['region'] for d in builder.devs + if d is not None)) + zones = len(set((d['region'], d['zone']) for d in builder.devs + if d is not None)) + dev_count = len([dev for dev in builder.devs + if dev is not None]) + try: + balance = builder.get_balance() + except exceptions.EmptyRingError as e: + ring_empty_error = str(e) + dispersion_trailer = '' if builder.dispersion is None else ( + ', %.02f dispersion' % (builder.dispersion)) + print('%d partitions, %.6f replicas, %d regions, %d zones, ' + '%d devices, %d-byte IDs, %.02f balance%s' % ( + builder.parts, builder.replicas, regions, zones, dev_count, + builder.dev_id_bytes, balance, dispersion_trailer)) + print('The minimum number of hours before a partition can be ' + 'reassigned is %s (%s remaining)' % ( + builder.min_part_hours, + timedelta(seconds=builder.min_part_seconds_left))) + print('The overload factor is %0.2f%% (%.6f)' % ( + builder.overload * 100, builder.overload)) + + ring_dict = None + builder_dict = builder.get_ring().to_dict() + + # compare ring file against builder file + if not exists(ring_file): + print('Ring file %s not found, ' + 'probably it hasn\'t been written yet' % ring_file) + else: + try: + ring_dict = RingData.load(ring_file).to_dict() + except Exception as exc: + print('Ring file %s is invalid: %r' % (ring_file, exc)) + else: + # mostly just an implementation detail + builder_dict.pop('dev_id_bytes', None) + ring_dict.pop('dev_id_bytes', None) + if builder_dict == ring_dict: + print('Ring file %s is up-to-date' % ring_file) + else: + print('Ring file %s is obsolete' % ring_file) + + if ring_empty_error: + balance_per_dev = defaultdict(int) + else: + balance_per_dev = builder._build_balance_per_dev() + header_line, print_dev_f = _make_display_device_table(builder) + print(header_line) + for dev in sorted( + builder._iter_devs(), + key=lambda x: (x['region'], x['zone'], x['ip'], x['device']) + ): + flags = 'DEL' if dev in builder._remove_devs else '' + print_dev_f(dev, balance_per_dev[dev['id']], flags) + + # Print some helpful info if partition power increase in progress + if (builder.next_part_power and + builder.next_part_power == (builder.part_power + 1)): + print('\nPreparing increase of partition power (%d -> %d)' % ( + builder.part_power, builder.next_part_power)) + print('Run "swift-object-relinker relink" on all nodes before ' + 'moving on to increase_partition_power.') + if (builder.next_part_power and + builder.part_power == builder.next_part_power): + print('\nIncreased partition power (%d -> %d)' % ( + builder.part_power, builder.next_part_power)) + if builder_dict != ring_dict: + print('First run "swift-ring-builder write_ring"' + ' now and copy the updated .ring.gz file to all nodes.') + print('Run "swift-object-relinker cleanup" on all nodes before ' + 'moving on to finish_increase_partition_power.') + + if ring_empty_error: + print(ring_empty_error) + exit(EXIT_SUCCESS) + + @staticmethod + def version(): + """ +swift-ring-builder version + """ + if len(argv) < 3: + print(Commands.create.__doc__.strip()) + exit(EXIT_ERROR) + try: + rd = RingData.load(ring_file, metadata_only=True) + except ValueError as e: + print(e) + exit(EXIT_ERROR) + print('%s: Serialization version: %d (%d-byte IDs), ' + 'build version: %d' % + (ring_file, rd.format_version, 
rd.dev_id_bytes, rd.version)) + exit(EXIT_SUCCESS) + + @staticmethod + def search(): + """ +swift-ring-builder search + +or + +swift-ring-builder search + --region --zone --ip --port + --replication-ip --replication-port + --device --meta --weight + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + Shows information about matching devices. + """ + if len(argv) < 4: + print(Commands.search.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + devs = builder.search_devs(_parse_search_values(argv[3:])) + + if not devs: + print('No matching devices found') + exit(EXIT_ERROR) + print('Devices: id region zone ip address port ' + 'replication ip replication port name weight partitions ' + 'balance meta') + weighted_parts = builder.parts * builder.replicas / \ + sum(d['weight'] for d in builder.devs if d is not None) + for dev in devs: + if not dev['weight']: + if dev['parts']: + balance = MAX_BALANCE + else: + balance = 0 + else: + balance = 100.0 * dev['parts'] / \ + (dev['weight'] * weighted_parts) - 100.0 + print(' %5d %7d %5d %15s %5d %15s %17d %9s %6.02f %10s ' + '%7.02f %s' % + (dev['id'], dev['region'], dev['zone'], dev['ip'], + dev['port'], dev['replication_ip'], dev['replication_port'], + dev['device'], dev['weight'], dev['parts'], balance, + dev['meta'])) + exit(EXIT_SUCCESS) + + @staticmethod + def list_parts(): + """ +swift-ring-builder list_parts [] .. + +or + +swift-ring-builder list_parts + --region --zone --ip --port + --replication-ip --replication-port + --device --meta --weight + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + Returns a 2 column list of all the partitions that are assigned to any of + the devices matching the search values given. The first column is the + assigned partition number and the second column is the number of device + matches for that partition. The list is ordered from most number of matches + to least. If there are a lot of devices to match against, this command + could take a while to run. + """ + if len(argv) < 4: + print(Commands.list_parts.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + if not builder._replica2part2dev: + print('Specified builder file \"%s\" is not rebalanced yet. ' + 'Please rebalance first.' % builder_file) + exit(EXIT_ERROR) + + devs = _parse_list_parts_values(argv[3:]) + if not devs: + print('No matching devices found') + exit(EXIT_ERROR) + + sorted_partition_count = _find_parts(devs) + + if not sorted_partition_count: + print('No matching devices found') + exit(EXIT_ERROR) + + print('Partition Matches') + for partition, count in sorted_partition_count: + print('%9d %7d' % (partition, count)) + exit(EXIT_SUCCESS) + + @staticmethod + def add(): + """ +swift-ring-builder add + [r]z-:[R:]/_ + + [[r]z-:[R:]/_ + ] ... + + Where and are replication ip and port. + +or + +swift-ring-builder add + --region --zone --ip --port + [--replication-ip ] [--replication-port ] + --device --weight + [--meta ] + + Adds devices to the ring with the given information. No partitions will be + assigned to the new device until after running 'rebalance'. This is so you + can make multiple device changes and rebalance them all just once. + """ + if len(argv) < 5: + print(Commands.add.__doc__.strip()) + exit(EXIT_ERROR) + + if builder.next_part_power: + print('Partition power increase in progress. 
You need ') + print('to finish the increase first before adding devices.') + exit(EXIT_ERROR) + + try: + for new_dev in _parse_add_values(argv[3:]): + for dev in builder.devs: + if dev is None: + continue + if dev['ip'] == new_dev['ip'] and \ + dev['port'] == new_dev['port'] and \ + dev['device'] == new_dev['device']: + print('Device %d already uses %s:%d/%s.' % + (dev['id'], dev['ip'], + dev['port'], dev['device'])) + print("The on-disk ring builder is unchanged.\n") + exit(EXIT_ERROR) + dev_id = builder.add_dev(new_dev) + print('Device %s with %s weight got id %s' % + (format_device(new_dev), new_dev['weight'], dev_id)) + except ValueError as err: + print(err) + print('The on-disk ring builder is unchanged.') + exit(EXIT_ERROR) + + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_weight(): + """ +swift-ring-builder set_weight + [ ] ... + [--yes] + +or + +swift-ring-builder set_weight + --region --zone --ip --port + --replication-ip --replication-port + --device --meta --weight + [--yes] + + Where , and are replication ip, hostname + and port. and are the search weight and new + weight values respectively. + Any of the options are optional in both cases. + + Resets the devices' weights. No partitions will be reassigned to or from + the device until after running 'rebalance'. This is so you can make + multiple device changes and rebalance them all just once. + + Option --yes assume a yes response to all questions. + """ + # if len(argv) < 5 or len(argv) % 2 != 1: + if len(argv) < 5: + print(Commands.set_weight.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + _parse_set_weight_values(argv[3:]) + + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_region(): + """ +swift-ring-builder set_region + [ set_region + --region --zone --ip --port + --replication-ip --replication-port + --device --meta [--yes] + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + Resets the devices' regions. No partitions will be reassigned to or from + the device until after running 'rebalance'. This is so you can make + multiple device changes and rebalance them all just once. + + Option --yes assume a yes response to all questions. + """ + if len(argv) < 5: + print(Commands.set_region.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + _parse_set_region_values(argv[3:]) + + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_zone(): + """ +swift-ring-builder set_zone + [ set_zone + --region --zone --ip --port + --replication-ip --replication-port + --device --meta [--yes] + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + Resets the devices' zones. No partitions will be reassigned to or from + the device until after running 'rebalance'. This is so you can make + multiple device changes and rebalance them all just once. + + Option --yes assume a yes response to all questions. + """ + # if len(argv) < 5 or len(argv) % 2 != 1: + if len(argv) < 5: + print(Commands.set_zone.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + _parse_set_zone_values(argv[3:]) + + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_info(): + """ +swift-ring-builder set_info + :[R:]/_ + [ :[R:]/_] ... 
+ [--yes] + +or + +swift-ring-builder set_info + --ip --port + --replication-ip --replication-port + --device --meta + --change-ip --change-port + --change-replication-ip + --change-replication-port + --change-device + --change-meta + [--yes] + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + For each search-value, resets the matched device's information. + This information isn't used to assign partitions, so you can use + 'write_ring' afterward to rewrite the current ring with the newer + device information. Any of the parts are optional in the final + :/_ parameter; just give what you + want to change. For instance set_info d74 _"snet: 5.6.7.8" would + just update the meta data for device id 74. + + Option --yes assume a yes response to all questions. + """ + if len(argv) < 5: + print(Commands.set_info.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + try: + _parse_set_info_values(argv[3:]) + except ValueError as err: + print(err) + exit(EXIT_ERROR) + + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def remove(): + """ +swift-ring-builder remove [search-value ...] + [--yes] + +or + +swift-ring-builder remove + --region --zone --ip --port + --replication-ip --replication-port + --device --meta --weight + [--yes] + + Where , and are replication ip, hostname + and port. + Any of the options are optional in both cases. + + Removes the device(s) from the ring. This should normally just be used for + a device that has failed. For a device you wish to decommission, it's best + to set its weight to 0, wait for it to drain all its data, then use this + remove command. This will not take effect until after running 'rebalance'. + This is so you can make multiple device changes and rebalance them all just + once. + + Option --yes assume a yes response to all questions. + """ + if len(argv) < 4: + print(Commands.remove.__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + exit(EXIT_ERROR) + + if builder.next_part_power: + print('Partition power increase in progress. You need ') + print('to finish the increase first before removing devices.') + exit(EXIT_ERROR) + + devs, opts = _parse_remove_values(argv[3:]) + + input_question = 'Are you sure you want to remove these ' \ + '%s devices? (y/N) ' % len(devs) + abort_msg = 'Aborting device removals' + check_devs(devs, input_question, opts, abort_msg) + + for dev in devs: + try: + builder.remove_dev(dev['id']) + except exceptions.RingBuilderError as e: + print('-' * 79) + print( + 'An error occurred while removing device with id %d\n' + 'This usually means that you attempted to remove\n' + 'the last device in a ring. If this is the case,\n' + 'consider creating a new ring instead.\n' + 'The on-disk ring builder is unchanged.\n' + 'Original exception message: %s' % + (dev['id'], e)) + print('-' * 79) + exit(EXIT_ERROR) + + print('%s marked for removal and will ' + 'be removed next rebalance.' % format_device(dev)) + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def rebalance(): + """ +swift-ring-builder rebalance [options] + Attempts to rebalance the ring by reassigning partitions that haven't been + recently reassigned. 
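A rough sketch of the same flow through the RingBuilder API, mirroring the implementation that follows (file names are hypothetical; error handling and the seed and format-version options are omitted):

    # sketch only: load a builder, rebalance it and write out the ring
    from swift.common.ring import RingBuilder

    builder = RingBuilder.load('object.builder')
    parts, balance, removed_devs = builder.rebalance()
    builder.validate()
    builder.get_ring().save('object.ring.gz')
    builder.save('object.builder')

If min_part_hours has not yet elapsed, rebalance() may reassign nothing, which is what the 'No partitions could be reassigned' branch below reports.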
+ """ + usage = Commands.rebalance.__doc__.strip() + parser = optparse.OptionParser(usage) + parser.add_option('-f', '--force', action='store_true', + help='Force a rebalanced ring to save even ' + 'if < 1% of parts changed') + parser.add_option('-s', '--seed', help="seed to use for rebalance") + parser.add_option('-d', '--debug', action='store_true', + help="print debug information") + parser.add_option('--format-version', + choices=FORMAT_CHOICES, default=None, + help="specify ring format version") + options, args = parser.parse_args(argv) + if options.format_version is None: + print("Defaulting to --format-version=1. This ensures the ring\n" + "written will be readable by older versions of Swift.\n" + "In a future release, the default will change to\n" + "--format-version=2\n") + options.format_version = DEFAULT_RING_FORMAT_VERSION + else: + # N.B. choices doesn't work with type=int + options.format_version = int(options.format_version) + + def get_seed(index): + if options.seed: + return options.seed + try: + return args[index] + except IndexError: + pass + + if options.debug: + logger = logging.getLogger("swift.ring.builder") + logger.disabled = False + logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(stdout) + formatter = logging.Formatter("%(levelname)s: %(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + + if builder.next_part_power: + print('Partition power increase in progress.') + print('You need to finish the increase first before rebalancing.') + exit(EXIT_ERROR) + + devs_changed = builder.devs_changed + min_part_seconds_left = builder.min_part_seconds_left + try: + last_balance = builder.get_balance() + last_dispersion = builder.dispersion + parts, balance, removed_devs = builder.rebalance(seed=get_seed(3)) + dispersion = builder.dispersion + except exceptions.RingBuilderError as e: + print('-' * 79) + print("An error has occurred during ring validation. Common\n" + "causes of failure are rings that are empty or do not\n" + "have enough devices to accommodate the replica count.\n" + "Original exception message:\n %s" % + (e,)) + print('-' * 79) + exit(EXIT_ERROR) + if not (parts or options.force or removed_devs): + print('No partitions could be reassigned.') + if min_part_seconds_left > 0: + print('The time between rebalances must be at least ' + 'min_part_hours: %s hours (%s remaining)' % ( + builder.min_part_hours, + timedelta(seconds=builder.min_part_seconds_left))) + else: + print('There is no need to do so at this time') + exit(EXIT_WARNING) + # If we set device's weight to zero, currently balance will be set + # special value(MAX_BALANCE) until zero weighted device return all + # its partitions. So we cannot check balance has changed. + # Thus we need to check balance or last_balance is special value. + be_cowardly = True + if options.force: + # User said save it, so we save it. + be_cowardly = False + elif devs_changed: + # We must save if a device changed; this could be something like + # a changed IP address. + be_cowardly = False + else: + # If balance or dispersion changed (presumably improved), then + # we should save to get the improvement. 
+ balance_changed = ( + abs(last_balance - balance) >= 1 or + (last_balance == MAX_BALANCE and balance == MAX_BALANCE)) + dispersion_changed = last_dispersion is None or ( + abs(last_dispersion - dispersion) >= 1) + if balance_changed or dispersion_changed: + be_cowardly = False + + if be_cowardly: + print('Cowardly refusing to save rebalance as it did not change ' + 'at least 1%.') + exit(EXIT_WARNING) + try: + builder.validate() + except exceptions.RingValidationError as e: + print('-' * 79) + print("An error has occurred during ring validation. Common\n" + "causes of failure are rings that are empty or do not\n" + "have enough devices to accommodate the replica count.\n" + "Original exception message:\n %s" % + (e,)) + print('-' * 79) + exit(EXIT_ERROR) + print('Reassigned %d (%.02f%%) partitions. ' + 'Balance is now %.02f. ' + 'Dispersion is now %.02f' % ( + parts, 100.0 * parts / builder.parts, + balance, + builder.dispersion)) + status = EXIT_SUCCESS + if builder.dispersion > 0: + print('-' * 79) + print( + 'NOTE: Dispersion of %.06f indicates some parts are not\n' + ' optimally dispersed.\n\n' + ' You may want to adjust some device weights, increase\n' + ' the overload or review the dispersion report.' % + builder.dispersion) + status = EXIT_WARNING + print('-' * 79) + elif balance > 5 and balance / 100.0 > builder.overload: + print('-' * 79) + print('NOTE: Balance of %.02f indicates you should push this ' % + balance) + print(' ring, wait at least %d hours, and rebalance/repush.' + % builder.min_part_hours) + print('-' * 79) + status = EXIT_WARNING + ts = time() + builder.get_ring().save( + pathjoin(backup_dir, '%d.' % ts + basename(ring_file)), + format_version=options.format_version) + builder.save(pathjoin(backup_dir, '%d.' % ts + basename(builder_file))) + builder.get_ring().save( + ring_file, format_version=options.format_version) + builder.save(builder_file) + exit(status) + + @staticmethod + def dispersion(): + r""" +swift-ring-builder dispersion [options] + + Output report on dispersion. + + --recalculate option will rebuild cached dispersion info and save builder + --verbose option will display dispersion graph broken down by tier + + You can filter which tiers are evaluated to drill down using a regex + in the optional search_filter argument. i.e. + + swift-ring-builder dispersion "r\d+z\d+$" -v + + ... would only display rows for the zone tiers + + swift-ring-builder dispersion ".*\-[^/]*$" -v + + ... would only display rows for the server tiers + + The reports columns are: + + Tier : the name of the tier + parts : the total number of partitions with assignment in the tier + % : the percentage of parts in the tier with replicas over assigned + max : maximum replicas a part should have assigned at the tier + 0 - N : the number of parts with that many replicas assigned + + e.g. + Tier: parts % max 0 1 2 3 + r1z1 1022 79.45 1 2 210 784 28 + + r1z1 has 1022 total parts assigned, 79% of them have more than the + recommend max replica count of 1 assigned. Only 2 parts in the ring + are *not* assigned in this tier (0 replica count), 210 parts have + the recommend replica count of 1, 784 have 2 replicas, and 28 sadly + have all three replicas in this tier. + """ + status = EXIT_SUCCESS + if not builder._replica2part2dev: + print('Specified builder file \"%s\" is not rebalanced yet. ' + 'Please rebalance first.' 
% builder_file) + exit(EXIT_ERROR) + usage = Commands.dispersion.__doc__.strip() + parser = optparse.OptionParser(usage) + parser.add_option('--recalculate', action='store_true', + help='Rebuild cached dispersion info and save') + parser.add_option('-v', '--verbose', action='store_true', + help='Display dispersion report for tiers') + options, args = parser.parse_args(argv) + if args[3:]: + search_filter = args[3] + else: + search_filter = None + orig_version = builder.version + report = dispersion_report(builder, search_filter=search_filter, + verbose=options.verbose, + recalculate=options.recalculate) + if builder.version != orig_version: + # we've already done the work, better go ahead and save it! + builder.save(builder_file) + print('Dispersion is %.06f, Balance is %.06f, Overload is %0.2f%%' % ( + builder.dispersion, builder.get_balance(), builder.overload * 100)) + print('Required overload is %.6f%%' % ( + builder.get_required_overload() * 100)) + if report['worst_tier']: + status = EXIT_WARNING + print('Worst tier is %.06f (%s)' % (report['max_dispersion'], + report['worst_tier'])) + if report['graph']: + replica_range = list(range(int(math.ceil(builder.replicas + 1)))) + part_count_width = '%%%ds' % max(len(str(builder.parts)), 5) + replica_counts_tmpl = ' '.join(part_count_width for i in + replica_range) + tiers = (tier for tier, _junk in report['graph']) + tier_width = max(max(map(len, tiers)), 30) + header_line = ('%-' + str(tier_width) + + 's ' + part_count_width + + ' %6s %6s ' + replica_counts_tmpl) % tuple( + ['Tier', 'Parts', '%', 'Max'] + replica_range) + underline = '-' * len(header_line) + print(underline) + print(header_line) + print(underline) + for tier_name, dispersion in report['graph']: + replica_counts_repr = replica_counts_tmpl % tuple( + dispersion['replicas']) + template = ''.join([ + '%-', str(tier_width), 's ', + part_count_width, + ' %6.02f %6d %s', + ]) + args = ( + tier_name, + dispersion['placed_parts'], + dispersion['dispersion'], + dispersion['max_replicas'], + replica_counts_repr, + ) + print(template % args) + exit(status) + + @staticmethod + def validate(): + """ +swift-ring-builder validate + Just runs the validation routines on the ring. + """ + builder.validate() + exit(EXIT_SUCCESS) + + @staticmethod + def write_ring(): + """ +swift-ring-builder write_ring + Just rewrites the distributable ring file. This is done automatically after + a successful rebalance, so really this is only useful after one or more + 'set_info' calls when no rebalance is needed but you want to send out the + new device information. + """ + usage = Commands.write_ring.__doc__.strip() + parser = optparse.OptionParser(usage) + parser.add_option('--format-version', + choices=FORMAT_CHOICES, default=None, + help="specify ring format version") + options, args = parser.parse_args(argv) + if options.format_version is None: + print("Defaulting to --format-version=1. This ensures the ring\n" + "written will be readable by older versions of Swift.\n" + "In a future release, the default will change to\n" + "--format-version=2\n") + options.format_version = DEFAULT_RING_FORMAT_VERSION + else: + # N.B. 
choices doesn't work with type=int + options.format_version = int(options.format_version) + + if not builder.devs: + print('Unable to write empty ring.') + exit(EXIT_ERROR) + + ring_data = builder.get_ring() + if not ring_data._replica2part2dev_id: + if ring_data.devs: + print('WARNING: Writing a ring with no partition ' + 'assignments but with devices; did you forget to run ' + '"rebalance"?', file=sys.stderr) + ring_data.save( + pathjoin(backup_dir, '%d.' % time() + basename(ring_file)), + format_version=options.format_version) + ring_data.save(ring_file, format_version=options.format_version) + exit(EXIT_SUCCESS) + + @staticmethod + def write_builder(): + """ +swift-ring-builder write_builder [min_part_hours] + Recreate a builder from a ring file (lossy) if you lost your builder + backups. (Protip: don't lose your builder backups). + [min_part_hours] is one of those numbers lost to the builder, + you can change it with set_min_part_hours. + """ + if exists(builder_file): + print('Cowardly refusing to overwrite existing ' + 'Ring Builder file: %s' % builder_file) + exit(EXIT_ERROR) + if len(argv) > 3: + min_part_hours = int(argv[3]) + else: + print("WARNING: default min_part_hours may not match " + "the value in the lost builder.\n", file=sys.stderr) + min_part_hours = 24 + ring = Ring(ring_file) + for dev in ring.devs: + if dev is None: + continue + dev.update({ + 'parts': 0, + 'parts_wanted': 0, + }) + builder_dict = { + 'part_power': 32 - ring._part_shift, + 'replicas': float(ring.replica_count), + 'min_part_hours': min_part_hours, + 'parts': ring.partition_count, + 'devs': ring.devs, + 'devs_changed': False, + 'version': ring.version or 0, + '_replica2part2dev': ring._replica2part2dev_id, + '_last_part_moves_epoch': None, + '_last_part_moves': None, + '_last_part_gather_start': 0, + '_remove_devs': [], + } + builder = RingBuilder.from_dict(builder_dict) + for parts in builder._replica2part2dev: + for dev_id in parts: + builder.devs[dev_id]['parts'] += 1 + builder.save(builder_file) + + @staticmethod + def pretend_min_part_hours_passed(): + """ +swift-ring-builder pretend_min_part_hours_passed + Resets the clock on the last time a rebalance happened, thus + circumventing the min_part_hours check. + + ***************************** + USE THIS WITH EXTREME CAUTION + ***************************** + + If you run this command and deploy rebalanced rings before a replication + pass completes, you may introduce unavailability in your cluster. This + has an end-user impact. + """ + builder.pretend_min_part_hours_passed() + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_min_part_hours(): + """ +swift-ring-builder set_min_part_hours + Changes the to the given . This should be set to + however long a full replication/update cycle takes. We're working on a way + to determine this more easily than scanning logs. + """ + if len(argv) < 4: + print(Commands.set_min_part_hours.__doc__.strip()) + exit(EXIT_ERROR) + builder.change_min_part_hours(int(argv[3])) + print('The minimum number of hours before a partition can be ' + 'reassigned is now set to %s' % argv[3]) + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_replicas(): + """ +swift-ring-builder set_replicas + Changes the replica count to the given . may + be a floating-point value, in which case some partitions will have + floor() replicas and some will have ceiling() + in the correct proportions. + + A rebalance is needed to make the change take effect. 
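As a worked illustration (the builder file name is hypothetical):

    swift-ring-builder object.builder set_replicas 3.25
    # roughly 75% of partitions keep floor(3.25) = 3 replicas and the
    # remaining 25% get ceil(3.25) = 4, averaging 3 * 0.75 + 4 * 0.25 = 3.25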
+ """ + if len(argv) < 4: + print(Commands.set_replicas.__doc__.strip()) + exit(EXIT_ERROR) + + new_replicas = argv[3] + try: + new_replicas = float(new_replicas) + except ValueError: + print(Commands.set_replicas.__doc__.strip()) + print("\"%s\" is not a valid number." % new_replicas) + exit(EXIT_ERROR) + + if new_replicas < 1: + print("Replica count must be at least 1.") + exit(EXIT_ERROR) + + builder.set_replicas(new_replicas) + print('The replica count is now %.6f.' % builder.replicas) + print('The change will take effect after the next rebalance.') + builder.save(builder_file) + exit(EXIT_SUCCESS) + + @staticmethod + def set_overload(): + """ +swift-ring-builder set_overload [%] + Changes the overload factor to the given . + + A rebalance is needed to make the change take effect. + """ + if len(argv) < 4: + print(Commands.set_overload.__doc__.strip()) + exit(EXIT_ERROR) + + new_overload = argv[3] + if new_overload.endswith('%'): + percent = True + new_overload = new_overload.rstrip('%') + else: + percent = False + try: + new_overload = float(new_overload) + except ValueError: + print(Commands.set_overload.__doc__.strip()) + print("%r is not a valid number." % new_overload) + exit(EXIT_ERROR) + + if percent: + new_overload *= 0.01 + if new_overload < 0: + print("Overload must be non-negative.") + exit(EXIT_ERROR) + + if new_overload > 1 and not percent: + print("!?! Warning overload is greater than 100% !?!") + status = EXIT_WARNING + else: + status = EXIT_SUCCESS + + builder.set_overload(new_overload) + print('The overload factor is now %0.2f%% (%.6f)' % ( + builder.overload * 100, builder.overload)) + print('The change will take effect after the next rebalance.') + builder.save(builder_file) + exit(status) + + @staticmethod + def prepare_increase_partition_power(): + """ +swift-ring-builder prepare_increase_partition_power + Prepare the ring to increase the partition power by one. + + A write_ring command is needed to make the change take effect. + + Once the updated rings have been deployed to all servers you need to run + the swift-object-relinker tool to relink existing data. + + ***************************** + USE THIS WITH EXTREME CAUTION + ***************************** + + If you increase the partition power and deploy changed rings, you may + introduce unavailability in your cluster. This has an end-user impact. Make + sure you execute required operations to increase the partition power + accurately. + + """ + if len(argv) < 3: + print(Commands.prepare_increase_partition_power.__doc__.strip()) + exit(EXIT_ERROR) + + if "object" not in basename(builder_file): + print( + 'Partition power increase is only supported for object rings.') + exit(EXIT_ERROR) + + if not builder.prepare_increase_partition_power(): + print('Ring is already prepared for partition power increase.') + exit(EXIT_ERROR) + + builder.save(builder_file) + + print('The next partition power is now %d.' % builder.next_part_power) + print('The change will take effect after the next write_ring.') + print('Ensure your proxy-servers, object-replicators and ') + print('reconstructors are using the changed rings and relink ') + print('(using swift-object-relinker) your existing data') + print('before the partition power increase') + exit(EXIT_SUCCESS) + + @staticmethod + def increase_partition_power(): + """ +swift-ring-builder increase_partition_power + Increases the partition power by one. 
Needs to be run after + prepare_increase_partition_power has been run and all existing data has + been relinked using the swift-object-relinker tool. + + A write_ring command is needed to make the change take effect. + + Once the updated rings have been deployed to all servers you need to run + the swift-object-relinker tool to cleanup old data. + + ***************************** + USE THIS WITH EXTREME CAUTION + ***************************** + + If you increase the partition power and deploy changed rings, you may + introduce unavailability in your cluster. This has an end-user impact. Make + sure you execute required operations to increase the partition power + accurately. + + """ + if len(argv) < 3: + print(Commands.increase_partition_power.__doc__.strip()) + exit(EXIT_ERROR) + + if builder.increase_partition_power(): + print('The partition power is now %d.' % builder.part_power) + print('The change will take effect after the next write_ring.') + + builder._update_last_part_moves() + builder.save(builder_file) + + exit(EXIT_SUCCESS) + else: + print('Ring partition power cannot be increased. Either the ring') + print('was not prepared yet, or this operation has already run.') + exit(EXIT_ERROR) + + @staticmethod + def cancel_increase_partition_power(): + """ +swift-ring-builder cancel_increase_partition_power + Cancel the increase of the partition power. + + A write_ring command is needed to make the change take effect. + + Once the updated rings have been deployed to all servers you need to run + the swift-object-relinker tool to cleanup unneeded links. + + ***************************** + USE THIS WITH EXTREME CAUTION + ***************************** + + If you increase the partition power and deploy changed rings, you may + introduce unavailability in your cluster. This has an end-user impact. Make + sure you execute required operations to increase the partition power + accurately. + + """ + if len(argv) < 3: + print(Commands.cancel_increase_partition_power.__doc__.strip()) + exit(EXIT_ERROR) + + if not builder.cancel_increase_partition_power(): + print('Ring partition power increase cannot be canceled.') + exit(EXIT_ERROR) + + builder.save(builder_file) + + print('The next partition power is now %d.' % builder.next_part_power) + print('The change will take effect after the next write_ring.') + print('Ensure your object-servers are using the changed rings and') + print('cleanup (using swift-object-relinker) the hard links') + exit(EXIT_SUCCESS) + + @staticmethod + def finish_increase_partition_power(): + """ +swift-ring-builder finish_increase_partition_power + Finally removes the next_part_power flag. Has to be run after the + swift-object-relinker tool has been used to cleanup old existing data. + + A write_ring command is needed to make the change take effect. + + ***************************** + USE THIS WITH EXTREME CAUTION + ***************************** + + If you increase the partition power and deploy changed rings, you may + introduce unavailability in your cluster. This has an end-user impact. Make + sure you execute required operations to increase the partition power + accurately. 
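Taken together, the prepare/increase/finish commands describe a workflow that looks roughly like the following sequence (object.builder is a placeholder; after each write_ring the new ring must be deployed to all nodes before continuing):

    swift-ring-builder object.builder prepare_increase_partition_power
    swift-ring-builder object.builder write_ring
    swift-object-relinker relink        # on every object server
    swift-ring-builder object.builder increase_partition_power
    swift-ring-builder object.builder write_ring
    swift-object-relinker cleanup       # on every object server
    swift-ring-builder object.builder finish_increase_partition_power
    swift-ring-builder object.builder write_ring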
+ + """ + if len(argv) < 3: + print(Commands.finish_increase_partition_power.__doc__.strip()) + exit(EXIT_ERROR) + + if not builder.finish_increase_partition_power(): + print('Ring partition power increase cannot be finished.') + exit(EXIT_ERROR) + + print('The change will take effect after the next write_ring.') + builder.save(builder_file) + + exit(EXIT_SUCCESS) + + +def main(arguments=None): + global argv, backup_dir, builder, builder_file, ring_file + if arguments is not None: + argv = arguments + else: + argv = sys_argv + + if len(argv) < 2: + print("swift-ring-builder %(MAJOR_VERSION)s.%(MINOR_VERSION)s\n" % + globals()) + print(Commands.default.__doc__.strip()) + print() + cmds = [c for c in dir(Commands) + if getattr(Commands, c).__doc__ and not c.startswith('_') and + c != 'default'] + cmds.sort() + for cmd in cmds: + print(getattr(Commands, cmd).__doc__.strip()) + print() + print(parse_search_value.__doc__.strip()) + print() + for line in wrap(' '.join(cmds), 79, initial_indent='Quick list: ', + subsequent_indent=' '): + print(line) + print('Exit codes: 0 = operation successful\n' + ' 1 = operation completed with warnings\n' + ' 2 = error') + exit(EXIT_SUCCESS) + + builder_file, ring_file = parse_builder_ring_filename_args(argv) + if builder_file != argv[1]: + if len(argv) > 2 and argv[2] in ('write_builder', 'version'): + pass + else: + print('Note: using %s instead of %s as builder file' % ( + builder_file, argv[1])) + + try: + builder = RingBuilder.load(builder_file) + except exceptions.UnPicklingError as e: + msg = str(e) + try: + CompositeRingBuilder.load(builder_file) + msg += ' (it appears to be a composite ring builder file?)' + except Exception: # noqa + pass + print(msg) + exit(EXIT_ERROR) + except (exceptions.FileNotFoundError, exceptions.PermissionError) as e: + if len(argv) < 3 or argv[2] not in ('create', 'write_builder', + 'version'): + print(e) + exit(EXIT_ERROR) + except Exception as e: + print('Problem occurred while reading builder file: %s. %s' % + (builder_file, e)) + exit(EXIT_ERROR) + + backup_dir = pathjoin(dirname(builder_file), 'backups') + try: + mkdir(backup_dir) + except OSError as err: + if err.errno != EEXIST: + raise + + if len(argv) == 2: + command = "default" + else: + command = argv[2] + if argv[0].endswith('-safe'): + try: + with lock_parent_directory(abspath(builder_file), 15): + getattr(Commands, command, Commands.unknown)() + except exceptions.LockTimeout: + print("Ring/builder dir currently locked.") + exit(2) + else: + getattr(Commands, command, Commands.unknown)() + + +def error_handling_main(): + # We exit code 1 on WARNING statuses, 2 on ERROR. This means we need + # to handle any uncaught exceptions by printing the usual backtrace, + # but then exiting 2 (not 1 as is usual for a python + # exception). + + # We *don't* want to do this in main(), however, because we don't want to + # pollute the test environment or cause a bunch of test churn to mock out + # sys.excepthook + + def exit_with_status_two(tp, val, tb): + traceback.print_exception(tp, val, tb) + exit(2) + + sys.excepthook = exit_with_status_two + main() + + +if __name__ == '__main__': + error_handling_main() diff --git a/swift/cli/ringcomposer.py b/swift/cli/ringcomposer.py new file mode 100644 index 0000000000..99b7ce6ccf --- /dev/null +++ b/swift/cli/ringcomposer.py @@ -0,0 +1,182 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +``swift-ring-composer`` is an experimental tool for building a composite ring +file from other existing component ring builder files. Its CLI, name or +implementation may change or be removed altogether in future versions of Swift. + +Currently its interface is similar to that of the ``swift-ring-builder``. The +command structure takes the form of:: + + swift-ring-composer + +where ```` is a special builder which stores a json +blob of composite ring metadata. This metadata describes the component +``RingBuilder``'s used in the composite ring, their order and version. + +There are currently 2 sub-commands: ``show`` and ``compose``. The ``show`` +sub-command takes no additional arguments and displays the current contents of +of the composite builder file:: + + swift-ring-composer show + +The ``compose`` sub-command is the one that actually stitches the component +ring builders together to create both the composite ring file and composite +builder file. The command takes the form:: + + swift-ring-composer compose \\ + [ .. ] --output \\ + [--force] + +There may look like there is a lot going on there but it's actually quite +simple. The ``compose`` command takes in the list of builders to stitch +together and the filename for the composite ring file via the ``--output`` +option. The ``--force`` option overrides checks on the ring composition. + +To change ring devices, first add or remove devices from the component ring +builders and then use the ``compose`` sub-command to create a new composite +ring file. + +.. note:: + + ``swift-ring-builder`` cannot be used to inspect the generated composite + ring file because there is no conventional builder file corresponding to + the composite ring file name. You can either programmatically look inside + the composite ring file using the swift ring classes or create a temporary + builder file from the composite ring file using:: + + swift-ring-builder write_builder + + Do not use this builder file to manage ring devices. + +For further details use:: + + swift-ring-composer -h +""" +import argparse +import json +import os +import sys + +from swift.common.ring.composite_builder import CompositeRingBuilder + +EXIT_SUCCESS = 0 +EXIT_ERROR = 2 + +WARNING = """ +NOTE: This tool is for experimental use and may be + removed in future versions of Swift. +""" + +DESCRIPTION = """ +This is a tool for building a composite ring file from other existing ring +builder files. The component ring builders must all have the same partition +power. Each device must only be used in a single component builder. Each region +must only be used in a single component builder. 
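A minimal sketch of the flow implemented by the compose() function below, assuming two hypothetical component builder files:

    # sketch only: compose two component builders into a single ring file
    from swift.common.ring.composite_builder import CompositeRingBuilder

    cb = CompositeRingBuilder()
    ring_data = cb.compose(['region1.builder', 'region2.builder'])
    ring_data.save('composite.ring.gz')
    cb.save('composite.builder')

This corresponds to the compose sub-command shown above with --output composite.ring.gz.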
+""" + + +def _print_to_stderr(msg): + print(msg, file=sys.stderr) + + +def _print_err(msg, err): + _print_to_stderr('%s\nOriginal exception message:\n%s' % (msg, err)) + + +def show(composite_builder, args): + print(json.dumps(composite_builder.to_dict(), indent=4, sort_keys=True)) + return EXIT_SUCCESS + + +def compose(composite_builder, args): + composite_builder = composite_builder or CompositeRingBuilder() + try: + ring_data = composite_builder.compose( + args.builder_files, force=args.force, require_modified=True) + except Exception as err: + _print_err( + 'An error occurred while composing the ring.', err) + return EXIT_ERROR + try: + ring_data.save(args.output) + except Exception as err: + _print_err( + 'An error occurred while writing the composite ring file.', err) + return EXIT_ERROR + try: + composite_builder.save(args.composite_builder_file) + except Exception as err: + _print_err( + 'An error occurred while writing the composite builder file.', err) + return EXIT_ERROR + return EXIT_SUCCESS + + +def main(arguments=None): + if arguments is not None: + argv = arguments + else: + argv = sys.argv + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument( + 'composite_builder_file', + metavar='composite_builder_file', type=str, + help='Name of composite builder file') + + subparsers = parser.add_subparsers( + help='subcommand help', title='subcommands') + + # show + show_parser = subparsers.add_parser( + 'show', help='show composite ring builder metadata') + show_parser.set_defaults(func=show) + + # compose + compose_parser = subparsers.add_parser( + 'compose', help='compose composite ring', + usage='%(prog)s [-h] ' + '[builder_file builder_file [builder_file ...] ' + '--output ring_file [--force]') + bf_help = ('Paths to component ring builder files to include in composite ' + 'ring') + compose_parser.add_argument('builder_files', metavar='builder_file', + nargs='*', type=str, help=bf_help) + compose_parser.add_argument('--output', metavar='output_file', type=str, + required=True, help='Name of output ring file') + compose_parser.add_argument( + '--force', action='store_true', + help='Force new composite ring file to be written') + compose_parser.set_defaults(func=compose) + + _print_to_stderr(WARNING) + args = parser.parse_args(argv[1:]) + composite_builder = None + if args.func != compose or os.path.exists(args.composite_builder_file): + try: + composite_builder = CompositeRingBuilder.load( + args.composite_builder_file) + except Exception as err: + _print_err( + 'An error occurred while loading the composite builder file.', + err) + exit(EXIT_ERROR) + + exit(args.func(composite_builder, args)) + + +if __name__ == '__main__': + main() diff --git a/swift/cli/shard-info.py b/swift/cli/shard-info.py new file mode 100644 index 0000000000..fdcfdf5d3e --- /dev/null +++ b/swift/cli/shard-info.py @@ -0,0 +1,196 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from collections import defaultdict + +from swift.common import utils +from swift.common.db_replicator import roundrobin_datadirs +from swift.common.ring import ring +from swift.common.utils import Timestamp +from swift.container.backend import ContainerBroker, DATADIR + +TAB = ' ' + + +def broker_key(broker): + broker.get_info() + return broker.path + + +def container_type(broker): + return 'ROOT' if broker.is_root_container() else 'SHARD' + + +def collect_brokers(conf_path, names2nodes): + conf = utils.readconf(conf_path, 'container-replicator') + root = conf.get('devices', '/srv/node') + swift_dir = conf.get('swift_dir', '/etc/swift') + c_ring = ring.Ring(swift_dir, ring_name='container') + dirs = [] + brokers = defaultdict(dict) + for node in c_ring.devs: + if node is None: + continue + datadir = os.path.join(root, node['device'], DATADIR) + if os.path.isdir(datadir): + dirs.append((datadir, node['id'], lambda *args: True)) + for part, object_file, node_id in roundrobin_datadirs(dirs): + broker = ContainerBroker(object_file) + for node in c_ring.get_part_nodes(int(part)): + if node['id'] == node_id: + node_index = str(node['index']) + break + else: + node_index = 'handoff' + names2nodes[broker_key(broker)][(node_id, node_index)] = broker + return brokers + + +def print_broker_info(node, broker, indent_level=0): + indent = indent_level * TAB + info = broker.get_info() + raw_info = broker._get_info() + deleted_at = float(info['delete_timestamp']) + if deleted_at: + deleted_at = Timestamp(info['delete_timestamp']).isoformat + else: + deleted_at = ' - ' + print('%s(%s) %s, objs: %s, bytes: %s, actual_objs: %s, put: %s, ' + 'deleted: %s' % + (indent, node[1][0], broker.get_db_state(), + info['object_count'], info['bytes_used'], raw_info['object_count'], + Timestamp(info['put_timestamp']).isoformat, deleted_at)) + + +def print_db(node, broker, expect_type='ROOT', indent_level=0): + indent = indent_level * TAB + print('%s(%s) %s node id: %s, node index: %s' % + (indent, node[1][0], broker.db_file, node[0], node[1])) + actual_type = container_type(broker) + if actual_type != expect_type: + print('%s ERROR expected %s but found %s' % + (indent, expect_type, actual_type)) + + +def print_own_shard_range(node, sr, indent_level): + indent = indent_level * TAB + range = '%r - %r' % (sr.lower, sr.upper) + print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), ' + 'modified: %s (%s), %7s: %s (%s), deleted: %s, epoch: %s' % + (indent, node[1][0], range, sr.object_count, sr.bytes_used, + sr.timestamp.isoformat, sr.timestamp.internal, + sr.meta_timestamp.isoformat, sr.meta_timestamp.internal, + sr.state_text, sr.state_timestamp.isoformat, + sr.state_timestamp.internal, sr.deleted, + sr.epoch.internal if sr.epoch else None)) + + +def print_own_shard_range_info(node, shard_ranges, indent_level=0): + shard_ranges.sort(key=lambda x: x.deleted) + for sr in shard_ranges: + print_own_shard_range(node, sr, indent_level) + + +def print_shard_range(node, sr, indent_level): + indent = indent_level * TAB + range = '%r - %r' % (sr.lower, sr.upper) + print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), ' + 'modified: %s (%s), %7s: %s (%s), deleted: %s, epoch: %s %s' % + (indent, node[1][0], range, sr.object_count, sr.bytes_used, + sr.timestamp.isoformat, sr.timestamp.internal, + sr.meta_timestamp.isoformat, sr.meta_timestamp.internal, + sr.state_text, sr.state_timestamp.isoformat, + sr.state_timestamp.internal, sr.deleted, + sr.epoch.internal if sr.epoch else None, sr.name)) + + +def 
print_shard_range_info(node, shard_ranges, indent_level=0): + shard_ranges.sort(key=lambda x: x.deleted) + for sr in shard_ranges: + print_shard_range(node, sr, indent_level) + + +def print_sharding_info(node, broker, indent_level=0): + indent = indent_level * TAB + print('%s(%s) %s' % (indent, node[1][0], broker.get_sharding_sysmeta())) + + +def print_container(name, name2nodes2brokers, expect_type='ROOT', + indent_level=0, used_names=None): + used_names = used_names or set() + indent = indent_level * TAB + node2broker = name2nodes2brokers[name] + ordered_by_index = sorted(node2broker.keys(), key=lambda x: x[1]) + brokers = [(node, node2broker[node]) for node in ordered_by_index] + + print('%sName: %s' % (indent, name)) + if name in used_names: + print('%s (Details already listed)\n' % indent) + return + + used_names.add(name) + print(indent + 'DB files:') + for node, broker in brokers: + print_db(node, broker, expect_type, indent_level=indent_level + 1) + + print(indent + 'Info:') + for node, broker in brokers: + print_broker_info(node, broker, indent_level=indent_level + 1) + + print(indent + 'Sharding info:') + for node, broker in brokers: + print_sharding_info(node, broker, indent_level=indent_level + 1) + print(indent + 'Own shard range:') + for node, broker in brokers: + shard_ranges = broker.get_shard_ranges( + include_deleted=True, include_own=True, exclude_others=True) + print_own_shard_range_info(node, shard_ranges, + indent_level=indent_level + 1) + print(indent + 'Shard ranges:') + shard_names = set() + for node, broker in brokers: + shard_ranges = broker.get_shard_ranges(include_deleted=True) + for sr_name in shard_ranges: + shard_names.add(sr_name.name) + print_shard_range_info(node, shard_ranges, + indent_level=indent_level + 1) + print(indent + 'Shards:') + for sr_name in shard_names: + print_container(sr_name, name2nodes2brokers, expect_type='SHARD', + indent_level=indent_level + 1, used_names=used_names) + print('\n') + + +def run(conf_paths): + # container_name -> (node id, node index) -> broker + name2nodes2brokers = defaultdict(dict) + for conf_path in conf_paths: + collect_brokers(conf_path, name2nodes2brokers) + + print('First column on each line is (node index)\n') + for name, node2broker in name2nodes2brokers.items(): + expect_root = False + for node, broker in node2broker.items(): + expect_root = broker.is_root_container() or expect_root + if expect_root: + print_container(name, name2nodes2brokers) + + +if __name__ == '__main__': + conf_dir = '/etc/swift/container-server' + conf_paths = [os.path.join(conf_dir, p) for p in os.listdir(conf_dir) + if p.endswith(('conf', 'conf.d'))] + run(conf_paths) diff --git a/swift/common/__init__.py b/swift/common/__init__.py index 880a66aa87..56aa6012d4 100644 --- a/swift/common/__init__.py +++ b/swift/common/__init__.py @@ -1 +1 @@ -""" Code common to all of Swift. """ +"""Code common to all of Swift.""" diff --git a/swift/common/base_storage_server.py b/swift/common/base_storage_server.py new file mode 100644 index 0000000000..7ba30703d2 --- /dev/null +++ b/swift/common/base_storage_server.py @@ -0,0 +1,141 @@ +# Copyright (c) 2010-2014 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import time +import functools + +from swift import __version__ as swift_version +from swift.common.utils import public, config_true_value, \ + LOG_LINE_DEFAULT_FORMAT +from swift.common.http import is_server_error +from swift.common.swob import Response, HTTPException + + +def labeled_timing_stats(metric, **dec_kwargs): + """ + Returns a decorator that emits labeled metrics timing events or errors + for public methods in swift's wsgi server controllers, based on response + code. + + The controller methods are not allowed to override the following labels: + 'method', 'status'. + """ + def decorating_func(func): + + @functools.wraps(func) + def _timing_stats(ctrl, req, *args, **kwargs): + labels = {} + start_time = time.time() + req_method = req.method + try: + resp = func( + ctrl, req, *args, timing_stats_labels=labels, **kwargs) + except HTTPException as e: + resp = e + labels['method'] = req_method + labels['status'] = resp.status_int + + ctrl.statsd.timing_since(metric, start_time, labels=labels, + **dec_kwargs) + return resp + + return _timing_stats + return decorating_func + + +def timing_stats(**dec_kwargs): + """ + Returns a decorator that logs timing events or errors for public methods in + swift's wsgi server controllers, based on response code. + """ + def decorating_func(func): + method = func.__name__ + + @functools.wraps(func) + def _timing_stats(ctrl, *args, **kwargs): + start_time = time.time() + try: + resp = func(ctrl, *args, **kwargs) + except HTTPException as e: + resp = e + # .timing is for successful responses *or* error codes that are + # not Swift's fault. For example, 500 is definitely the server's + # fault, but 412 is an error code (4xx are all errors) that is + # due to a header the client sent. + # + # .errors.timing is for failures that *are* Swift's fault. + # Examples include 507 for an unmounted drive or 500 for an + # unhandled exception. + if not is_server_error(resp.status_int): + ctrl.logger.timing_since(method + '.timing', + start_time, **dec_kwargs) + else: + ctrl.logger.timing_since(method + '.errors.timing', + start_time, **dec_kwargs) + return resp + + return _timing_stats + return decorating_func + + +class BaseStorageServer(object): + """ + Implements common OPTIONS method for object, account, container servers. 
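As a rough illustration of how the timing_stats decorator above is meant to be used, a hypothetical subclass might look like the sketch below. MyController, the 'my-server' type and the GET handler are invented for the example; real servers additionally accept a conf dict and set up self.logger (and, for labeled_timing_stats, self.statsd).

    # sketch only: a publicly accessible, timed controller method
    from swift.common.base_storage_server import BaseStorageServer, timing_stats
    from swift.common.swob import Response
    from swift.common.utils import public


    class MyController(BaseStorageServer):
        server_type = 'my-server'  # shadows the abstract server_type property

        @public
        @timing_stats()
        def GET(self, req):
            # a success here emits GET.timing; a 5xx would emit GET.errors.timing
            return Response(status=200, request=req)

Because GET is marked @public, it would also appear in the Allow header produced by the OPTIONS handler below.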
+ """ + + def __init__(self, conf, **kwargs): + self._allowed_methods = None + self.replication_server = config_true_value( + conf.get('replication_server', 'true')) + self.log_format = conf.get('log_format', LOG_LINE_DEFAULT_FORMAT) + self.anonymization_method = conf.get('log_anonymization_method', 'md5') + self.anonymization_salt = conf.get('log_anonymization_salt', '') + + @property + def server_type(self): + raise NotImplementedError( + 'Storage nodes have not implemented the Server type.') + + @property + def allowed_methods(self): + if self._allowed_methods is None: + self._allowed_methods = [] + all_methods = inspect.getmembers(self, predicate=callable) + for name, m in all_methods: + if not getattr(m, 'publicly_accessible', False): + continue + if getattr(m, 'replication', False) and \ + not self.replication_server: + continue + self._allowed_methods.append(name) + self._allowed_methods.sort() + return self._allowed_methods + + @public + @timing_stats() + def OPTIONS(self, req): + """ + Base handler for OPTIONS requests + + :param req: swob.Request object + :returns: swob.Response object + """ + # Prepare the default response + headers = {'Allow': ', '.join(self.allowed_methods), + 'Server': '%s/%s' % (self.server_type, swift_version)} + resp = Response(status=200, request=req, headers=headers) + + return resp diff --git a/swift/common/bench.py b/swift/common/bench.py deleted file mode 100644 index 7eef51a57d..0000000000 --- a/swift/common/bench.py +++ /dev/null @@ -1,486 +0,0 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import sys -import uuid -import time -import random -import signal -import socket -import logging -from contextlib import contextmanager -from optparse import Values - -import eventlet -import eventlet.pools -from eventlet.green.httplib import CannotSendRequest - -from swift.common.utils import config_true_value, LogAdapter -import swiftclient as client -from swift.common import direct_client -from swift.common.http import HTTP_CONFLICT - -try: - import simplejson as json -except ImportError: - import json - - -def _func_on_containers(logger, conf, concurrency_key, func): - """Run a function on each container with concurrency.""" - - bench = Bench(logger, conf, []) - pool = eventlet.GreenPool(int(getattr(conf, concurrency_key))) - for container in conf.containers: - pool.spawn_n(func, bench.url, bench.token, container) - pool.waitall() - - -def delete_containers(logger, conf): - """Utility function to delete benchmark containers.""" - - def _deleter(url, token, container): - try: - client.delete_container(url, token, container) - except client.ClientException, e: - if e.http_status != HTTP_CONFLICT: - logger.warn("Unable to delete container '%s'. " - "Got http status '%d'." 
- % (container, e.http_status)) - - _func_on_containers(logger, conf, 'del_concurrency', _deleter) - - -def create_containers(logger, conf): - """Utility function to create benchmark containers.""" - - _func_on_containers(logger, conf, 'put_concurrency', client.put_container) - - -class SourceFile(object): - """ - Iterable, file-like object to lazily emit a bunch of zeros in - reasonable-size chunks. - - swift.common.direct_client wants iterables, but swiftclient wants - file-like objects where hasattr(thing, 'read') is true. Therefore, - this class can do both. - """ - - def __init__(self, size, chunk_size=1024 * 64): - self.pos = 0 - self.size = size - self.chunk_size = chunk_size - - def __iter__(self): - return self - - def __len__(self): - return self.size - - def next(self): - if self.pos >= self.size: - raise StopIteration - chunk_size = min(self.size - self.pos, self.chunk_size) - yield '0' * chunk_size - self.pos += chunk_size - - def read(self, desired_size): - chunk_size = min(self.size - self.pos, desired_size) - self.pos += chunk_size - return '0' * chunk_size - - -class ConnectionPool(eventlet.pools.Pool): - - def __init__(self, url, size): - self.url = url - eventlet.pools.Pool.__init__(self, size, size) - - def create(self): - return client.http_connection(self.url) - - -class BenchServer(object): - """ - A BenchServer binds to an IP/port and listens for bench jobs. A bench - job consists of the normal conf "dict" encoded in JSON, terminated with an - EOF. The log level is at least INFO, but DEBUG may also be specified in - the conf dict. - - The server will wait forever for jobs, running them one at a time. - """ - def __init__(self, logger, bind_ip, bind_port): - self.logger = logger - self.bind_ip = bind_ip - self.bind_port = int(bind_port) - - def run(self): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - self.logger.info('Binding to %s:%s', self.bind_ip, self.bind_port) - s.bind((self.bind_ip, self.bind_port)) - s.listen(20) - while True: - client, address = s.accept() - self.logger.debug('Accepting connection from %s:%s', *address) - client_file = client.makefile('rb+', 1) - json_data = client_file.read() - conf = Values(json.loads(json_data)) - - self.logger.info( - 'Starting run for %s:%s [put/get/del_concurrency: %s/%s/%s, ' - 'num_objects: %s, num_gets: %s]', address[0], address[1], - conf.put_concurrency, conf.get_concurrency, - conf.del_concurrency, conf.num_objects, conf.num_gets) - - logger = logging.getLogger('bench-server') - level = logging.DEBUG if conf.log_level.lower() == 'debug' \ - else logging.INFO - logger.setLevel(level) - loghandler = logging.StreamHandler(client_file) - logformat = logging.Formatter( - '%(server)s %(asctime)s %(levelname)s %(message)s') - loghandler.setFormatter(logformat) - logger.addHandler(loghandler) - logger = LogAdapter(logger, 'swift-bench-server') - - controller = BenchController(logger, conf) - try: - controller.run() - except socket.error: - logger.warning('Socket error', exc_info=1) - - logger.logger.removeHandler(loghandler) - client_file.close() - client.close() - - self.logger.info('...bench run completed; waiting for next run.') - - -class Bench(object): - - def __init__(self, logger, conf, names): - self.logger = logger - self.aborted = False - self.user = conf.user - self.key = conf.key - self.auth_url = conf.auth - self.use_proxy = config_true_value(conf.use_proxy) - self.auth_version = conf.auth_version - self.logger.info("Auth version: %s" % self.auth_version) - if self.use_proxy: - url, token = 
client.get_auth(self.auth_url, self.user, self.key, - auth_version=self.auth_version) - self.token = token - self.account = url.split('/')[-1] - if conf.url == '': - self.url = url - else: - self.url = conf.url - else: - self.token = 'SlapChop!' - self.account = conf.account - self.url = conf.url - self.ip, self.port = self.url.split('/')[2].split(':') - - self.object_size = int(conf.object_size) - self.object_sources = conf.object_sources - self.lower_object_size = int(conf.lower_object_size) - self.upper_object_size = int(conf.upper_object_size) - self.files = [] - if self.object_sources: - self.object_sources = self.object_sources.split() - self.files = [file(f, 'rb').read() for f in self.object_sources] - - self.put_concurrency = int(conf.put_concurrency) - self.get_concurrency = int(conf.get_concurrency) - self.del_concurrency = int(conf.del_concurrency) - self.total_objects = int(conf.num_objects) - self.total_gets = int(conf.num_gets) - self.timeout = int(conf.timeout) - self.devices = conf.devices.split() - self.names = names - self.conn_pool = ConnectionPool(self.url, - max(self.put_concurrency, - self.get_concurrency, - self.del_concurrency)) - - def _log_status(self, title): - total = time.time() - self.beginbeat - self.logger.info(_('%(complete)s %(title)s [%(fail)s failures], ' - '%(rate).01f/s'), - {'title': title, 'complete': self.complete, - 'fail': self.failures, - 'rate': (float(self.complete) / total)}) - - @contextmanager - def connection(self): - try: - hc = self.conn_pool.get() - try: - yield hc - except CannotSendRequest: - self.logger.info(_("CannotSendRequest. Skipping...")) - try: - hc.close() - except Exception: - pass - self.failures += 1 - hc = self.conn_pool.create() - finally: - self.conn_pool.put(hc) - - def run(self): - pool = eventlet.GreenPool(self.concurrency) - self.beginbeat = self.heartbeat = time.time() - self.heartbeat -= 13 # just to get the first report quicker - self.failures = 0 - self.complete = 0 - for i in xrange(self.total): - if self.aborted: - break - pool.spawn_n(self._run, i) - pool.waitall() - self._log_status(self.msg + ' **FINAL**') - - def _run(self, thread): - return - - -class DistributedBenchController(object): - """ - This class manages a distributed swift-bench run. For this Controller - class to make sense, the conf.bench_clients list must contain at least one - entry. - - The idea is to split the configured load between one or more - swift-bench-client processes, each of which use eventlet for concurrency. - We deliberately take a simple, naive approach with these limitations: - 1) Concurrency, num_objects, and num_gets are spread evenly between the - swift-bench-client processes. With a low concurrency to - swift-bench-client count ratio, rounding may result in a greater - than desired aggregate concurrency. - 2) Each swift-bench-client process runs independently so some may - finish up before others, i.e. the target aggregate concurrency is - not necessarily present the whole time. This may bias aggregate - reported rates lower than a more efficient architecture. - 3) Because of #2, some swift-bench-client processes may be running GETs - while others are still runinng their PUTs. Because of this - potential skew, distributed runs will not isolate one operation at a - time like a single swift-bench run will. - 3) Reported aggregate rates are simply the sum of each - swift-bench-client process reported FINAL number. That's probably - inaccurate somehow. - """ - - def __init__(self, logger, conf): - self.logger = logger - # ... 
INFO 1000 PUTS **FINAL** [0 failures], 34.9/s - self.final_re = re.compile( - 'INFO (\d+) (.*) \*\*FINAL\*\* \[(\d+) failures\], (\d+\.\d+)/s') - self.clients = conf.bench_clients - del conf.bench_clients - for k in ['put_concurrency', 'get_concurrency', 'del_concurrency', - 'num_objects', 'num_gets']: - setattr(conf, k, max(1, int(getattr(conf, k)) / len(self.clients))) - self.conf = conf - - def run(self): - eventlet.patcher.monkey_patch(socket=True) - pool = eventlet.GreenPool(size=len(self.clients)) - pile = eventlet.GreenPile(pool) - for client in self.clients: - pile.spawn(self.do_run, client) - results = { - 'PUTS': dict(count=0, failures=0, rate=0.0), - 'GETS': dict(count=0, failures=0, rate=0.0), - 'DEL': dict(count=0, failures=0, rate=0.0), - } - for result in pile: - for k, v in result.iteritems(): - target = results[k] - target['count'] += int(v['count']) - target['failures'] += int(v['failures']) - target['rate'] += float(v['rate']) - for k in ['PUTS', 'GETS', 'DEL']: - v = results[k] - self.logger.info('%d %s **FINAL** [%d failures], %.1f/s' % ( - v['count'], k, v['failures'], v['rate'])) - - def do_run(self, client): - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - ip, port = client.split(':') - s.connect((ip, int(port))) - s.sendall(json.dumps(self.conf.__dict__)) - s.shutdown(socket.SHUT_WR) - s_file = s.makefile('rb', 1) - result = {} - for line in s_file: - match = self.final_re.search(line) - if match: - g = match.groups() - result[g[1]] = { - 'count': g[0], - 'failures': g[2], - 'rate': g[3], - } - else: - sys.stderr.write('%s %s' % (client, line)) - return result - - -class BenchController(object): - - def __init__(self, logger, conf): - self.logger = logger - self.conf = conf - self.names = [] - self.delete = config_true_value(conf.delete) - self.gets = int(conf.num_gets) - self.aborted = False - - def sigint1(self, signum, frame): - if self.delete: - print >>sys.stderr, ( - 'SIGINT received; finishing up and running DELETE.\n' - 'Send one more SIGINT to exit *immediately*.') - self.aborted = True - if self.running and not isinstance(self.running, BenchDELETE): - self.running.aborted = True - signal.signal(signal.SIGINT, self.sigint2) - else: - self.sigint2(signum, frame) - - def sigint2(self, signum, frame): - sys.exit('Final SIGINT received.') - - def run(self): - signal.signal(signal.SIGINT, self.sigint1) - puts = BenchPUT(self.logger, self.conf, self.names) - self.running = puts - puts.run() - if self.gets and not self.aborted: - gets = BenchGET(self.logger, self.conf, self.names) - self.running = gets - gets.run() - if self.delete: - dels = BenchDELETE(self.logger, self.conf, self.names) - self.running = dels - dels.run() - - -class BenchDELETE(Bench): - - def __init__(self, logger, conf, names): - Bench.__init__(self, logger, conf, names) - self.concurrency = self.del_concurrency - self.total = len(names) - self.msg = 'DEL' - - def _run(self, thread): - if time.time() - self.heartbeat >= 15: - self.heartbeat = time.time() - self._log_status('DEL') - device, partition, name, container_name = self.names.pop() - with self.connection() as conn: - try: - if self.use_proxy: - client.delete_object(self.url, self.token, - container_name, name, http_conn=conn) - else: - node = {'ip': self.ip, 'port': self.port, 'device': device} - direct_client.direct_delete_object(node, partition, - self.account, - container_name, name) - except client.ClientException, e: - self.logger.debug(str(e)) - self.failures += 1 - self.complete += 1 - - -class BenchGET(Bench): - - def 
__init__(self, logger, conf, names): - Bench.__init__(self, logger, conf, names) - self.concurrency = self.get_concurrency - self.total = self.total_gets - self.msg = 'GETS' - - def _run(self, thread): - if time.time() - self.heartbeat >= 15: - self.heartbeat = time.time() - self._log_status('GETS') - device, partition, name, container_name = random.choice(self.names) - with self.connection() as conn: - try: - if self.use_proxy: - client.get_object(self.url, self.token, - container_name, name, http_conn=conn) - else: - node = {'ip': self.ip, 'port': self.port, 'device': device} - direct_client.direct_get_object(node, partition, - self.account, - container_name, name) - except client.ClientException, e: - self.logger.debug(str(e)) - self.failures += 1 - self.complete += 1 - - -class BenchPUT(Bench): - - def __init__(self, logger, conf, names): - Bench.__init__(self, logger, conf, names) - self.concurrency = self.put_concurrency - self.total = self.total_objects - self.msg = 'PUTS' - self.containers = conf.containers - - def _run(self, thread): - if time.time() - self.heartbeat >= 15: - self.heartbeat = time.time() - self._log_status('PUTS') - name = uuid.uuid4().hex - if self.object_sources: - source = random.choice(self.files) - elif self.upper_object_size > self.lower_object_size: - source = SourceFile(random.randint(self.lower_object_size, - self.upper_object_size)) - else: - source = SourceFile(self.object_size) - device = random.choice(self.devices) - partition = str(random.randint(1, 3000)) - container_name = random.choice(self.containers) - with self.connection() as conn: - try: - if self.use_proxy: - client.put_object(self.url, self.token, - container_name, name, source, - content_length=len(source), - http_conn=conn) - else: - node = {'ip': self.ip, 'port': self.port, 'device': device} - direct_client.direct_put_object(node, partition, - self.account, - container_name, name, - source, - content_length=len(source)) - except client.ClientException, e: - self.logger.debug(str(e)) - self.failures += 1 - else: - self.names.append((device, partition, name, container_name)) - self.complete += 1 diff --git a/swift/common/bufferedhttp.py b/swift/common/bufferedhttp.py index 00e58da756..62c07e1623 100644 --- a/swift/common/bufferedhttp.py +++ b/swift/common/bufferedhttp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,12 +26,23 @@ make all calls through httplib. """ -from urllib import quote +from swift.common import constraints +import http.client import logging import time +import socket -from eventlet.green.httplib import CONTINUE, HTTPConnection, HTTPMessage, \ - HTTPResponse, HTTPSConnection, _UNKNOWN +from eventlet.green.http.client import CONTINUE, HTTPConnection, \ + HTTPResponse, HTTPSConnection, _UNKNOWN, ImproperConnectionState +from urllib.parse import quote, parse_qsl, urlencode + +from eventlet.green.http import client as green_http_client + +# Apparently http.server uses this to decide when/whether to send a 431. +# Give it some slack, so the app is more likely to get the chance to reject +# with a 400 instead. 
+http.client._MAXHEADERS = constraints.MAX_HEADER_COUNT * 1.6 +green_http_client._MAXHEADERS = constraints.MAX_HEADER_COUNT * 1.6 class BufferedHTTPResponse(HTTPResponse): @@ -39,13 +50,24 @@ class BufferedHTTPResponse(HTTPResponse): def __init__(self, sock, debuglevel=0, strict=0, method=None): # pragma: no cover + # sock should be an eventlet.greenio.GreenSocket self.sock = sock - self.fp = sock.makefile('rb') + if sock is None: + # ...but it could be None if we close the connection as we're + # getting it wrapped up in a Response + self._real_socket = None + # No socket means no file-like -- set it to None like in + # HTTPResponse.close() + self.fp = None + else: + # sock.fd is a socket.socket, which should have a _real_close + self._real_socket = sock.fd + self.fp = sock.makefile('rb') self.debuglevel = debuglevel self.strict = strict self._method = method - self.msg = None + self._headers = self.msg = None # from the Status-Line of the response self.version = _UNKNOWN # HTTP-Version @@ -56,11 +78,45 @@ def __init__(self, sock, debuglevel=0, strict=0, self.chunk_left = _UNKNOWN # bytes left to read in current chunk self.length = _UNKNOWN # number of bytes left in response self.will_close = _UNKNOWN # conn will close at end of response + self._readline_buffer = b'' + + @property + def headers(self): + return self._headers + + @headers.setter + def headers(self, hdrs): + try: + header_payload = hdrs.get_payload() + except AttributeError: + pass + else: + if isinstance(header_payload, list) and len(header_payload) == 1: + header_payload = header_payload[0].get_payload() + if header_payload: + # This shouldn't be here. We must've bumped up against + # https://bugs.python.org/issue37093 + for line in header_payload.rstrip('\r\n').split('\n'): + if ':' not in line or line[:1] in ' \t': + # Well, we're no more broken than we were before... + # Should we support line folding? + # How can/should we handle a bad header line? + break + header, value = line.split(':', 1) + value = value.strip(' \t\n\r') + hdrs.add_header(header, value) + # Clear the payload now that all headers are present. + # Otherwise, we may double-up the headers parsed here + # if/when repeatedly setting the headers property. + hdrs.set_payload(None) + self._headers = hdrs def expect_response(self): if self.fp: self.fp.close() self.fp = None + if not self.sock: + raise ImproperConnectionState('Socket already closed') self.fp = self.sock.makefile('rb', 0) version, status, reason = self._read_status() if status != CONTINUE: @@ -70,12 +126,49 @@ def expect_response(self): self.status = status self.reason = reason.strip() self.version = 11 - self.msg = HTTPMessage(self.fp, 0) - self.msg.fp = None + self.headers = self.msg = http.client.parse_headers(self.fp) + + def read(self, amt=None): + if not self._readline_buffer: + return HTTPResponse.read(self, amt) + + if amt is None: + # Unbounded read: send anything we have buffered plus whatever + # is left. 
+ buffered = self._readline_buffer + self._readline_buffer = b'' + return buffered + HTTPResponse.read(self, amt) + elif amt <= len(self._readline_buffer): + # Bounded read that we can satisfy entirely from our buffer + res = self._readline_buffer[:amt] + self._readline_buffer = self._readline_buffer[amt:] + return res + else: + # Bounded read that wants more bytes than we have + smaller_amt = amt - len(self._readline_buffer) + buf = self._readline_buffer + self._readline_buffer = b'' + return buf + HTTPResponse.read(self, smaller_amt) + + def nuke_from_orbit(self): + """ + Terminate the socket with extreme prejudice. + + Closes the underlying socket regardless of whether or not anyone else + has references to it. Use this when you are certain that nobody else + you care about has a reference to this socket. + """ + if self._real_socket: + # Hopefully this is equivalent to py2's _real_socket.close()? + # TODO: verify that this does everything ^^^^ does for py2 + self._real_socket._real_close() + self._real_socket = None + self.close() def close(self): HTTPResponse.close(self) self.sock = None + self._real_socket = None class BufferedHTTPConnection(HTTPConnection): @@ -84,24 +177,41 @@ class BufferedHTTPConnection(HTTPConnection): def connect(self): self._connected_time = time.time() - return HTTPConnection.connect(self) + ret = HTTPConnection.connect(self) + self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + return ret def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0): + '''Send a request to the server. + + :param method: specifies an HTTP request method, e.g. 'GET'. + :param url: specifies the object being requested, e.g. '/index.html'. + :param skip_host: if True does not add automatically a 'Host:' header + :param skip_accept_encoding: if True does not add automatically an + 'Accept-Encoding:' header + ''' self._method = method self._path = url return HTTPConnection.putrequest(self, method, url, skip_host, skip_accept_encoding) + def putheader(self, header, value): + if not isinstance(header, bytes): + header = header.encode('latin-1') + HTTPConnection.putheader(self, header, value) + def getexpect(self): - response = BufferedHTTPResponse(self.sock, strict=self.strict, - method=self._method) + kwargs = {'method': self._method} + if hasattr(self, 'strict'): + kwargs['strict'] = self.strict + response = BufferedHTTPResponse(self.sock, **kwargs) response.expect_response() return response def getresponse(self): response = HTTPConnection.getresponse(self) - logging.debug(_("HTTP PERF: %(time).5f seconds to %(method)s " - "%(host)s:%(port)s %(path)s)"), + logging.debug("HTTP PERF: %(time).5f seconds to %(method)s " + "%(host)s:%(port)s %(path)s)", {'time': time.time() - self._connected_time, 'method': self._method, 'host': self.host, 'port': self.port, 'path': self._path}) @@ -126,12 +236,15 @@ def http_connect(ipaddr, port, device, partition, method, path, :param ssl: set True if SSL should be used (default: False) :returns: HTTPConnection object """ - if isinstance(path, unicode): - try: - path = path.encode("utf-8") - except UnicodeError: - pass # what should I do? 
- path = quote('/' + device + '/' + str(partition) + path) + if isinstance(path, str): + path = path.encode("utf-8") + if isinstance(device, str): + device = device.encode("utf-8") + if isinstance(partition, str): + partition = partition.encode('utf-8') + elif isinstance(partition, int): + partition = str(partition).encode('ascii') + path = quote(b'/' + device + b'/' + partition + path) return http_connect_raw( ipaddr, port, method, path, headers, query_string, ssl) @@ -159,11 +272,16 @@ def http_connect_raw(ipaddr, port, method, path, headers=None, else: conn = BufferedHTTPConnection('%s:%s' % (ipaddr, port)) if query_string: + # Round trip to ensure proper quoting + query_string = urlencode( + parse_qsl(query_string, keep_blank_values=True, + encoding='latin1'), + encoding='latin1') path += '?' + query_string conn.path = path conn.putrequest(method, path, skip_host=(headers and 'Host' in headers)) if headers: - for header, value in headers.iteritems(): + for header, value in headers.items(): conn.putheader(header, str(value)) conn.endheaders() return conn diff --git a/swift/common/constraints.py b/swift/common/constraints.py index 07bf44e494..204a0d029e 100644 --- a/swift/common/constraints.py +++ b/swift/common/constraints.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,89 +13,158 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import os +from os.path import isdir # tighter scoped import for mocking + +from configparser import ConfigParser, NoSectionError, NoOptionError import urllib -from ConfigParser import ConfigParser, NoSectionError, NoOptionError, \ - RawConfigParser +from swift.common import utils, exceptions from swift.common.swob import HTTPBadRequest, HTTPLengthRequired, \ - HTTPRequestEntityTooLarge + HTTPRequestEntityTooLarge, HTTPPreconditionFailed, HTTPNotImplemented, \ + HTTPException, wsgi_to_str, wsgi_to_bytes -constraints_conf = ConfigParser() -constraints_conf.read('/etc/swift/swift.conf') +MAX_FILE_SIZE = 5368709122 +MAX_META_NAME_LENGTH = 128 +MAX_META_VALUE_LENGTH = 256 +MAX_META_COUNT = 90 +MAX_META_OVERALL_SIZE = 4096 +MAX_HEADER_SIZE = 8192 +MAX_REQUEST_LINE = 8192 +MAX_OBJECT_NAME_LENGTH = 1024 +CONTAINER_LISTING_LIMIT = 10000 +ACCOUNT_LISTING_LIMIT = 10000 +MAX_ACCOUNT_NAME_LENGTH = 256 +MAX_CONTAINER_NAME_LENGTH = 256 +VALID_API_VERSIONS = ["v1", "v1.0"] +EXTRA_HEADER_COUNT = 0 +AUTO_CREATE_ACCOUNT_PREFIX = '.' 
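These module-level values are only compiled-in defaults: reload_constraints(), defined just below, re-reads the [swift-constraints] section of swift.conf (utils.SWIFT_CONF_FILE, typically /etc/swift/swift.conf) and rebinds them, which is what lets operators tune limits without patching code. A minimal sketch of the effect, assuming an operator has set max_object_name_length = 2048 in swift.conf; that value and the conf contents are illustrative assumptions, not part of this patch:

    from swift.common import constraints

    # Assume /etc/swift/swift.conf contains:
    #   [swift-constraints]
    #   max_object_name_length = 2048
    constraints.reload_constraints()

    # Only options actually present in swift.conf land in OVERRIDE_CONSTRAINTS...
    constraints.OVERRIDE_CONSTRAINTS.get('max_object_name_length')  # -> 2048
    # ...while EFFECTIVE_CONSTRAINTS always carries a value for every known
    # constraint, and the matching module global is rebound as well.
    constraints.EFFECTIVE_CONSTRAINTS['max_object_name_length']     # -> 2048
    constraints.MAX_OBJECT_NAME_LENGTH                              # -> 2048

Without the override in swift.conf, the .get() above returns None and the effective value stays at the 1024 default.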
+# If adding an entry to DEFAULT_CONSTRAINTS, note that +# these constraints are automatically published by the +# proxy server in responses to /info requests, with values +# updated by reload_constraints() +DEFAULT_CONSTRAINTS = { + 'max_file_size': MAX_FILE_SIZE, + 'max_meta_name_length': MAX_META_NAME_LENGTH, + 'max_meta_value_length': MAX_META_VALUE_LENGTH, + 'max_meta_count': MAX_META_COUNT, + 'max_meta_overall_size': MAX_META_OVERALL_SIZE, + 'max_header_size': MAX_HEADER_SIZE, + 'max_request_line': MAX_REQUEST_LINE, + 'max_object_name_length': MAX_OBJECT_NAME_LENGTH, + 'container_listing_limit': CONTAINER_LISTING_LIMIT, + 'account_listing_limit': ACCOUNT_LISTING_LIMIT, + 'max_account_name_length': MAX_ACCOUNT_NAME_LENGTH, + 'max_container_name_length': MAX_CONTAINER_NAME_LENGTH, + 'valid_api_versions': VALID_API_VERSIONS, + 'extra_header_count': EXTRA_HEADER_COUNT, + 'auto_create_account_prefix': AUTO_CREATE_ACCOUNT_PREFIX, +} -def constraints_conf_int(name, default): - try: - return int(constraints_conf.get('swift-constraints', name)) - except (NoSectionError, NoOptionError): - return default - - -#: Max file size allowed for objects -MAX_FILE_SIZE = constraints_conf_int('max_file_size', - 5368709122) # 5 * 1024 * 1024 * 1024 + 2 -#: Max length of the name of a key for metadata -MAX_META_NAME_LENGTH = constraints_conf_int('max_meta_name_length', 128) -#: Max length of the value of a key for metadata -MAX_META_VALUE_LENGTH = constraints_conf_int('max_meta_value_length', 256) -#: Max number of metadata items -MAX_META_COUNT = constraints_conf_int('max_meta_count', 90) -#: Max overall size of metadata -MAX_META_OVERALL_SIZE = constraints_conf_int('max_meta_overall_size', 4096) -#: Max object name length -MAX_OBJECT_NAME_LENGTH = constraints_conf_int('max_object_name_length', 1024) -#: Max object list length of a get request for a container -CONTAINER_LISTING_LIMIT = constraints_conf_int('container_listing_limit', - 10000) -#: Max container list length of a get request for an account -ACCOUNT_LISTING_LIMIT = constraints_conf_int('account_listing_limit', 10000) -#: Max account name length -MAX_ACCOUNT_NAME_LENGTH = constraints_conf_int('max_account_name_length', 256) -#: Max container name length -MAX_CONTAINER_NAME_LENGTH = constraints_conf_int('max_container_name_length', - 256) - - -#: Query string format= values to their corresponding content-type values -FORMAT2CONTENT_TYPE = {'plain': 'text/plain', 'json': 'application/json', - 'xml': 'application/xml'} +SWIFT_CONSTRAINTS_LOADED = False +OVERRIDE_CONSTRAINTS = {} # any constraints overridden by SWIFT_CONF_FILE +EFFECTIVE_CONSTRAINTS = {} # populated by reload_constraints + + +def reload_constraints(): + """ + Parse SWIFT_CONF_FILE and reset module level global constraint attrs, + populating OVERRIDE_CONSTRAINTS AND EFFECTIVE_CONSTRAINTS along the way. + """ + global SWIFT_CONSTRAINTS_LOADED, OVERRIDE_CONSTRAINTS + SWIFT_CONSTRAINTS_LOADED = False + OVERRIDE_CONSTRAINTS = {} + constraints_conf = ConfigParser() + if constraints_conf.read(utils.SWIFT_CONF_FILE): + SWIFT_CONSTRAINTS_LOADED = True + for name, default in DEFAULT_CONSTRAINTS.items(): + try: + value = constraints_conf.get('swift-constraints', name) + except NoOptionError: + pass + except NoSectionError: + # We are never going to find the section for another option + break + else: + if isinstance(default, int): + value = int(value) # Go ahead and let it error + elif isinstance(default, str): + pass # No translation needed, I guess + else: + # Hope we want a list! 
+ value = utils.list_from_csv(value) + OVERRIDE_CONSTRAINTS[name] = value + for name, default in DEFAULT_CONSTRAINTS.items(): + value = OVERRIDE_CONSTRAINTS.get(name, default) + EFFECTIVE_CONSTRAINTS[name] = value + # "globals" in this context is module level globals, always. + globals()[name.upper()] = value + + +reload_constraints() + + +# By default the maximum number of allowed headers depends on the number of max +# allowed metadata settings plus a default value of 36 for swift internally +# generated headers and regular http headers. If for some reason this is not +# enough (custom middleware for example) it can be increased with the +# extra_header_count constraint. +MAX_HEADER_COUNT = MAX_META_COUNT + 36 + max(EXTRA_HEADER_COUNT, 0) def check_metadata(req, target_type): """ - Check metadata sent in the request headers. + Check metadata sent in the request headers. This should only check + that the metadata in the request given is valid. Checks against + account/container overall metadata should be forwarded on to its + respective server to be checked. :param req: request object :param target_type: str: one of: object, container, or account: indicates which type the target storage for the metadata is - :raises HTTPBadRequest: bad metadata + :returns: HTTPBadRequest with bad metadata otherwise None """ - prefix = 'x-%s-meta-' % target_type.lower() + target_type = target_type.lower() + prefix = 'x-%s-meta-' % target_type meta_count = 0 meta_size = 0 - for key, value in req.headers.iteritems(): + for key, value in req.headers.items(): + if (isinstance(value, str) + and len(value) > MAX_HEADER_SIZE): + + return HTTPBadRequest(body=b'Header value too long: %s' % + wsgi_to_bytes(key[:MAX_META_NAME_LENGTH]), + request=req, content_type='text/plain') if not key.lower().startswith(prefix): continue key = key[len(prefix):] if not key: return HTTPBadRequest(body='Metadata name cannot be empty', request=req, content_type='text/plain') + bad_key = not check_utf8(wsgi_to_str(key)) + bad_value = value and not check_utf8(wsgi_to_str(value)) + if target_type in ('account', 'container') and (bad_key or bad_value): + return HTTPBadRequest(body='Metadata must be valid UTF-8', + request=req, content_type='text/plain') meta_count += 1 meta_size += len(key) + len(value) if len(key) > MAX_META_NAME_LENGTH: return HTTPBadRequest( - body='Metadata name too long; max %d' % MAX_META_NAME_LENGTH, + body=wsgi_to_bytes('Metadata name too long: %s%s' % ( + prefix, key)), request=req, content_type='text/plain') - elif len(value) > MAX_META_VALUE_LENGTH: + if len(value) > MAX_META_VALUE_LENGTH: return HTTPBadRequest( - body='Metadata value too long; max %d' % MAX_META_VALUE_LENGTH, + body=wsgi_to_bytes('Metadata value longer than %d: %s%s' % ( + MAX_META_VALUE_LENGTH, prefix, key)), request=req, content_type='text/plain') - elif meta_count > MAX_META_COUNT: + if meta_count > MAX_META_COUNT: return HTTPBadRequest( body='Too many metadata items; max %d' % MAX_META_COUNT, request=req, content_type='text/plain') - elif meta_size > MAX_META_OVERALL_SIZE: + if meta_size > MAX_META_OVERALL_SIZE: return HTTPBadRequest( body='Total metadata too large; max %d' % MAX_META_OVERALL_SIZE, @@ -109,61 +178,102 @@ def check_object_creation(req, object_name): :param req: HTTP request object :param object_name: name of object to be created - :raises HTTPRequestEntityTooLarge: the object is too large - :raises HTTPLengthRequered: missing content-length header and not - a chunked request - :raises HTTPBadRequest: missing or bad 
content-type header, or - bad metadata + :returns: HTTPRequestEntityTooLarge -- the object is too large + :returns: HTTPLengthRequired -- missing content-length header and not + a chunked request + :returns: HTTPBadRequest -- missing or bad content-type header, or + bad metadata + :returns: HTTPNotImplemented -- unsupported transfer-encoding header value """ - if req.content_length and req.content_length > MAX_FILE_SIZE: + try: + ml = req.message_length() + except ValueError as e: + return HTTPBadRequest(request=req, content_type='text/plain', + body=str(e)) + except AttributeError as e: + return HTTPNotImplemented(request=req, content_type='text/plain', + body=str(e)) + if ml is not None and ml > MAX_FILE_SIZE: return HTTPRequestEntityTooLarge(body='Your request is too large.', request=req, content_type='text/plain') if req.content_length is None and \ req.headers.get('transfer-encoding') != 'chunked': - return HTTPLengthRequired(request=req) - if 'X-Copy-From' in req.headers and req.content_length: - return HTTPBadRequest(body='Copy requests require a zero byte body', - request=req, content_type='text/plain') + return HTTPLengthRequired(body='Missing Content-Length header.', + request=req, + content_type='text/plain') + if len(object_name) > MAX_OBJECT_NAME_LENGTH: return HTTPBadRequest(body='Object name length of %d longer than %d' % (len(object_name), MAX_OBJECT_NAME_LENGTH), request=req, content_type='text/plain') + if 'Content-Type' not in req.headers: return HTTPBadRequest(request=req, content_type='text/plain', - body='No content type') - if not check_utf8(req.headers['Content-Type']): + body=b'No content type') + + try: + req = check_delete_headers(req) + except HTTPException as e: + return HTTPBadRequest(request=req, body=e.body, + content_type='text/plain') + + if not check_utf8(wsgi_to_str(req.headers['Content-Type'])): return HTTPBadRequest(request=req, body='Invalid Content-Type', content_type='text/plain') - if 'x-object-manifest' in req.headers: - value = req.headers['x-object-manifest'] - container = prefix = None - try: - container, prefix = value.split('/', 1) - except ValueError: - pass - if not container or not prefix or '?' in value or '&' in value or \ - prefix[0] == '/': - return HTTPBadRequest( - request=req, - body='X-Object-Manifest must in the format container/prefix') return check_metadata(req, 'object') +def check_dir(root, drive): + """ + Verify that the path to the device is a directory and is a lesser + constraint that is enforced when a full mount_check isn't possible + with, for instance, a VM using loopback or partitions. + + :param root: base path where the dir is + :param drive: drive name to be checked + :returns: full path to the device + :raises ValueError: if drive fails to validate + """ + return check_drive(root, drive, False) + + def check_mount(root, drive): """ Verify that the path to the device is a mount point and mounted. This allows us to fast fail on drives that have been unmounted because of - issues, and also prevents us for accidently filling up the root partition. + issues, and also prevents us for accidentally filling up the root + partition. 
:param root: base path where the devices are mounted :param drive: drive name to be checked - :returns: True if it is a valid mounted device, False otherwise + :returns: full path to the device + :raises ValueError: if drive fails to validate """ - if not (urllib.quote_plus(drive) == drive): - return False + return check_drive(root, drive, True) + + +def check_drive(root, drive, mount_check): + """ + Validate the path given by root and drive is a valid existing directory. + + :param root: base path where the devices are mounted + :param drive: drive name to be checked + :param mount_check: additionally require path is mounted + + :returns: full path to the device + :raises ValueError: if drive fails to validate + """ + if not (urllib.parse.quote_plus(drive) == drive): + raise ValueError('%s is not a valid drive name' % drive) path = os.path.join(root, drive) - return os.path.exists(path) and os.path.ismount(path) + if mount_check: + if not utils.ismount(path): + raise ValueError('%s is not mounted' % path) + else: + if not isdir(path): + raise ValueError('%s is not a directory' % path) + return path def check_float(string): @@ -180,22 +290,149 @@ def check_float(string): return False -def check_utf8(string): +def valid_timestamp(request): + """ + Helper function to extract a timestamp from requests that require one. + + :param request: the swob request object + + :returns: a valid Timestamp instance + :raises HTTPBadRequest: on missing or invalid X-Timestamp + """ + try: + return request.timestamp + except exceptions.InvalidTimestamp as e: + raise HTTPBadRequest(body=str(e), request=request, + content_type='text/plain') + + +def check_delete_headers(request): + """ + Check that 'x-delete-after' and 'x-delete-at' headers have valid values. + Values should be positive integers and correspond to a time greater than + the request timestamp. + + If the 'x-delete-after' header is found then its value is used to compute + an 'x-delete-at' value which takes precedence over any existing + 'x-delete-at' header. + + :param request: the swob request object + :raises: HTTPBadRequest in case of invalid values + :returns: the swob request object + """ + now = float(valid_timestamp(request)) + if 'x-delete-after' in request.headers: + try: + x_delete_after = int(request.headers['x-delete-after']) + except ValueError: + raise HTTPBadRequest(request=request, + content_type='text/plain', + body='Non-integer X-Delete-After') + actual_del_time = utils.normalize_delete_at_timestamp( + now + x_delete_after) + if int(actual_del_time) <= now: + raise HTTPBadRequest(request=request, + content_type='text/plain', + body='X-Delete-After in past') + request.headers['x-delete-at'] = actual_del_time + del request.headers['x-delete-after'] + + if 'x-delete-at' in request.headers: + try: + x_delete_at = int(utils.normalize_delete_at_timestamp( + int(request.headers['x-delete-at']))) + except ValueError: + raise HTTPBadRequest(request=request, content_type='text/plain', + body='Non-integer X-Delete-At') + + if x_delete_at <= now and not utils.config_true_value( + request.headers.get('x-backend-replication', 'f')): + raise HTTPBadRequest(request=request, content_type='text/plain', + body='X-Delete-At in past') + return request + + +def check_utf8(string, internal=False): """ - Validate if a string is valid UTF-8 str or unicode + Validate if a string is valid UTF-8 str or unicode and that it + does not contain any reserved characters. 
:param string: string to be validated - :returns: True if the string is valid utf-8 str or unicode, False otherwise + :param internal: boolean, allows reserved characters if True + :returns: True if the string is valid utf-8 str or unicode and + contains no null characters, False otherwise """ if not string: return False try: - if isinstance(string, unicode): - string.encode('utf-8') + if isinstance(string, str): + encoded = string.encode('utf-8') + decoded = string else: - string.decode('UTF-8') - return True + encoded = string + decoded = string.decode('UTF-8') + if decoded.encode('UTF-8') != encoded: + return False + # A UTF-8 string with surrogates in it is invalid. + # + # Note: this check is only useful on Python 2. On Python 3, a + # bytestring with a UTF-8-encoded surrogate codepoint is (correctly) + # treated as invalid, so the decode() call above will fail. + # + # Note 2: this check requires us to use a wide build of Python 2. On + # narrow builds of Python 2, potato = u"\U0001F954" will have length + # 2, potato[0] == u"\ud83e" (surrogate), and potato[1] == u"\udda0" + # (also a surrogate), so even if it is correctly UTF-8 encoded as + # b'\xf0\x9f\xa6\xa0', it will not pass this check. Fortunately, + # most Linux distributions build Python 2 wide, and Python 3.3+ + # removed the wide/narrow distinction entirely. + if any(0xD800 <= ord(codepoint) <= 0xDFFF + for codepoint in decoded): + return False + if b'\x00' != utils.RESERVED_BYTE and b'\x00' in encoded: + return False + return True if internal else utils.RESERVED_BYTE not in encoded # If string is unicode, decode() will raise UnicodeEncodeError # So, we should catch both UnicodeDecodeError & UnicodeEncodeError except UnicodeError: return False + + +def check_name_format(req, name, target_type): + """ + Validate that the header contains valid account or container name. + + :param req: HTTP request object + :param name: header value to validate + :param target_type: which header is being validated (Account or Container) + :returns: A properly encoded account name or container name + :raise HTTPPreconditionFailed: if account header + is not well formatted. + """ + if not name: + raise HTTPPreconditionFailed( + request=req, + body='%s name cannot be empty' % target_type) + if '/' in name: + raise HTTPPreconditionFailed( + request=req, + body='%s name cannot contain slashes' % target_type) + return name + + +check_account_format = functools.partial(check_name_format, + target_type='Account') +check_container_format = functools.partial(check_name_format, + target_type='Container') + + +def valid_api_version(version): + """ + Checks if the requested version is valid. + + Currently Swift only supports "v1" and "v1.0". + """ + global VALID_API_VERSIONS + if not isinstance(VALID_API_VERSIONS, list): + VALID_API_VERSIONS = [str(VALID_API_VERSIONS)] + return version in VALID_API_VERSIONS diff --git a/swift/common/container_sync_realms.py b/swift/common/container_sync_realms.py new file mode 100644 index 0000000000..a9832ba437 --- /dev/null +++ b/swift/common/container_sync_realms.py @@ -0,0 +1,167 @@ +# Copyright (c) 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import hashlib +import hmac +import os +import time + +import configparser + +from swift.common.utils import get_valid_utf8_str + + +class ContainerSyncRealms(object): + """ + Loads and parses the container-sync-realms.conf, occasionally + checking the file's mtime to see if it needs to be reloaded. + """ + + def __init__(self, conf_path, logger): + self.conf_path = conf_path + self.logger = logger + self.next_mtime_check = 0 + self.mtime_check_interval = 300 + self.conf_path_mtime = 0 + self.data = {} + self.reload() + + def reload(self): + """Forces a reload of the conf file.""" + self.next_mtime_check = 0 + self.conf_path_mtime = 0 + self._reload() + + def _reload(self): + now = time.time() + if now >= self.next_mtime_check: + self.next_mtime_check = now + self.mtime_check_interval + try: + mtime = os.path.getmtime(self.conf_path) + except OSError as err: + if err.errno == errno.ENOENT: + log_func = self.logger.debug + else: + log_func = self.logger.error + log_func('Could not load %(conf)r: %(error)s', { + 'conf': self.conf_path, 'error': err}) + else: + if mtime != self.conf_path_mtime: + self.conf_path_mtime = mtime + try: + conf = configparser.ConfigParser() + conf.read(self.conf_path) + except configparser.ParsingError as err: + self.logger.error( + 'Could not load %(conf)r: %(error)s', + {'conf': self.conf_path, 'error': err}) + else: + try: + self.mtime_check_interval = conf.getfloat( + 'DEFAULT', 'mtime_check_interval') + self.next_mtime_check = \ + now + self.mtime_check_interval + except configparser.NoOptionError: + self.mtime_check_interval = 300 + self.next_mtime_check = \ + now + self.mtime_check_interval + except (configparser.ParsingError, ValueError) as err: + self.logger.error( + 'Error in %(conf)r with ' + 'mtime_check_interval: %(error)s', + {'conf': self.conf_path, 'error': err}) + realms = {} + for section in conf.sections(): + realm = {} + clusters = {} + for option, value in conf.items(section): + if option in ('key', 'key2'): + realm[option] = value + elif option.startswith('cluster_'): + clusters[option[8:].upper()] = value + realm['clusters'] = clusters + realms[section.upper()] = realm + self.data = realms + + def realms(self): + """Returns a list of realms.""" + self._reload() + return list(self.data.keys()) + + def key(self, realm): + """Returns the key for the realm.""" + self._reload() + result = self.data.get(realm.upper()) + if result: + result = result.get('key') + return result + + def key2(self, realm): + """Returns the key2 for the realm.""" + self._reload() + result = self.data.get(realm.upper()) + if result: + result = result.get('key2') + return result + + def clusters(self, realm): + """Returns a list of clusters for the realm.""" + self._reload() + result = self.data.get(realm.upper()) + if result: + result = result.get('clusters') + if result: + result = list(result.keys()) + return result or [] + + def endpoint(self, realm, cluster): + """Returns the endpoint for the cluster in the realm.""" + self._reload() + result = None + realm_data = self.data.get(realm.upper()) + if realm_data: + cluster_data = 
realm_data.get('clusters') + if cluster_data: + result = cluster_data.get(cluster.upper()) + return result + + def get_sig(self, request_method, path, x_timestamp, nonce, realm_key, + user_key): + """ + Returns the hexdigest string of the HMAC-SHA1 (RFC 2104) for + the information given. + + :param request_method: HTTP method of the request. + :param path: The path to the resource (url-encoded). + :param x_timestamp: (str) The X-Timestamp header value for the request. + :param nonce: A unique value for the request. + :param realm_key: Shared secret at the cluster operator level. + :param user_key: Shared secret at the user's container level. + :returns: hexdigest str of the HMAC-SHA1 for the request. + """ + nonce = get_valid_utf8_str(nonce) + realm_key = get_valid_utf8_str(realm_key) + user_key = get_valid_utf8_str(user_key) + # XXX We don't know what is the best here yet; wait for container + # sync to be tested. + if isinstance(path, str): + path = path.encode('utf-8') + return hmac.new( + realm_key, + b'%s\n%s\n%s\n%s\n%s' % ( + request_method.encode('ascii'), path, + x_timestamp.encode('ascii'), nonce, user_key), + hashlib.sha1).hexdigest() diff --git a/swift/common/daemon.py b/swift/common/daemon.py index 009619cc78..d6c431b6d1 100644 --- a/swift/common/daemon.py +++ b/swift/common/daemon.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,18 +13,39 @@ # See the License for the specific language governing permissions and # limitations under the License. +import errno import os import sys +import time import signal from re import sub +import eventlet import eventlet.debug from swift.common import utils class Daemon(object): - """Daemon base class""" + """ + Daemon base class + + A daemon has a run method that accepts a ``once`` kwarg and will dispatch + to :meth:`run_once` or :meth:`run_forever`. + + A subclass of Daemon must implement :meth:`run_once` and + :meth:`run_forever`. + + A subclass of Daemon may override :meth:`get_worker_args` to dispatch + arguments to individual child process workers and :meth:`is_healthy` to + perform context specific periodic wellness checks which can reset worker + arguments. + + Implementations of Daemon do not know *how* to daemonize, or execute + multiple daemonized workers, they simply provide the behavior of the daemon + and context specific knowledge about how workers should be started. + """ + WORKERS_HEALTHCHECK_INTERVAL = 5.0 def __init__(self, conf): self.conf = conf @@ -39,42 +60,240 @@ def run_forever(self, *args, **kwargs): raise NotImplementedError('run_forever not implemented') def run(self, once=False, **kwargs): - """Run the daemon""" + if once: + self.run_once(**kwargs) + else: + self.run_forever(**kwargs) + + def post_multiprocess_run(self): + """ + Override this to do something after running using multiple worker + processes. This method is called in the parent process. + + This is probably only useful for run-once mode since there is no + "after running" in run-forever mode. + """ + pass + + def get_worker_args(self, once=False, **kwargs): + """ + For each worker yield a (possibly empty) dict of kwargs to pass along + to the daemon's :meth:`run` method after fork. The length of elements + returned from this method will determine the number of processes + created. 
+ + If the returned iterable is empty, the Strategy will fallback to + run-inline strategy. + + :param once: False if the worker(s) will be daemonized, True if the + worker(s) will be run once + :param kwargs: plumbed through via command line argparser + + :returns: an iterable of dicts, each element represents the kwargs to + be passed to a single worker's :meth:`run` method after fork. + """ + return [] + + def is_healthy(self): + """ + This method is called very frequently on the instance of the daemon + held by the parent process. If it returns False, all child workers are + terminated, and new workers will be created. + + :returns: a boolean, True only if all workers should continue to run + """ + return True + + +class DaemonStrategy(object): + """ + This is the execution strategy for using subclasses of Daemon. The default + behavior is to invoke the daemon's :meth:`Daemon.run` method from within + the parent process. When the :meth:`Daemon.run` method returns the parent + process will exit. + + However, if the Daemon returns a non-empty iterable from + :meth:`Daemon.get_worker_args`, the daemon's :meth:`Daemon.run` method will + be invoked in child processes, with the arguments provided from the parent + process's instance of the daemon. If a child process exits it will be + restarted with the same options, unless it was executed in once mode. + + :param daemon: an instance of a :class:`Daemon` (has a `run` method) + :param logger: a logger instance + """ + + def __init__(self, daemon, logger): + self.daemon = daemon + self.logger = logger + self.running = False + # only used by multi-worker strategy + self.options_by_pid = {} + self.unspawned_worker_options = [] + + def setup(self, **kwargs): utils.validate_configuration() - utils.drop_privileges(self.conf.get('user', 'swift')) + utils.drop_privileges(self.daemon.conf.get('user', 'swift')) + utils.clean_up_daemon_hygiene() utils.capture_stdio(self.logger, **kwargs) def kill_children(*args): + self.running = False + self.logger.notice('SIGTERM received (%s)', os.getpid()) signal.signal(signal.SIGTERM, signal.SIG_IGN) os.killpg(0, signal.SIGTERM) - sys.exit() + os._exit(0) signal.signal(signal.SIGTERM, kill_children) - if once: - self.run_once(**kwargs) + self.running = True + utils.systemd_notify(self.logger) + + def _run_inline(self, once=False, **kwargs): + """Run the daemon""" + self.daemon.run(once=once, **kwargs) + + def run(self, once=False, **kwargs): + """Daemonize and execute our strategy""" + self.setup(**kwargs) + try: + self._run(once=once, **kwargs) + except KeyboardInterrupt: + self.logger.notice('User quit') + finally: + self.cleanup(stopping=True) + self.running = False + + def _fork(self, once, **kwargs): + pid = os.fork() + if pid == 0: + signal.signal(signal.SIGHUP, signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) + # only MAINPID should be sending notifications + os.environ.pop('NOTIFY_SOCKET', None) + + self.daemon.run(once, **kwargs) + + self.logger.debug('Forked worker %s finished', os.getpid()) + # do not return from this stack, nor execute any finally blocks + os._exit(0) else: - self.run_forever(**kwargs) + self.register_worker_start(pid, kwargs) + return pid + + def iter_unspawned_workers(self): + while True: + try: + per_worker_options = self.unspawned_worker_options.pop() + except IndexError: + return + yield per_worker_options + + def spawned_pids(self): + return list(self.options_by_pid.keys()) + + def register_worker_start(self, pid, per_worker_options): + self.logger.debug('Spawned 
worker %s with %r', pid, per_worker_options) + self.options_by_pid[pid] = per_worker_options + + def register_worker_exit(self, pid): + self.unspawned_worker_options.append(self.options_by_pid.pop(pid)) + + def ask_daemon_to_prepare_workers(self, once, **kwargs): + self.unspawned_worker_options = list( + self.daemon.get_worker_args(once=once, **kwargs)) + + def abort_workers_if_daemon_would_like(self): + if not self.daemon.is_healthy(): + self.logger.debug( + 'Daemon needs to change options, aborting workers') + self.cleanup() + return True + return False + + def check_on_all_running_workers(self): + for p in self.spawned_pids(): + try: + pid, status = os.waitpid(p, os.WNOHANG) + except OSError as err: + if err.errno not in (errno.EINTR, errno.ECHILD): + raise + self.logger.notice('Worker %s died', p) + else: + if pid == 0: + # child still running + continue + self.logger.debug('Worker %s exited', p) + self.register_worker_exit(p) + + def _run(self, once, **kwargs): + self.ask_daemon_to_prepare_workers(once, **kwargs) + if not self.unspawned_worker_options: + return self._run_inline(once, **kwargs) + for per_worker_options in self.iter_unspawned_workers(): + if self._fork(once, **per_worker_options) == 0: + return 0 + while self.running: + if self.abort_workers_if_daemon_would_like(): + self.ask_daemon_to_prepare_workers(once, **kwargs) + self.check_on_all_running_workers() + if not once: + for per_worker_options in self.iter_unspawned_workers(): + if self._fork(once, **per_worker_options) == 0: + return 0 + else: + if not self.spawned_pids(): + self.logger.notice('Finished %s', os.getpid()) + break + time.sleep(self.daemon.WORKERS_HEALTHCHECK_INTERVAL) + self.daemon.post_multiprocess_run() + return 0 + + def cleanup(self, stopping=False): + """ + Cleanup worker processes + + :param stopping: if set, tell systemd we're stopping + """ + + if stopping: + utils.systemd_notify(self.logger, "STOPPING=1") + for p in self.spawned_pids(): + try: + os.kill(p, signal.SIGTERM) + except OSError as err: + if err.errno not in (errno.ESRCH, errno.EINTR, errno.ECHILD): + raise + self.register_worker_exit(p) + self.logger.debug('Cleaned up worker %s', p) def run_daemon(klass, conf_file, section_name='', once=False, **kwargs): """ - Loads settings from conf, then instantiates daemon "klass" and runs the - daemon with the specified once kwarg. The section_name will be derived - from the daemon "klass" if not provided (e.g. ObjectReplicator => + Loads settings from conf, then instantiates daemon ``klass`` and runs the + daemon with the specified ``once`` kwarg. The section_name will be derived + from the daemon ``klass`` if not provided (e.g. ObjectReplicator => object-replicator). 
- :param klass: Class to instantiate, subclass of common.daemon.Daemon + :param klass: Class to instantiate, subclass of :class:`Daemon` :param conf_file: Path to configuration file :param section_name: Section name from conf file to load config from - :param once: Passed to daemon run method + :param once: Passed to daemon :meth:`Daemon.run` method """ # very often the config section_name is based on the class name # the None singleton will be passed through to readconf as is - if section_name is '': + if section_name == '': section_name = sub(r'([a-z])([A-Z])', r'\1-\2', klass.__name__).lower() - conf = utils.readconf(conf_file, section_name, - log_name=kwargs.get('log_name')) + try: + conf = utils.readconf(conf_file, section_name, + log_name=kwargs.get('log_name')) + except (ValueError, IOError) as e: + # The message will be printed to stderr + # and results in an exit code of 1. + sys.exit(e) + + # patch eventlet/logging early + utils.monkey_patch() + eventlet.hubs.use_hub(utils.get_hub()) # once on command line (i.e. daemonize=false) will over-ride config once = once or not utils.config_true_value(conf.get('daemonize', 'true')) @@ -87,16 +306,30 @@ def run_daemon(klass, conf_file, section_name='', once=False, **kwargs): log_to_console=kwargs.pop('verbose', False), log_route=section_name) + # optional nice/ionice priority scheduling + utils.modify_priority(conf, logger) + # disable fallocate if desired if utils.config_true_value(conf.get('disable_fallocate', 'no')): utils.disable_fallocate() + # set utils.FALLOCATE_RESERVE if desired + utils.FALLOCATE_RESERVE, utils.FALLOCATE_IS_PERCENT = \ + utils.config_fallocate_value(conf.get('fallocate_reserve', '1%')) # By default, disable eventlet printing stacktraces eventlet_debug = utils.config_true_value(conf.get('eventlet_debug', 'no')) eventlet.debug.hub_exceptions(eventlet_debug) + # Ensure TZ environment variable exists to avoid stat('/etc/localtime') on + # some platforms. This locks in reported times to UTC. + os.environ['TZ'] = 'UTC+0' + time.tzset() + + logger.notice('Starting %s', os.getpid()) try: - klass(conf).run(once=once, **kwargs) + d = klass(conf) + DaemonStrategy(d, logger).run(once=once, **kwargs) except KeyboardInterrupt: logger.info('User quit') - logger.info('Exited') + logger.notice('Exited %s', os.getpid()) + return d diff --git a/swift/common/db.py b/swift/common/db.py index 06f7c6295f..447228f130 100644 --- a/swift/common/db.py +++ b/swift/common/db.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,39 +15,77 @@ """ Database code for Swift """ -from __future__ import with_statement -from contextlib import contextmanager -import hashlib +from contextlib import contextmanager, closing +import base64 +import json import logging -import operator import os from uuid import uuid4 -import sys import time -import cPickle as pickle import errno +import pickle # nosec: B403 from tempfile import mkstemp from eventlet import sleep, Timeout import sqlite3 -from swift.common.utils import json, normalize_timestamp, renamer, \ - mkdirs, lock_parent_directory, fallocate +from swift.common.constraints import MAX_META_COUNT, MAX_META_OVERALL_SIZE, \ + check_utf8 +from swift.common.utils import Timestamp, renamer, \ + mkdirs, lock_parent_directory, fallocate, md5 from swift.common.exceptions import LockTimeout +from swift.common.swob import HTTPBadRequest #: Whether calls will be made to preallocate disk space for database files. -DB_PREALLOCATION = True +DB_PREALLOCATION = False +#: Whether calls will be made to log queries (py3 only) +QUERY_LOGGING = False #: Timeout for trying to connect to a DB BROKER_TIMEOUT = 25 #: Pickle protocol to use PICKLE_PROTOCOL = 2 -#: Max number of pending entries +#: Max size of .pending file in bytes. When this is exceeded, the pending +# records will be merged. PENDING_CAP = 131072 +SQLITE_ARG_LIMIT = 999 +RECLAIM_PAGE_SIZE = 10000 -def utf8encode(*args): - return [(s.encode('utf8') if isinstance(s, unicode) else s) for s in args] + +def native_str_keys_and_values(metadata): + bin_keys = [k for k in metadata if isinstance(k, bytes)] + for k in bin_keys: + sv = metadata[k] + del metadata[k] + metadata[k.decode('utf-8')] = [ + x.decode('utf-8') if isinstance(x, bytes) else x + for x in sv] + + +ZERO_LIKE_VALUES = {None, '', 0, '0'} + + +def zero_like(count): + """ + We've cargo culted our consumers to be tolerant of various expressions of + zero in our databases for backwards compatibility with less disciplined + producers. 
+ """ + return count in ZERO_LIKE_VALUES + + +def _db_timeout(timeout, db_file, call): + with LockTimeout(timeout, db_file): + retry_wait = 0.001 + while True: + try: + return call() + except sqlite3.OperationalError as e: + if 'locked' not in str(e): + raise + sleep(retry_wait) + retry_wait = min(retry_wait * 2, 0.05) class DatabaseConnectionError(sqlite3.DatabaseError): @@ -63,31 +101,66 @@ def __str__(self): self.path, self.timeout, self.msg) +class DatabaseAlreadyExists(sqlite3.DatabaseError): + """More friendly error messages for DB Errors.""" + + def __init__(self, path): + self.path = path + + def __str__(self): + return 'DB %s already exists' % self.path + + class GreenDBConnection(sqlite3.Connection): """SQLite DB Connection handler that plays well with eventlet.""" + # slots are needed for python 3.11.0 (there's an issue fixed in 3.11.1, + # see https://github.com/python/cpython/issues/99886) + __slots__ = ('timeout', 'db_file') - def __init__(self, *args, **kwargs): - self.timeout = kwargs.get('timeout', BROKER_TIMEOUT) - kwargs['timeout'] = 0 - self.db_file = args and args[0] or '-' - sqlite3.Connection.__init__(self, *args, **kwargs) + def __init__(self, database, timeout=None, *args, **kwargs): + if timeout is None: + timeout = BROKER_TIMEOUT + self.timeout = timeout + self.db_file = database + super(GreenDBConnection, self).__init__( + database, timeout=0, *args, **kwargs) - def _timeout(self, call): - with LockTimeout(self.timeout, self.db_file): - while True: - try: - return call() - except sqlite3.OperationalError, e: - if 'locked' not in str(e): - raise - sleep(0.05) + def cursor(self, cls=None): + if cls is None: + cls = GreenDBCursor + return sqlite3.Connection.cursor(self, cls) def execute(self, *args, **kwargs): - return self._timeout(lambda: sqlite3.Connection.execute( - self, *args, **kwargs)) + # py311 stopped calling self.cursor() to get the cursor; + # see https://github.com/python/cpython/pull/31351 + curs = self.cursor() + curs.execute(*args, **kwargs) + return curs def commit(self): - return self._timeout(lambda: sqlite3.Connection.commit(self)) + return _db_timeout( + self.timeout, self.db_file, + lambda: sqlite3.Connection.commit(self)) + + +class GreenDBCursor(sqlite3.Cursor): + """SQLite Cursor handler that plays well with eventlet.""" + # slots are needed for python 3.11.0 (there's an issue fixed in 3.11.1, + # see https://github.com/python/cpython/issues/99886) + __slots__ = ('timeout', 'db_file') + + def __init__(self, *args, **kwargs): + self.timeout = args[0].timeout + self.db_file = args[0].db_file + super(GreenDBCursor, self).__init__(*args, **kwargs) + + def execute(self, *args, **kwargs): + return _db_timeout( + self.timeout, self.db_file, lambda: sqlite3.Cursor.execute( + self, *args, **kwargs)) + + # NB: executemany and executescript are *not* greened, and never have been + # (as far as I can tell) def dict_factory(crs, row): @@ -107,19 +180,18 @@ def chexor(old, name, timestamp): :param old: hex representation of the current DB hash :param name: name of the object or container being inserted - :param timestamp: timestamp of the new record - :returns: a hex representation of the new hash value + :param timestamp: a string representation of attributes of the item being + inserted, for example the string representation of the item's + timestamp. 
""" if name is None: raise Exception('name is None!') - old = old.decode('hex') - new = hashlib.md5(('%s-%s' % (name, timestamp)).encode('utf_8')).digest() - response = ''.join( - map(chr, map(operator.xor, map(ord, old), map(ord, new)))) - return response.encode('hex') + new = md5(('%s-%s' % (name, timestamp)).encode('utf8'), + usedforsecurity=False).hexdigest() + return '%032x' % (int(old, 16) ^ int(new, 16)) -def get_db_connection(path, timeout=30, okay_to_create=False): +def get_db_connection(path, timeout=30, logger=None, okay_to_create=False): """ Returns a properly configured SQLite database connection. @@ -132,7 +204,9 @@ def get_db_connection(path, timeout=30, okay_to_create=False): connect_time = time.time() conn = sqlite3.connect(path, check_same_thread=False, factory=GreenDBConnection, timeout=timeout) - if path != ':memory:' and not okay_to_create: + if QUERY_LOGGING and logger: + conn.set_trace_callback(logger.debug) + if not okay_to_create: # attempt to detect and fail when connect creates the db file stat = os.stat(path) if stat.st_size == 0 and stat.st_ctime >= connect_time: @@ -141,10 +215,11 @@ def get_db_connection(path, timeout=30, okay_to_create=False): 'DB file created by connect?') conn.row_factory = sqlite3.Row conn.text_factory = str - conn.execute('PRAGMA synchronous = NORMAL') - conn.execute('PRAGMA count_changes = OFF') - conn.execute('PRAGMA temp_store = MEMORY') - conn.execute('PRAGMA journal_mode = DELETE') + with closing(conn.cursor()) as cur: + cur.execute('PRAGMA synchronous = NORMAL') + cur.execute('PRAGMA count_changes = OFF') + cur.execute('PRAGMA temp_store = MEMORY') + cur.execute('PRAGMA journal_mode = DELETE') conn.create_function('chexor', 3, chexor) except sqlite3.DatabaseError: import traceback @@ -153,17 +228,111 @@ def get_db_connection(path, timeout=30, okay_to_create=False): return conn +class TombstoneReclaimer(object): + """Encapsulates reclamation of deleted rows in a database.""" + def __init__(self, broker, age_timestamp): + """ + Encapsulates reclamation of deleted rows in a database. + + :param broker: an instance of :class:`~swift.common.db.DatabaseBroker`. + :param age_timestamp: a float timestamp: tombstones older than this + time will be deleted. + """ + self.broker = broker + self.age_timestamp = age_timestamp + self.marker = '' + self.remaining_tombstones = self.reclaimed = 0 + self.finished = False + # limit 1 offset N gives back the N+1th matching row; that row is used + # as an exclusive end_marker for a batch of deletes, so a batch + # comprises rows satisfying self.marker <= name < end_marker. + self.batch_query = ''' + SELECT name FROM %s WHERE deleted = 1 + AND name >= ? + ORDER BY NAME LIMIT 1 OFFSET ? + ''' % self.broker.db_contains_type + self.clean_batch_query = ''' + DELETE FROM %s WHERE deleted = 1 + AND name >= ? 
AND %s < '%s' + ''' % (self.broker.db_contains_type, self.broker.db_reclaim_timestamp, + self.age_timestamp) + + def _reclaim(self, conn): + curs = conn.execute(self.batch_query, (self.marker, RECLAIM_PAGE_SIZE)) + row = curs.fetchone() + end_marker = row[0] if row else '' + if end_marker: + # do a single book-ended DELETE and bounce out + curs = conn.execute(self.clean_batch_query + ' AND name < ?', + (self.marker, end_marker)) + self.marker = end_marker + self.reclaimed += curs.rowcount + self.remaining_tombstones += RECLAIM_PAGE_SIZE - curs.rowcount + else: + # delete off the end + curs = conn.execute(self.clean_batch_query, (self.marker,)) + self.finished = True + self.reclaimed += curs.rowcount + + def reclaim(self): + """ + Perform reclaim of deleted rows older than ``age_timestamp``. + """ + while not self.finished: + with self.broker.get() as conn: + self._reclaim(conn) + conn.commit() + + def get_tombstone_count(self): + """ + Return the number of remaining tombstones newer than ``age_timestamp``. + Executes the ``reclaim`` method if it has not already been called on + this instance. + + :return: The number of tombstones in the ``broker`` that are newer than + ``age_timestamp``. + """ + if not self.finished: + self.reclaim() + with self.broker.get() as conn: + curs = conn.execute(''' + SELECT COUNT(*) FROM %s WHERE deleted = 1 + AND name >= ? + ''' % (self.broker.db_contains_type,), (self.marker,)) + tombstones = curs.fetchone()[0] + self.remaining_tombstones += tombstones + return self.remaining_tombstones + + class DatabaseBroker(object): """Encapsulates working with a database.""" + delete_meta_whitelist = [] + def __init__(self, db_file, timeout=BROKER_TIMEOUT, logger=None, - account=None, container=None, pending_timeout=10, - stale_reads_ok=False): - """ Encapsulates working with a database. """ + account=None, container=None, pending_timeout=None, + stale_reads_ok=False, skip_commits=False): + """Encapsulates working with a database. + + :param db_file: path to a database file. + :param timeout: timeout used for database operations. + :param logger: a logger instance. + :param account: name of account. + :param container: name of container. + :param pending_timeout: timeout used when attempting to take a lock to + write to pending file. + :param stale_reads_ok: if True then no error is raised if pending + commits cannot be committed before the database is read, otherwise + an error is raised. + :param skip_commits: if True then this broker instance will never + commit records from the pending file to the database; + :meth:`~swift.common.db.DatabaseBroker.put_record` should not + called on brokers with skip_commits True. + """ self.conn = None - self.db_file = db_file - self.pending_file = self.db_file + '.pending' - self.pending_timeout = pending_timeout + self._db_file = db_file + self.pending_file = self._db_file + '.pending' + self.pending_timeout = pending_timeout or 10 self.stale_reads_ok = stale_reads_ok self.db_dir = os.path.dirname(db_file) self.timeout = timeout @@ -171,27 +340,39 @@ def __init__(self, db_file, timeout=BROKER_TIMEOUT, logger=None, self.account = account self.container = container self._db_version = -1 + self.skip_commits = skip_commits + + def __str__(self): + """ + Returns a string identifying the entity under broker to a human. + The baseline implementation returns a full pathname to a database. + This is vital for useful diagnostics. 
+ """ + return self.db_file - def initialize(self, put_timestamp=None): + def initialize(self, put_timestamp=None, storage_policy_index=None): """ Create the DB - :param put_timestamp: timestamp of initial PUT request + The storage_policy_index is passed through to the subclass's + ``_initialize`` method. It is ignored by ``AccountBroker``. + + :param put_timestamp: internalized timestamp of initial PUT request + :param storage_policy_index: only required for containers """ - if self.db_file == ':memory:': - tmp_db_file = None - conn = get_db_connection(self.db_file, self.timeout) - else: - mkdirs(self.db_dir) - fd, tmp_db_file = mkstemp(suffix='.tmp', dir=self.db_dir) - os.close(fd) - conn = sqlite3.connect(tmp_db_file, check_same_thread=False, - factory=GreenDBConnection, timeout=0) + mkdirs(self.db_dir) + fd, tmp_db_file = mkstemp(suffix='.tmp', dir=self.db_dir) + os.close(fd) + conn = sqlite3.connect(tmp_db_file, check_same_thread=False, + factory=GreenDBConnection, timeout=0) + if QUERY_LOGGING: + conn.set_trace_callback(self.logger.debug) # creating dbs implicitly does a lot of transactions, so we # pick fast, unsafe options here and do a big fsync at the end. - conn.execute('PRAGMA synchronous = OFF') - conn.execute('PRAGMA temp_store = MEMORY') - conn.execute('PRAGMA journal_mode = MEMORY') + with closing(conn.cursor()) as cur: + cur.execute('PRAGMA synchronous = OFF') + cur.execute('PRAGMA temp_store = MEMORY') + cur.execute('PRAGMA journal_mode = MEMORY') conn.create_function('chexor', 3, chexor) conn.row_factory = sqlite3.Row conn.text_factory = str @@ -232,8 +413,9 @@ def initialize(self, put_timestamp=None): END; """) if not put_timestamp: - put_timestamp = normalize_timestamp(0) - self._initialize(conn, put_timestamp) + put_timestamp = Timestamp.zero().internal + self._initialize(conn, put_timestamp, + storage_policy_index=storage_policy_index) conn.commit() if tmp_db_file: conn.close() @@ -243,11 +425,10 @@ def initialize(self, put_timestamp=None): if os.path.exists(self.db_file): # It's as if there was a "condition" where different parts # of the system were "racing" each other. - raise DatabaseConnectionError( - self.db_file, - 'DB created by someone else while working?') + raise DatabaseAlreadyExists(self.db_file) renamer(tmp_db_file, self.db_file) - self.conn = get_db_connection(self.db_file, self.timeout) + self.conn = get_db_connection(self.db_file, self.timeout, + self.logger) else: self.conn = conn @@ -255,60 +436,114 @@ def delete_db(self, timestamp): """ Mark the DB as deleted - :param timestamp: delete timestamp + :param timestamp: internalized delete timestamp """ - timestamp = normalize_timestamp(timestamp) # first, clear the metadata cleared_meta = {} - for k in self.metadata.iterkeys(): + for k in self.metadata: + if k.lower() in self.delete_meta_whitelist: + continue cleared_meta[k] = ('', timestamp) self.update_metadata(cleared_meta) # then mark the db as deleted with self.get() as conn: - self._delete_db(conn, timestamp) + conn.execute( + """ + UPDATE %s_stat + SET delete_timestamp = ?, + status = 'DELETED', + status_changed_at = ? + WHERE delete_timestamp < ? 
""" % self.db_type, + (timestamp, timestamp, timestamp)) conn.commit() - def possibly_quarantine(self, exc_type, exc_value, exc_traceback): + @property + def db_file(self): + return self._db_file + + def get_device_path(self): + suffix_path = os.path.dirname(self.db_dir) + partition_path = os.path.dirname(suffix_path) + dbs_path = os.path.dirname(partition_path) + return os.path.dirname(dbs_path) + + def quarantine(self, reason): """ - Checks the exception info to see if it indicates a quarantine situation - (malformed or corrupted database). If not, the original exception will - be reraised. If so, the database will be quarantined and a new + The database will be quarantined and a sqlite3.DatabaseError will be raised indicating the action taken. """ - if 'database disk image is malformed' in str(exc_value): - exc_hint = 'malformed' - elif 'file is encrypted or is not a database' in str(exc_value): - exc_hint = 'corrupted' - else: - raise exc_type(*exc_value.args), None, exc_traceback - prefix_path = os.path.dirname(self.db_dir) - partition_path = os.path.dirname(prefix_path) - dbs_path = os.path.dirname(partition_path) - device_path = os.path.dirname(dbs_path) + device_path = self.get_device_path() quar_path = os.path.join(device_path, 'quarantined', self.db_type + 's', os.path.basename(self.db_dir)) try: - renamer(self.db_dir, quar_path) - except OSError, e: + renamer(self.db_dir, quar_path, fsync=False) + except OSError as e: if e.errno not in (errno.EEXIST, errno.ENOTEMPTY): raise quar_path = "%s-%s" % (quar_path, uuid4().hex) - renamer(self.db_dir, quar_path) - detail = _('Quarantined %s to %s due to %s database') % \ - (self.db_dir, quar_path, exc_hint) + renamer(self.db_dir, quar_path, fsync=False) + detail = ('Quarantined %(db_dir)s to %(quar_path)s due to ' + '%(reason)s') % {'db_dir': self.db_dir, + 'quar_path': quar_path, + 'reason': reason} self.logger.error(detail) raise sqlite3.DatabaseError(detail) + def possibly_quarantine(self, err): + """ + Checks the exception info to see if it indicates a quarantine situation + (malformed or corrupted database). If not, the original exception will + be reraised. If so, the database will be quarantined and a new + sqlite3.DatabaseError will be raised indicating the action taken. 
+ """ + if 'database disk image is malformed' in str(err): + exc_hint = 'malformed database' + elif 'malformed database schema' in str(err): + exc_hint = 'malformed database' + elif ' is not a database' in str(err): + # older versions said 'file is not a database' + # now 'file is encrypted or is not a database' + exc_hint = 'corrupted database' + elif 'disk I/O error' in str(err): + exc_hint = 'disk error while accessing database' + else: + raise err + + self.quarantine(exc_hint) + + @contextmanager + def updated_timeout(self, new_timeout): + """Use with "with" statement; updates ``timeout`` within the block.""" + old_timeout = self.timeout + try: + self.timeout = new_timeout + if self.conn: + self.conn.timeout = new_timeout + yield old_timeout + finally: + self.timeout = old_timeout + if self.conn: + self.conn.timeout = old_timeout + + @contextmanager + def maybe_get(self, conn): + if conn: + yield conn + else: + with self.get() as conn: + yield conn + @contextmanager def get(self): """Use with the "with" statement; returns a database connection.""" if not self.conn: - if self.db_file != ':memory:' and os.path.exists(self.db_file): + if os.path.exists(self.db_file): try: - self.conn = get_db_connection(self.db_file, self.timeout) - except (sqlite3.DatabaseError, DatabaseConnectionError): - self.possibly_quarantine(*sys.exc_info()) + self.conn = get_db_connection(self.db_file, self.timeout, + self.logger) + except (sqlite3.DatabaseError, DatabaseConnectionError) as e: + self.possibly_quarantine(e) else: raise DatabaseConnectionError(self.db_file, "DB doesn't exist") conn = self.conn @@ -317,12 +552,12 @@ def get(self): yield conn conn.rollback() self.conn = conn - except sqlite3.DatabaseError: + except sqlite3.DatabaseError as e: try: conn.close() - except: + except Exception: pass - self.possibly_quarantine(*sys.exc_info()) + self.possibly_quarantine(e) except (Exception, Timeout): conn.close() raise @@ -331,8 +566,9 @@ def get(self): def lock(self): """Use with the "with" statement; locks a database.""" if not self.conn: - if self.db_file != ':memory:' and os.path.exists(self.db_file): - self.conn = get_db_connection(self.db_file, self.timeout) + if os.path.exists(self.db_file): + self.conn = get_db_connection(self.db_file, self.timeout, + self.logger) else: raise DatabaseConnectionError(self.db_file, "DB doesn't exist") conn = self.conn @@ -342,16 +578,19 @@ def lock(self): conn.execute('BEGIN IMMEDIATE') try: yield True - except (Exception, Timeout): - pass - try: - conn.execute('ROLLBACK') - conn.isolation_level = orig_isolation_level - self.conn = conn - except (Exception, Timeout): - logging.exception( - _('Broker error trying to rollback locked connection')) - conn.close() + finally: + try: + conn.execute('ROLLBACK') + conn.isolation_level = orig_isolation_level + self.conn = conn + except (Exception, Timeout): + logging.exception( + 'Broker error trying to rollback locked connection') + conn.close() + + def _new_db_id(self): + device_name = os.path.basename(self.get_device_path()) + return "%s-%s" % (str(uuid4()), device_name) def newid(self, remote_id): """ @@ -362,7 +601,7 @@ def newid(self, remote_id): with self.get() as conn: row = conn.execute(''' UPDATE %s_stat SET id=? - ''' % self.db_type, (str(uuid4()),)) + ''' % self.db_type, (self._new_db_id(),)) row = conn.execute(''' SELECT ROWID FROM %s ORDER BY ROWID DESC LIMIT 1 ''' % self.db_contains_type).fetchone() @@ -378,6 +617,45 @@ def _newid(self, conn): # Override for additional work when receiving an rsynced db. 
pass + def _is_deleted(self, conn): + """ + Check if the database is considered deleted + + :param conn: database conn + + :returns: True if the DB is considered to be deleted, False otherwise + """ + raise NotImplementedError() + + def is_deleted(self): + """ + Check if the DB is considered to be deleted. + + :returns: True if the DB is considered to be deleted, False otherwise + """ + if not os.path.exists(self.db_file): + return True + self._commit_puts_stale_ok() + with self.get() as conn: + return self._is_deleted(conn) + + def empty(self): + """ + Check if the broker abstraction contains any undeleted records. + """ + raise NotImplementedError() + + def is_reclaimable(self, now, reclaim_age): + """ + Check if the broker abstraction is empty, and has been marked deleted + for at least a reclaim age. + """ + info = self.get_replication_info() + return (zero_like(info['count']) and + (Timestamp(now - reclaim_age) > + Timestamp(info['delete_timestamp']) > + Timestamp(info['put_timestamp']))) + def merge_timestamps(self, created_at, put_timestamp, delete_timestamp): """ Used in replication to handle updating timestamps. @@ -387,11 +665,16 @@ def merge_timestamps(self, created_at, put_timestamp, delete_timestamp): :param delete_timestamp: delete timestamp """ with self.get() as conn: + old_status = self._is_deleted(conn) conn.execute(''' UPDATE %s_stat SET created_at=MIN(?, created_at), put_timestamp=MAX(?, put_timestamp), delete_timestamp=MAX(?, delete_timestamp) ''' % self.db_type, (created_at, put_timestamp, delete_timestamp)) + if old_status != self._is_deleted(conn): + timestamp = Timestamp.now() + self._update_status_changed_at(conn, timestamp.internal) + conn.commit() def get_items_since(self, start, count): @@ -402,11 +685,7 @@ def get_items_since(self, start, count): :param count: number to get :returns: list of objects between start and end """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise + self._commit_puts_stale_ok() with self.get() as conn: curs = conn.execute(''' SELECT * FROM %s WHERE ROWID > ? ORDER BY ROWID ASC LIMIT ? @@ -431,58 +710,179 @@ def get_sync(self, id, incoming=True): return -1 return row['sync_point'] - def get_syncs(self, incoming=True): + def get_syncs(self, incoming=True, include_timestamp=False): """ Get a serialized copy of the sync table. :param incoming: if True, get the last incoming sync, otherwise get the last outgoing sync - :returns: list of {'remote_id', 'sync_point'} + :param include_timestamp: If True include the updated_at timestamp + :returns: list of {'remote_id', 'sync_point'} or + {'remote_id', 'sync_point', 'updated_at'} + if include_timestamp is True. 
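# A plain-float restatement of the is_reclaimable() rule above: an empty DB
# can be reclaimed once it has been deleted for at least reclaim_age and the
# delete is newer than the last put.  Sample values are made up; the real
# method compares swift Timestamps taken from get_replication_info().
def is_reclaimable(info, now, reclaim_age):
    return (info['count'] == 0 and
            now - reclaim_age > info['delete_timestamp'] >
            info['put_timestamp'])


info = {'count': 0, 'put_timestamp': 1000.0, 'delete_timestamp': 2000.0}
assert is_reclaimable(info, now=2000.0 + 8 * 86400, reclaim_age=7 * 86400)
assert not is_reclaimable(info, now=2000.0 + 3600, reclaim_age=7 * 86400)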
""" with self.get() as conn: + columns = 'remote_id, sync_point' + if include_timestamp: + columns += ', updated_at' curs = conn.execute(''' - SELECT remote_id, sync_point FROM %s_sync - ''' % 'incoming' if incoming else 'outgoing') - result = [] - for row in curs: - result.append({'remote_id': row[0], 'sync_point': row[1]}) - return result + SELECT %s FROM %s_sync + ''' % (columns, 'incoming' if incoming else 'outgoing')) + curs.row_factory = dict_factory + return [r for r in curs] + + def get_max_row(self, table=None): + if not table: + table = self.db_contains_type + query = ''' + SELECT SQLITE_SEQUENCE.seq + FROM SQLITE_SEQUENCE + WHERE SQLITE_SEQUENCE.name == '%s' + LIMIT 1 + ''' % (table, ) + with self.get() as conn: + row = conn.execute(query).fetchone() + return row[0] if row else -1 def get_replication_info(self): """ Get information about the DB required for replication. - :returns: dict containing keys: hash, id, created_at, put_timestamp, - delete_timestamp, count, max_row, and metadata + :returns: dict containing keys from get_info plus max_row and metadata + + Note:: get_info's _count is translated to just + "count" and metadata is the raw string. """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - query_part1 = ''' - SELECT hash, id, created_at, put_timestamp, delete_timestamp, - %s_count AS count, - CASE WHEN SQLITE_SEQUENCE.seq IS NOT NULL - THEN SQLITE_SEQUENCE.seq ELSE -1 END AS max_row, ''' % \ - self.db_contains_type - query_part2 = ''' - FROM (%s_stat LEFT JOIN SQLITE_SEQUENCE - ON SQLITE_SEQUENCE.name == '%s') LIMIT 1 - ''' % (self.db_type, self.db_contains_type) + info = self.get_info() + info['count'] = info.pop('%s_count' % self.db_contains_type) + info['metadata'] = self.get_raw_metadata() + info['max_row'] = self.get_max_row() + return info + + def get_info(self): + self._commit_puts_stale_ok() with self.get() as conn: - try: - curs = conn.execute(query_part1 + 'metadata' + query_part2) - except sqlite3.OperationalError, err: - if 'no such column: metadata' not in str(err): - raise - curs = conn.execute(query_part1 + "'' as metadata" + - query_part2) + curs = conn.execute('SELECT * from %s_stat' % self.db_type) curs.row_factory = dict_factory return curs.fetchone() - def _commit_puts(self): - pass # stub to be overridden if need be + def put_record(self, record): + """ + Put a record into the DB. If the DB has an associated pending file with + space then the record is appended to that file and a commit to the DB + is deferred. If its pending file is full then the record will be + committed immediately. + + :param record: a record to be added to the DB. + :raises DatabaseConnectionError: if the DB file does not exist or if + ``skip_commits`` is True. + :raises LockTimeout: if a timeout occurs while waiting to take a lock + to write to the pending file. 
+ """ + if not os.path.exists(self.db_file): + raise DatabaseConnectionError(self.db_file, "DB doesn't exist") + if self.skip_commits: + raise DatabaseConnectionError(self.db_file, + 'commits not accepted') + with lock_parent_directory(self.pending_file, self.pending_timeout): + pending_size = 0 + try: + pending_size = os.path.getsize(self.pending_file) + except OSError as err: + if err.errno != errno.ENOENT: + raise + if pending_size > PENDING_CAP: + self._commit_puts([record]) + else: + with open(self.pending_file, 'a+b') as fp: + # Colons aren't used in base64 encoding; so they are our + # delimiter + fp.write(b':') + fp.write(base64.b64encode(pickle.dumps( + self.make_tuple_for_pickle(record), + protocol=PICKLE_PROTOCOL))) + fp.flush() + + def _skip_commit_puts(self): + return self.skip_commits or not os.path.exists(self.pending_file) + + def _commit_puts(self, item_list=None): + """ + Scan for .pending files and commit the found records by feeding them + to merge_items(). Assume that lock_parent_directory has already been + called. + + :param item_list: A list of items to commit in addition to .pending + """ + if self._skip_commit_puts(): + if item_list: + # this broker instance should not be used to commit records, + # but if it is then raise an error rather than quietly + # discarding the records in item_list. + raise DatabaseConnectionError(self.db_file, + 'commits not accepted') + return + if item_list is None: + item_list = [] + self._preallocate() + if not os.path.getsize(self.pending_file): + if item_list: + self.merge_items(item_list) + return + with open(self.pending_file, 'r+b') as fp: + for entry in fp.read().split(b':'): + if entry: + try: + data = pickle.loads(base64.b64decode(entry), + encoding='utf8') # nosec: B301 + self._commit_puts_load(item_list, data) + except Exception: + self.logger.exception( + 'Invalid pending entry %(file)s: %(entry)s', + {'file': self.pending_file, 'entry': entry}) + if item_list: + self.merge_items(item_list) + try: + os.ftruncate(fp.fileno(), 0) + except OSError as err: + if err.errno != errno.ENOENT: + raise + + def _commit_puts_stale_ok(self): + """ + Catch failures of _commit_puts() if broker is intended for + reading of stats, and thus does not care for pending updates. + """ + if self._skip_commit_puts(): + return + try: + with lock_parent_directory(self.pending_file, + self.pending_timeout): + self._commit_puts() + except (LockTimeout, sqlite3.OperationalError): + if not self.stale_reads_ok: + raise + + def _commit_puts_load(self, item_list, entry): + """ + Unmarshall the :param:entry tuple and append it to :param:item_list. + This is implemented by a particular broker to be compatible + with its :func:`merge_items`. + """ + raise NotImplementedError + + def merge_items(self, item_list, source=None): + """ + Save :param:item_list to the database. + """ + raise NotImplementedError + + def make_tuple_for_pickle(self, record): + """ + Turn this db record dict into the format this service uses for + pending pickles. + """ + raise NotImplementedError def merge_syncs(self, sync_points, incoming=True): """ @@ -515,7 +915,7 @@ def _preallocate(self): within 512k of a boundary, it allocates to the next boundary. Boundaries are 2m, 5m, 10m, 25m, 50m, then every 50m after. 
""" - if not DB_PREALLOCATION or self.db_file == ':memory:': + if not DB_PREALLOCATION: return MB = (1024 * 1024) @@ -537,6 +937,21 @@ def prealloc_points(): with open(self.db_file, 'rb+') as fp: fallocate(fp.fileno(), int(prealloc_size)) + def get_raw_metadata(self): + with self.get() as conn: + try: + row = conn.execute('SELECT metadata FROM %s_stat' % + self.db_type).fetchone() + if not row: + self.quarantine("missing row in %s_stat table" % + self.db_type) + metadata = row[0] + except sqlite3.OperationalError as err: + if 'no such column: metadata' not in str(err): + raise + metadata = '' + return metadata + @property def metadata(self): """ @@ -544,65 +959,151 @@ def metadata(self): are tuples of (value, timestamp) where the timestamp indicates when that key was set to that value. """ - with self.get() as conn: - try: - metadata = conn.execute('SELECT metadata FROM %s_stat' % - self.db_type).fetchone()[0] - except sqlite3.OperationalError, err: - if 'no such column: metadata' not in str(err): - raise - metadata = '' + metadata = self.get_raw_metadata() if metadata: metadata = json.loads(metadata) + native_str_keys_and_values(metadata) else: metadata = {} return metadata - def update_metadata(self, metadata_updates): + @staticmethod + def validate_metadata(metadata): + """ + Validates that metadata falls within acceptable limits. + + :param metadata: to be validated + :raises HTTPBadRequest: if MAX_META_COUNT or MAX_META_OVERALL_SIZE + is exceeded, or if metadata contains non-UTF-8 data + """ + meta_count = 0 + meta_size = 0 + for key, (value, timestamp) in metadata.items(): + if key and not check_utf8(key): + raise HTTPBadRequest('Metadata must be valid UTF-8') + if value and not check_utf8(value): + raise HTTPBadRequest('Metadata must be valid UTF-8') + key = key.lower() + if value and key.startswith(('x-account-meta-', + 'x-container-meta-')): + prefix = 'x-account-meta-' + if key.startswith('x-container-meta-'): + prefix = 'x-container-meta-' + key = key[len(prefix):] + meta_count = meta_count + 1 + meta_size = meta_size + len(key) + len(value) + if meta_count > MAX_META_COUNT: + raise HTTPBadRequest('Too many metadata items; max %d' + % MAX_META_COUNT) + if meta_size > MAX_META_OVERALL_SIZE: + raise HTTPBadRequest('Total metadata too large; max %d' + % MAX_META_OVERALL_SIZE) + + def update_metadata(self, metadata_updates, validate_metadata=False): """ Updates the metadata dict for the database. The metadata dict values - are tuples of (value, timestamp) where the timestamp indicates when - that key was set to that value. Key/values will only be overwritten if - the timestamp is newer. To delete a key, set its value to ('', - timestamp). These empty keys will eventually be removed by - :func:reclaim + are tuples of (value, timestamp) where the timestamp is an internalized + timestamp string that indicates when that key was set to that value. + Key/values will only be overwritten if the timestamp is newer. To + delete a key, set its value to ('', timestamp). 
These empty keys will + eventually be removed by :func:`reclaim` """ old_metadata = self.metadata if set(metadata_updates).issubset(set(old_metadata)): - for key, (value, timestamp) in metadata_updates.iteritems(): + for key, (value, timestamp) in metadata_updates.items(): if timestamp > old_metadata[key][1]: break else: return with self.get() as conn: try: - md = conn.execute('SELECT metadata FROM %s_stat' % - self.db_type).fetchone()[0] - md = md and json.loads(md) or {} - except sqlite3.OperationalError, err: + row = conn.execute('SELECT metadata FROM %s_stat' % + self.db_type).fetchone() + if not row: + self.quarantine("missing row in %s_stat table" % + self.db_type) + md = row[0] + md = json.loads(md) if md else {} + native_str_keys_and_values(md) + except sqlite3.OperationalError as err: if 'no such column: metadata' not in str(err): raise conn.execute(""" ALTER TABLE %s_stat ADD COLUMN metadata TEXT DEFAULT '' """ % self.db_type) md = {} - for key, value_timestamp in metadata_updates.iteritems(): + for key, value_timestamp in metadata_updates.items(): value, timestamp = value_timestamp if key not in md or timestamp > md[key][1]: md[key] = value_timestamp + if validate_metadata: + DatabaseBroker.validate_metadata(md) conn.execute('UPDATE %s_stat SET metadata = ?' % self.db_type, (json.dumps(md),)) conn.commit() - def reclaim(self, timestamp): - """Removes any empty metadata values older than the timestamp""" - if not self.metadata: - return + def reclaim(self, age_timestamp, sync_timestamp): + """ + Delete reclaimable rows and metadata from the db. + + By default this method will delete rows from the db_contains_type table + that are marked deleted and whose created_at timestamp is < + age_timestamp, and deletes rows from incoming_sync and outgoing_sync + where the updated_at timestamp is < sync_timestamp. In addition, this + calls the :meth:`_reclaim_metadata` method. + + Subclasses may reclaim other items by overriding :meth:`_reclaim`. + + :param age_timestamp: (float) the max created_at timestamp of object + rows to delete + :param sync_timestamp: (float) the max update_at timestamp of sync rows + to delete + """ + if not self._skip_commit_puts(): + with lock_parent_directory(self.pending_file, + self.pending_timeout): + self._commit_puts() + + tombstone_reclaimer = TombstoneReclaimer(self, age_timestamp) + tombstone_reclaimer.reclaim() with self.get() as conn: - if self._reclaim(conn, timestamp): - conn.commit() + self._reclaim_other_stuff(conn, age_timestamp, sync_timestamp) + conn.commit() + return tombstone_reclaimer + + def _reclaim_other_stuff(self, conn, age_timestamp, sync_timestamp): + """ + This is only called once at the end of reclaim after tombstone reclaim + has been completed. + + :param conn: db connection + :param age_timestamp: (float) the max created_at timestamp of object + rows to delete + :param sync_timestamp: (float) the max update_at timestamp of sync rows + to delete + """ + self._reclaim_sync(conn, sync_timestamp) + self._reclaim_metadata(conn, age_timestamp) - def _reclaim(self, conn, timestamp): + def _reclaim_sync(self, conn, sync_timestamp): + """ + :param conn: db connection + :param sync_timestamp: (float) the max update_at timestamp of sync rows + to delete + """ + try: + conn.execute(''' + DELETE FROM outgoing_sync WHERE updated_at < ? + ''', (sync_timestamp,)) + conn.execute(''' + DELETE FROM incoming_sync WHERE updated_at < ? 
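# A plain-dict sketch of the metadata rules described above: every key maps
# to (value, timestamp), newer timestamps win on update, and a key is
# "deleted" by writing an empty value that a later reclaim drops.  The
# timestamps are simplified to bare internalized strings.
def merge_metadata(md, updates):
    for key, (value, timestamp) in updates.items():
        if key not in md or timestamp > md[key][1]:
            md[key] = (value, timestamp)
    return md


def reclaim_metadata(md, age_timestamp):
    return {k: v for k, v in md.items()
            if not (v[0] == '' and v[1] < age_timestamp)}


md = {'X-Container-Meta-Color': ('blue', '0000000001.00000')}
md = merge_metadata(
    md, {'X-Container-Meta-Color': ('', '0000000002.00000')})
print(reclaim_metadata(md, '0000000003.00000'))   # -> {}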
+ ''', (sync_timestamp,)) + except sqlite3.OperationalError as err: + # Old dbs didn't have updated_at in the _sync tables. + if 'no such column: updated_at' not in str(err): + raise + + def _reclaim_metadata(self, conn, timestamp): """ Removes any empty metadata values older than the timestamp using the given database connection. This function will not call commit on the @@ -611,18 +1112,23 @@ def _reclaim(self, conn, timestamp): from other related functions. :param conn: Database connection to reclaim metadata within. - :param timestamp: Empty metadata items last updated before this + :param timestamp: (float) Empty metadata items last updated before this timestamp will be removed. :returns: True if conn.commit() should be called """ + timestamp = Timestamp(timestamp) try: - md = conn.execute('SELECT metadata FROM %s_stat' % - self.db_type).fetchone()[0] + row = conn.execute('SELECT metadata FROM %s_stat' % + self.db_type).fetchone() + if not row: + self.quarantine("missing row in %s_stat table" % + self.db_type) + md = row[0] if md: md = json.loads(md) keys_to_delete = [] - for key, (value, value_timestamp) in md.iteritems(): - if value == '' and value_timestamp < timestamp: + for key, (value, value_timestamp) in md.items(): + if value == '' and Timestamp(value_timestamp) < timestamp: keys_to_delete.append(key) if keys_to_delete: for key in keys_to_delete: @@ -630,1061 +1136,39 @@ def _reclaim(self, conn, timestamp): conn.execute('UPDATE %s_stat SET metadata = ?' % self.db_type, (json.dumps(md),)) return True - except sqlite3.OperationalError, err: + except sqlite3.OperationalError as err: if 'no such column: metadata' not in str(err): raise return False - -class ContainerBroker(DatabaseBroker): - """Encapsulates working with a container database.""" - db_type = 'container' - db_contains_type = 'object' - - def _initialize(self, conn, put_timestamp): - """Creates a brand new database (tables, indices, triggers, etc.)""" - if not self.account: - raise ValueError( - 'Attempting to create a new database with no account set') - if not self.container: - raise ValueError( - 'Attempting to create a new database with no container set') - self.create_object_table(conn) - self.create_container_stat_table(conn, put_timestamp) - - def create_object_table(self, conn): - """ - Create the object table which is specifc to the container DB. - - :param conn: DB connection object - """ - conn.executescript(""" - CREATE TABLE object ( - ROWID INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, - created_at TEXT, - size INTEGER, - content_type TEXT, - etag TEXT, - deleted INTEGER DEFAULT 0 - ); - - CREATE INDEX ix_object_deleted_name ON object (deleted, name); - - CREATE TRIGGER object_insert AFTER INSERT ON object - BEGIN - UPDATE container_stat - SET object_count = object_count + (1 - new.deleted), - bytes_used = bytes_used + new.size, - hash = chexor(hash, new.name, new.created_at); - END; - - CREATE TRIGGER object_update BEFORE UPDATE ON object - BEGIN - SELECT RAISE(FAIL, 'UPDATE not allowed; DELETE and INSERT'); - END; - - CREATE TRIGGER object_delete AFTER DELETE ON object - BEGIN - UPDATE container_stat - SET object_count = object_count - (1 - old.deleted), - bytes_used = bytes_used - old.size, - hash = chexor(hash, old.name, old.created_at); - END; - """) - - def create_container_stat_table(self, conn, put_timestamp=None): - """ - Create the container_stat table which is specific to the container DB. 
- - :param conn: DB connection object - :param put_timestamp: put timestamp - """ - if put_timestamp is None: - put_timestamp = normalize_timestamp(0) - conn.executescript(""" - CREATE TABLE container_stat ( - account TEXT, - container TEXT, - created_at TEXT, - put_timestamp TEXT DEFAULT '0', - delete_timestamp TEXT DEFAULT '0', - object_count INTEGER, - bytes_used INTEGER, - reported_put_timestamp TEXT DEFAULT '0', - reported_delete_timestamp TEXT DEFAULT '0', - reported_object_count INTEGER DEFAULT 0, - reported_bytes_used INTEGER DEFAULT 0, - hash TEXT default '00000000000000000000000000000000', - id TEXT, - status TEXT DEFAULT '', - status_changed_at TEXT DEFAULT '0', - metadata TEXT DEFAULT '', - x_container_sync_point1 INTEGER DEFAULT -1, - x_container_sync_point2 INTEGER DEFAULT -1 - ); - - INSERT INTO container_stat (object_count, bytes_used) - VALUES (0, 0); - """) - conn.execute(''' - UPDATE container_stat - SET account = ?, container = ?, created_at = ?, id = ?, - put_timestamp = ? - ''', (self.account, self.container, normalize_timestamp(time.time()), - str(uuid4()), put_timestamp)) - - def get_db_version(self, conn): - if self._db_version == -1: - self._db_version = 0 - for row in conn.execute(''' - SELECT name FROM sqlite_master - WHERE name = 'ix_object_deleted_name' '''): - self._db_version = 1 - return self._db_version - - def _newid(self, conn): - conn.execute(''' - UPDATE container_stat - SET reported_put_timestamp = 0, reported_delete_timestamp = 0, - reported_object_count = 0, reported_bytes_used = 0''') - def update_put_timestamp(self, timestamp): """ Update the put_timestamp. Only modifies it if it is greater than the current timestamp. - :param timestamp: put timestamp + :param timestamp: internalized put timestamp """ with self.get() as conn: - conn.execute(''' - UPDATE container_stat SET put_timestamp = ? - WHERE put_timestamp < ? ''', (timestamp, timestamp)) + conn.execute( + 'UPDATE %s_stat SET put_timestamp = ?' + ' WHERE put_timestamp < ?' % self.db_type, + (timestamp, timestamp)) conn.commit() - def _delete_db(self, conn, timestamp): + def update_status_changed_at(self, timestamp): """ - Mark the DB as deleted + Update the status_changed_at field in the stat table. Only + modifies status_changed_at if the timestamp is greater than the + current status_changed_at timestamp. - :param conn: DB connection object - :param timestamp: timestamp to mark as deleted + :param timestamp: internalized timestamp """ - conn.execute(""" - UPDATE container_stat - SET delete_timestamp = ?, - status = 'DELETED', - status_changed_at = ? - WHERE delete_timestamp < ? """, (timestamp, timestamp, timestamp)) - - def empty(self): - """ - Check if the DB is empty. 
- - :returns: True if the database has no active objects, False otherwise - """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise with self.get() as conn: - row = conn.execute( - 'SELECT object_count from container_stat').fetchone() - return (row[0] == 0) - - def _commit_puts(self, item_list=None): - """Handles commiting rows in .pending files.""" - if self.db_file == ':memory:' or not os.path.exists(self.pending_file): - return - if item_list is None: - item_list = [] - with lock_parent_directory(self.pending_file, self.pending_timeout): - self._preallocate() - if not os.path.getsize(self.pending_file): - if item_list: - self.merge_items(item_list) - return - with open(self.pending_file, 'r+b') as fp: - for entry in fp.read().split(':'): - if entry: - try: - (name, timestamp, size, content_type, etag, - deleted) = pickle.loads(entry.decode('base64')) - item_list.append({'name': name, - 'created_at': timestamp, - 'size': size, - 'content_type': content_type, - 'etag': etag, - 'deleted': deleted}) - except Exception: - self.logger.exception( - _('Invalid pending entry %(file)s: %(entry)s'), - {'file': self.pending_file, 'entry': entry}) - if item_list: - self.merge_items(item_list) - try: - os.ftruncate(fp.fileno(), 0) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - def reclaim(self, object_timestamp, sync_timestamp): - """ - Delete rows from the object table that are marked deleted and - whose created_at timestamp is < object_timestamp. Also deletes rows - from incoming_sync and outgoing_sync where the updated_at timestamp is - < sync_timestamp. - - In addition, this calls the DatabaseBroker's :func:_reclaim method. - - :param object_timestamp: max created_at timestamp of object rows to - delete - :param sync_timestamp: max update_at timestamp of sync rows to delete - """ - self._commit_puts() - with self.get() as conn: - conn.execute(""" - DELETE FROM object - WHERE deleted = 1 - AND created_at < ?""", (object_timestamp,)) - try: - conn.execute(''' - DELETE FROM outgoing_sync WHERE updated_at < ? - ''', (sync_timestamp,)) - conn.execute(''' - DELETE FROM incoming_sync WHERE updated_at < ? - ''', (sync_timestamp,)) - except sqlite3.OperationalError, err: - # Old dbs didn't have updated_at in the _sync tables. - if 'no such column: updated_at' not in str(err): - raise - DatabaseBroker._reclaim(self, conn, object_timestamp) - conn.commit() - - def delete_object(self, name, timestamp): - """ - Mark an object deleted. - - :param name: object name to be deleted - :param timestamp: timestamp when the object was marked as deleted - """ - self.put_object(name, timestamp, 0, 'application/deleted', 'noetag', 1) - - def put_object(self, name, timestamp, size, content_type, etag, deleted=0): - """ - Creates an object in the DB with its metadata. 
- - :param name: object name to be created - :param timestamp: timestamp of when the object was created - :param size: object size - :param content_type: object content-type - :param etag: object etag - :param deleted: if True, marks the object as deleted and sets the - deteleted_at timestamp to timestamp - """ - record = {'name': name, 'created_at': timestamp, 'size': size, - 'content_type': content_type, 'etag': etag, - 'deleted': deleted} - if self.db_file == ':memory:': - self.merge_items([record]) - return - if not os.path.exists(self.db_file): - raise DatabaseConnectionError(self.db_file, "DB doesn't exist") - pending_size = 0 - try: - pending_size = os.path.getsize(self.pending_file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - if pending_size > PENDING_CAP: - self._commit_puts([record]) - else: - with lock_parent_directory( - self.pending_file, self.pending_timeout): - with open(self.pending_file, 'a+b') as fp: - # Colons aren't used in base64 encoding; so they are our - # delimiter - fp.write(':') - fp.write(pickle.dumps( - (name, timestamp, size, content_type, etag, deleted), - protocol=PICKLE_PROTOCOL).encode('base64')) - fp.flush() - - def is_deleted(self, timestamp=None): - """ - Check if the DB is considered to be deleted. - - :returns: True if the DB is considered to be deleted, False otherwise - """ - if self.db_file != ':memory:' and not os.path.exists(self.db_file): - return True - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - row = conn.execute(''' - SELECT put_timestamp, delete_timestamp, object_count - FROM container_stat''').fetchone() - # leave this db as a tombstone for a consistency window - if timestamp and row['delete_timestamp'] > timestamp: - return False - # The container is considered deleted if the delete_timestamp - # value is greater than the put_timestamp, and there are no - # objects in the container. - return (row['object_count'] in (None, '', 0, '0')) and \ - (float(row['delete_timestamp']) > float(row['put_timestamp'])) - - def get_info(self, include_metadata=False): - """ - Get global data for the container. - - :returns: dict with keys: account, container, created_at, - put_timestamp, delete_timestamp, object_count, bytes_used, - reported_put_timestamp, reported_delete_timestamp, - reported_object_count, reported_bytes_used, hash, id, - x_container_sync_point1, and x_container_sync_point2. 
- If include_metadata is set, metadata is included as a key - pointing to a dict of tuples of the metadata - """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - data = None - trailing1 = 'metadata' - trailing2 = 'x_container_sync_point1, x_container_sync_point2' - while not data: - try: - data = conn.execute(''' - SELECT account, container, created_at, put_timestamp, - delete_timestamp, object_count, bytes_used, - reported_put_timestamp, reported_delete_timestamp, - reported_object_count, reported_bytes_used, hash, - id, %s, %s - FROM container_stat - ''' % (trailing1, trailing2)).fetchone() - except sqlite3.OperationalError, err: - if 'no such column: metadata' in str(err): - trailing1 = "'' as metadata" - elif 'no such column: x_container_sync_point' in str(err): - trailing2 = '-1 AS x_container_sync_point1, ' \ - '-1 AS x_container_sync_point2' - else: - raise - data = dict(data) - if include_metadata: - try: - data['metadata'] = json.loads(data.get('metadata', '')) - except ValueError: - data['metadata'] = {} - elif 'metadata' in data: - del data['metadata'] - return data - - def set_x_container_sync_points(self, sync_point1, sync_point2): - with self.get() as conn: - orig_isolation_level = conn.isolation_level - try: - # We turn off auto-transactions to ensure the alter table - # commands are part of the transaction. - conn.isolation_level = None - conn.execute('BEGIN') - try: - self._set_x_container_sync_points(conn, sync_point1, - sync_point2) - except sqlite3.OperationalError, err: - if 'no such column: x_container_sync_point' not in \ - str(err): - raise - conn.execute(''' - ALTER TABLE container_stat - ADD COLUMN x_container_sync_point1 INTEGER DEFAULT -1 - ''') - conn.execute(''' - ALTER TABLE container_stat - ADD COLUMN x_container_sync_point2 INTEGER DEFAULT -1 - ''') - self._set_x_container_sync_points(conn, sync_point1, - sync_point2) - conn.execute('COMMIT') - finally: - conn.isolation_level = orig_isolation_level - - def _set_x_container_sync_points(self, conn, sync_point1, sync_point2): - if sync_point1 is not None and sync_point2 is not None: - conn.execute(''' - UPDATE container_stat - SET x_container_sync_point1 = ?, - x_container_sync_point2 = ? - ''', (sync_point1, sync_point2)) - elif sync_point1 is not None: - conn.execute(''' - UPDATE container_stat - SET x_container_sync_point1 = ? - ''', (sync_point1,)) - elif sync_point2 is not None: - conn.execute(''' - UPDATE container_stat - SET x_container_sync_point2 = ? - ''', (sync_point2,)) - - def reported(self, put_timestamp, delete_timestamp, object_count, - bytes_used): - """ - Update reported stats. - - :param put_timestamp: put_timestamp to update - :param delete_timestamp: delete_timestamp to update - :param object_count: object_count to update - :param bytes_used: bytes_used to update - """ - with self.get() as conn: - conn.execute(''' - UPDATE container_stat - SET reported_put_timestamp = ?, reported_delete_timestamp = ?, - reported_object_count = ?, reported_bytes_used = ? - ''', (put_timestamp, delete_timestamp, object_count, bytes_used)) - conn.commit() - - def list_objects_iter(self, limit, marker, end_marker, prefix, delimiter, - path=None): - """ - Get a list of objects sorted by name starting at marker onward, up - to limit entries. Entries will begin with the prefix and will not - have the delimiter after the prefix. 
- - :param limit: maximum number of entries to get - :param marker: marker query - :param end_marker: end marker query - :param prefix: prefix query - :param delimeter: delimeter for query - :param path: if defined, will set the prefix and delimter based on - the path - - :returns: list of tuples of (name, created_at, size, content_type, - etag) - """ - (marker, end_marker, prefix, delimiter, path) = utf8encode( - marker, end_marker, prefix, delimiter, path) - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - if path is not None: - prefix = path - if path: - prefix = path = path.rstrip('/') + '/' - delimiter = '/' - elif delimiter and not prefix: - prefix = '' - orig_marker = marker - with self.get() as conn: - results = [] - while len(results) < limit: - query = '''SELECT name, created_at, size, content_type, etag - FROM object WHERE''' - query_args = [] - if end_marker: - query += ' name < ? AND' - query_args.append(end_marker) - if marker and marker >= prefix: - query += ' name > ? AND' - query_args.append(marker) - elif prefix: - query += ' name >= ? AND' - query_args.append(prefix) - if self.get_db_version(conn) < 1: - query += ' +deleted = 0' - else: - query += ' deleted = 0' - query += ' ORDER BY name LIMIT ?' - query_args.append(limit - len(results)) - curs = conn.execute(query, query_args) - curs.row_factory = None - - if prefix is None: - return [r for r in curs] - if not delimiter: - return [r for r in curs if r[0].startswith(prefix)] - rowcount = 0 - for row in curs: - rowcount += 1 - marker = name = row[0] - if len(results) >= limit or not name.startswith(prefix): - curs.close() - return results - end = name.find(delimiter, len(prefix)) - if path is not None: - if name == path: - continue - if end >= 0 and len(name) > end + len(delimiter): - marker = name[:end] + chr(ord(delimiter) + 1) - curs.close() - break - elif end > 0: - marker = name[:end] + chr(ord(delimiter) + 1) - dir_name = name[:end + 1] - if dir_name != orig_marker: - results.append([dir_name, '0', 0, None, '']) - curs.close() - break - results.append(row) - if not rowcount: - break - return results - - def merge_items(self, item_list, source=None): - """ - Merge items into the object table. - - :param item_list: list of dictionaries of {'name', 'created_at', - 'size', 'content_type', 'etag', 'deleted'} - :param source: if defined, update incoming_sync with the source - """ - with self.get() as conn: - max_rowid = -1 - for rec in item_list: - query = ''' - DELETE FROM object - WHERE name = ? AND (created_at < ?) - ''' - if self.get_db_version(conn) >= 1: - query += ' AND deleted IN (0, 1)' - conn.execute(query, (rec['name'], rec['created_at'])) - query = 'SELECT 1 FROM object WHERE name = ?' - if self.get_db_version(conn) >= 1: - query += ' AND deleted IN (0, 1)' - if not conn.execute(query, (rec['name'],)).fetchall(): - conn.execute(''' - INSERT INTO object (name, created_at, size, - content_type, etag, deleted) - VALUES (?, ?, ?, ?, ?, ?) - ''', ([rec['name'], rec['created_at'], rec['size'], - rec['content_type'], rec['etag'], rec['deleted']])) - if source: - max_rowid = max(max_rowid, rec['ROWID']) - if source: - try: - conn.execute(''' - INSERT INTO incoming_sync (sync_point, remote_id) - VALUES (?, ?) - ''', (max_rowid, source)) - except sqlite3.IntegrityError: - conn.execute(''' - UPDATE incoming_sync SET sync_point=max(?, sync_point) - WHERE remote_id=? 
- ''', (max_rowid, source)) - conn.commit() - - -class AccountBroker(DatabaseBroker): - """Encapsulates working with a account database.""" - db_type = 'account' - db_contains_type = 'container' - - def _initialize(self, conn, put_timestamp): - """ - Create a brand new database (tables, indices, triggers, etc.) - - :param conn: DB connection object - :param put_timestamp: put timestamp - """ - if not self.account: - raise ValueError( - 'Attempting to create a new database with no account set') - self.create_container_table(conn) - self.create_account_stat_table(conn, put_timestamp) - - def create_container_table(self, conn): - """ - Create container table which is specific to the account DB. - - :param conn: DB connection object - """ - conn.executescript(""" - CREATE TABLE container ( - ROWID INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT, - put_timestamp TEXT, - delete_timestamp TEXT, - object_count INTEGER, - bytes_used INTEGER, - deleted INTEGER DEFAULT 0 - ); - - CREATE INDEX ix_container_deleted_name ON - container (deleted, name); - - CREATE TRIGGER container_insert AFTER INSERT ON container - BEGIN - UPDATE account_stat - SET container_count = container_count + (1 - new.deleted), - object_count = object_count + new.object_count, - bytes_used = bytes_used + new.bytes_used, - hash = chexor(hash, new.name, - new.put_timestamp || '-' || - new.delete_timestamp || '-' || - new.object_count || '-' || new.bytes_used); - END; - - CREATE TRIGGER container_update BEFORE UPDATE ON container - BEGIN - SELECT RAISE(FAIL, 'UPDATE not allowed; DELETE and INSERT'); - END; - - - CREATE TRIGGER container_delete AFTER DELETE ON container - BEGIN - UPDATE account_stat - SET container_count = container_count - (1 - old.deleted), - object_count = object_count - old.object_count, - bytes_used = bytes_used - old.bytes_used, - hash = chexor(hash, old.name, - old.put_timestamp || '-' || - old.delete_timestamp || '-' || - old.object_count || '-' || old.bytes_used); - END; - """) - - def create_account_stat_table(self, conn, put_timestamp): - """ - Create account_stat table which is specific to the account DB. - - :param conn: DB connection object - :param put_timestamp: put timestamp - """ - conn.executescript(""" - CREATE TABLE account_stat ( - account TEXT, - created_at TEXT, - put_timestamp TEXT DEFAULT '0', - delete_timestamp TEXT DEFAULT '0', - container_count INTEGER, - object_count INTEGER DEFAULT 0, - bytes_used INTEGER DEFAULT 0, - hash TEXT default '00000000000000000000000000000000', - id TEXT, - status TEXT DEFAULT '', - status_changed_at TEXT DEFAULT '0', - metadata TEXT DEFAULT '' - ); - - INSERT INTO account_stat (container_count) VALUES (0); - """) - - conn.execute(''' - UPDATE account_stat SET account = ?, created_at = ?, id = ?, - put_timestamp = ? - ''', (self.account, normalize_timestamp(time.time()), str(uuid4()), - put_timestamp)) - - def get_db_version(self, conn): - if self._db_version == -1: - self._db_version = 0 - for row in conn.execute(''' - SELECT name FROM sqlite_master - WHERE name = 'ix_container_deleted_name' '''): - self._db_version = 1 - return self._db_version - - def update_put_timestamp(self, timestamp): - """ - Update the put_timestamp. Only modifies it if it is greater than - the current timestamp. - - :param timestamp: put timestamp - """ - with self.get() as conn: - conn.execute(''' - UPDATE account_stat SET put_timestamp = ? - WHERE put_timestamp < ? 
''', (timestamp, timestamp)) + self._update_status_changed_at(conn, timestamp) conn.commit() - def _delete_db(self, conn, timestamp, force=False): - """ - Mark the DB as deleted. - - :param conn: DB connection object - :param timestamp: timestamp to mark as deleted - """ - conn.execute(""" - UPDATE account_stat - SET delete_timestamp = ?, - status = 'DELETED', - status_changed_at = ? - WHERE delete_timestamp < ? """, (timestamp, timestamp, timestamp)) - - def _commit_puts(self, item_list=None): - """Handles commiting rows in .pending files.""" - if self.db_file == ':memory:' or not os.path.exists(self.pending_file): - return - if item_list is None: - item_list = [] - with lock_parent_directory(self.pending_file, self.pending_timeout): - self._preallocate() - if not os.path.getsize(self.pending_file): - if item_list: - self.merge_items(item_list) - return - with open(self.pending_file, 'r+b') as fp: - for entry in fp.read().split(':'): - if entry: - try: - (name, put_timestamp, delete_timestamp, - object_count, bytes_used, deleted) = \ - pickle.loads(entry.decode('base64')) - item_list.append( - {'name': name, - 'put_timestamp': put_timestamp, - 'delete_timestamp': delete_timestamp, - 'object_count': object_count, - 'bytes_used': bytes_used, - 'deleted': deleted}) - except Exception: - self.logger.exception( - _('Invalid pending entry %(file)s: %(entry)s'), - {'file': self.pending_file, 'entry': entry}) - if item_list: - self.merge_items(item_list) - try: - os.ftruncate(fp.fileno(), 0) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - def empty(self): - """ - Check if the account DB is empty. - - :returns: True if the database has no active containers. - """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - row = conn.execute( - 'SELECT container_count from account_stat').fetchone() - return (row[0] == 0) - - def reclaim(self, container_timestamp, sync_timestamp): - """ - Delete rows from the container table that are marked deleted and - whose created_at timestamp is < container_timestamp. Also deletes rows - from incoming_sync and outgoing_sync where the updated_at timestamp is - < sync_timestamp. - - In addition, this calls the DatabaseBroker's :func:_reclaim method. - - :param container_timestamp: max created_at timestamp of container rows - to delete - :param sync_timestamp: max update_at timestamp of sync rows to delete - """ - - self._commit_puts() - with self.get() as conn: - conn.execute(''' - DELETE FROM container WHERE - deleted = 1 AND delete_timestamp < ? - ''', (container_timestamp,)) - try: - conn.execute(''' - DELETE FROM outgoing_sync WHERE updated_at < ? - ''', (sync_timestamp,)) - conn.execute(''' - DELETE FROM incoming_sync WHERE updated_at < ? - ''', (sync_timestamp,)) - except sqlite3.OperationalError, err: - # Old dbs didn't have updated_at in the _sync tables. - if 'no such column: updated_at' not in str(err): - raise - DatabaseBroker._reclaim(self, conn, container_timestamp) - conn.commit() - - def get_container_timestamp(self, container_name): - """ - Get the put_timestamp of a container. - - :param container_name: container name - - :returns: put_timestamp of the container - """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - ret = conn.execute(''' - SELECT put_timestamp FROM container - WHERE name = ? 
AND deleted != 1''', - (container_name,)).fetchone() - if ret: - ret = ret[0] - return ret - - def put_container(self, name, put_timestamp, delete_timestamp, - object_count, bytes_used): - """ - Create a container with the given attributes. - - :param name: name of the container to create - :param put_timestamp: put_timestamp of the container to create - :param delete_timestamp: delete_timestamp of the container to create - :param object_count: number of objects in the container - :param bytes_used: number of bytes used by the container - """ - if delete_timestamp > put_timestamp and \ - object_count in (None, '', 0, '0'): - deleted = 1 - else: - deleted = 0 - record = {'name': name, 'put_timestamp': put_timestamp, - 'delete_timestamp': delete_timestamp, - 'object_count': object_count, - 'bytes_used': bytes_used, - 'deleted': deleted} - if self.db_file == ':memory:': - self.merge_items([record]) - return - if not os.path.exists(self.db_file): - raise DatabaseConnectionError(self.db_file, "DB doesn't exist") - pending_size = 0 - try: - pending_size = os.path.getsize(self.pending_file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - if pending_size > PENDING_CAP: - self._commit_puts([record]) - else: - with lock_parent_directory(self.pending_file, - self.pending_timeout): - with open(self.pending_file, 'a+b') as fp: - # Colons aren't used in base64 encoding; so they are our - # delimiter - fp.write(':') - fp.write(pickle.dumps( - (name, put_timestamp, delete_timestamp, object_count, - bytes_used, deleted), - protocol=PICKLE_PROTOCOL).encode('base64')) - fp.flush() - - def can_delete_db(self, cutoff): - """ - Check if the accont DB can be deleted. - - :returns: True if the account can be deleted, False otherwise - """ - self._commit_puts() - with self.get() as conn: - row = conn.execute(''' - SELECT status, put_timestamp, delete_timestamp, container_count - FROM account_stat''').fetchone() - # The account is considered deleted if its status is marked - # as 'DELETED" and the delete_timestamp is older than the supplied - # cutoff date; or if the delete_timestamp value is greater than - # the put_timestamp, and there are no containers for the account - status_del = (row['status'] == 'DELETED') - deltime = float(row['delete_timestamp']) - past_cutoff = (deltime < cutoff) - time_later = (row['delete_timestamp'] > row['put_timestamp']) - no_containers = (row['container_count'] in (None, '', 0, '0')) - return ( - (status_del and past_cutoff) or (time_later and no_containers)) - - def is_deleted(self): - """ - Check if the account DB is considered to be deleted. - - :returns: True if the account DB is considered to be deleted, False - otherwise - """ - if self.db_file != ':memory:' and not os.path.exists(self.db_file): - return True - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - row = conn.execute(''' - SELECT put_timestamp, delete_timestamp, container_count, status - FROM account_stat''').fetchone() - return row['status'] == 'DELETED' or ( - row['container_count'] in (None, '', 0, '0') and - row['delete_timestamp'] > row['put_timestamp']) - - def is_status_deleted(self): - """Only returns true if the status field is set to DELETED.""" - with self.get() as conn: - row = conn.execute(''' - SELECT status - FROM account_stat''').fetchone() - return (row['status'] == "DELETED") - - def get_info(self): - """ - Get global data for the account. 
- - :returns: dict with keys: account, created_at, put_timestamp, - delete_timestamp, container_count, object_count, - bytes_used, hash, id - """ - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - with self.get() as conn: - return dict(conn.execute(''' - SELECT account, created_at, put_timestamp, delete_timestamp, - container_count, object_count, bytes_used, hash, id - FROM account_stat - ''').fetchone()) - - def list_containers_iter(self, limit, marker, end_marker, prefix, - delimiter): - """ - Get a list of containerss sorted by name starting at marker onward, up - to limit entries. Entries will begin with the prefix and will not - have the delimiter after the prefix. - - :param limit: maximum number of entries to get - :param marker: marker query - :param end_marker: end marker query - :param prefix: prefix query - :param delimiter: delimiter for query - - :returns: list of tuples of (name, object_count, bytes_used, 0) - """ - (marker, end_marker, prefix, delimiter) = utf8encode( - marker, end_marker, prefix, delimiter) - try: - self._commit_puts() - except LockTimeout: - if not self.stale_reads_ok: - raise - if delimiter and not prefix: - prefix = '' - orig_marker = marker - with self.get() as conn: - results = [] - while len(results) < limit: - query = """ - SELECT name, object_count, bytes_used, 0 - FROM container - WHERE deleted = 0 AND """ - query_args = [] - if end_marker: - query += ' name < ? AND' - query_args.append(end_marker) - if marker and marker >= prefix: - query += ' name > ? AND' - query_args.append(marker) - elif prefix: - query += ' name >= ? AND' - query_args.append(prefix) - if self.get_db_version(conn) < 1: - query += ' +deleted = 0' - else: - query += ' deleted = 0' - query += ' ORDER BY name LIMIT ?' - query_args.append(limit - len(results)) - curs = conn.execute(query, query_args) - curs.row_factory = None - - if prefix is None: - return [r for r in curs] - if not delimiter: - return [r for r in curs if r[0].startswith(prefix)] - rowcount = 0 - for row in curs: - rowcount += 1 - marker = name = row[0] - if len(results) >= limit or not name.startswith(prefix): - curs.close() - return results - end = name.find(delimiter, len(prefix)) - if end > 0: - marker = name[:end] + chr(ord(delimiter) + 1) - dir_name = name[:end + 1] - if dir_name != orig_marker: - results.append([dir_name, 0, 0, 1]) - curs.close() - break - results.append(row) - if not rowcount: - break - return results - - def merge_items(self, item_list, source=None): - """ - Merge items into the container table. - - :param item_list: list of dictionaries of {'name', 'put_timestamp', - 'delete_timestamp', 'object_count', 'bytes_used', - 'deleted'} - :param source: if defined, update incoming_sync with the source - """ - with self.get() as conn: - max_rowid = -1 - for rec in item_list: - record = [rec['name'], rec['put_timestamp'], - rec['delete_timestamp'], rec['object_count'], - rec['bytes_used'], rec['deleted']] - query = ''' - SELECT name, put_timestamp, delete_timestamp, - object_count, bytes_used, deleted - FROM container WHERE name = ? 
- ''' - if self.get_db_version(conn) >= 1: - query += ' AND deleted IN (0, 1)' - curs = conn.execute(query, (rec['name'],)) - curs.row_factory = None - row = curs.fetchone() - if row: - row = list(row) - for i in xrange(5): - if record[i] is None and row[i] is not None: - record[i] = row[i] - if row[1] > record[1]: # Keep newest put_timestamp - record[1] = row[1] - if row[2] > record[2]: # Keep newest delete_timestamp - record[2] = row[2] - # If deleted, mark as such - if record[2] > record[1] and \ - record[3] in (None, '', 0, '0'): - record[5] = 1 - else: - record[5] = 0 - conn.execute(''' - DELETE FROM container WHERE name = ? AND - deleted IN (0, 1) - ''', (record[0],)) - conn.execute(''' - INSERT INTO container (name, put_timestamp, - delete_timestamp, object_count, bytes_used, - deleted) - VALUES (?, ?, ?, ?, ?, ?) - ''', record) - if source: - max_rowid = max(max_rowid, rec['ROWID']) - if source: - try: - conn.execute(''' - INSERT INTO incoming_sync (sync_point, remote_id) - VALUES (?, ?) - ''', (max_rowid, source)) - except sqlite3.IntegrityError: - conn.execute(''' - UPDATE incoming_sync SET sync_point=max(?, sync_point) - WHERE remote_id=? - ''', (max_rowid, source)) - conn.commit() + def _update_status_changed_at(self, conn, timestamp): + conn.execute( + 'UPDATE %s_stat SET status_changed_at = ?' + ' WHERE status_changed_at < ?' % self.db_type, + (timestamp, timestamp)) diff --git a/swift/common/db_auditor.py b/swift/common/db_auditor.py new file mode 100644 index 0000000000..5a4d7f7831 --- /dev/null +++ b/swift/common/db_auditor.py @@ -0,0 +1,168 @@ +# Copyright (c) 2010-2018 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import time +from random import random + +from eventlet import Timeout + +import swift.common.db +from swift.common.utils import get_logger, audit_location_generator, \ + config_true_value, dump_recon_cache, EventletRateLimiter +from swift.common.daemon import Daemon +from swift.common.exceptions import DatabaseAuditorException +from swift.common.recon import DEFAULT_RECON_CACHE_PATH, \ + server_type_to_recon_file + + +class DatabaseAuditor(Daemon): + """Base Database Auditor.""" + + @property + def rcache(self): + return os.path.join( + self.recon_cache_path, + server_type_to_recon_file(self.server_type)) + + @property + def server_type(self): + raise NotImplementedError + + @property + def broker_class(self): + raise NotImplementedError + + def __init__(self, conf, logger=None): + self.conf = conf + self.logger = logger or get_logger(conf, log_route='{}-auditor'.format( + self.server_type)) + self.devices = conf.get('devices', '/srv/node') + self.mount_check = config_true_value(conf.get('mount_check', 'true')) + self.interval = float(conf.get('interval', 1800)) + self.logging_interval = 3600 # once an hour + self.passes = 0 + self.failures = 0 + self.max_dbs_per_second = \ + float(conf.get('{}s_per_second'.format(self.server_type), 200)) + self.rate_limiter = EventletRateLimiter(self.max_dbs_per_second) + swift.common.db.DB_PREALLOCATION = \ + config_true_value(conf.get('db_preallocation', 'f')) + self.recon_cache_path = conf.get('recon_cache_path', + DEFAULT_RECON_CACHE_PATH) + self.datadir = '{}s'.format(self.server_type) + + def _one_audit_pass(self, reported): + all_locs = audit_location_generator(self.devices, self.datadir, '.db', + mount_check=self.mount_check, + logger=self.logger) + for path, device, partition in all_locs: + self.audit(path) + if time.time() - reported >= self.logging_interval: + self.logger.info( + 'Since %(time)s: %(server_type)s audits: %(pass)s ' + 'passed audit, %(fail)s failed audit', + {'time': time.ctime(reported), + 'pass': self.passes, + 'fail': self.failures, + 'server_type': self.server_type}) + dump_recon_cache( + {'{}_audits_since'.format(self.server_type): reported, + '{}_audits_passed'.format(self.server_type): self.passes, + '{}_audits_failed'.format(self.server_type): + self.failures}, + self.rcache, self.logger) + reported = time.time() + self.passes = 0 + self.failures = 0 + self.rate_limiter.wait() + return reported + + def run_forever(self, *args, **kwargs): + """Run the database audit until stopped.""" + reported = time.time() + time.sleep(random() * self.interval) + while True: + self.logger.info( + 'Begin %s audit pass.', self.server_type) + begin = time.time() + try: + reported = self._one_audit_pass(reported) + except (Exception, Timeout): + self.logger.increment('errors') + self.logger.exception('ERROR auditing') + elapsed = time.time() - begin + self.logger.info( + '%(server_type)s audit pass completed: %(elapsed).02fs', + {'elapsed': elapsed, 'server_type': self.server_type.title()}) + dump_recon_cache({ + '{}_auditor_pass_completed'.format(self.server_type): elapsed}, + self.rcache, self.logger) + if elapsed < self.interval: + time.sleep(self.interval - elapsed) + + def run_once(self, *args, **kwargs): + """Run the database audit once.""" + self.logger.info( + 'Begin %s audit "once" mode', self.server_type) + begin = reported = time.time() + self._one_audit_pass(reported) + elapsed = time.time() - begin + self.logger.info( + '%(server_type)s audit "once" mode completed: %(elapsed).02fs', + {'elapsed': elapsed, 
'server_type': self.server_type.title()}) + dump_recon_cache( + {'{}_auditor_pass_completed'.format(self.server_type): elapsed}, + self.rcache, self.logger) + + def audit(self, path): + """ + Audits the given database path + + :param path: the path to a db + """ + start_time = time.time() + try: + broker = self.broker_class(path, logger=self.logger) + if not broker.is_deleted(): + info = broker.get_info() + err = self._audit(info, broker) + if err: + raise err + self.logger.increment('passes') + self.passes += 1 + self.logger.debug('Audit passed for %s', broker) + except DatabaseAuditorException as e: + self.logger.increment('failures') + self.failures += 1 + self.logger.error('Audit Failed for %(path)s: %(err)s', + {'path': path, 'err': str(e)}) + except (Exception, Timeout): + self.logger.increment('failures') + self.failures += 1 + self.logger.exception( + 'ERROR Could not get %(server_type)s info %(path)s', + {'server_type': self.server_type, 'path': path}) + self.logger.timing_since('timing', start_time) + + def _audit(self, info, broker): + """ + Run any additional audit checks in sub auditor classes + + :param info: The DB _info + :param broker: The broker + :return: None on success, otherwise an exception to throw. + """ + raise NotImplementedError diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index ed16177ed5..37403bd03f 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
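The _audit hook at the end of the new swift/common/db_auditor.py above is the extension point for concrete auditors: a subclass supplies server_type and broker_class and returns either None or an exception from _audit. The sketch below is illustrative only and not part of the patch; the StatsCheckAuditor name and its negative-object_count check are hypothetical, though the override pattern matches how account/container auditors are expected to subclass this base.

from swift.common.db_auditor import DatabaseAuditor
from swift.common.exceptions import DatabaseAuditorException
from swift.container.backend import ContainerBroker


class StatsCheckAuditor(DatabaseAuditor):
    # Plain class attributes shadow the base class properties.
    server_type = 'container'       # drives logger name, recon file, datadir
    broker_class = ContainerBroker  # broker used to open each audited .db

    def _audit(self, info, broker):
        # info is broker.get_info(); return None on success, or an exception
        # for the base class audit() method to raise and count as a failure.
        if info.get('object_count') is not None and info['object_count'] < 0:
            return DatabaseAuditorException(
                'negative object_count for %s' % broker.db_file)
        return None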
-from __future__ import with_statement +import json +import logging import os import random import math @@ -22,22 +23,31 @@ import uuid import errno import re +from contextlib import contextmanager from eventlet import GreenPool, sleep, Timeout from eventlet.green import subprocess -import simplejson import swift.common.db +from swift.common.constraints import check_drive from swift.common.utils import get_logger, whataremyips, storage_directory, \ renamer, mkdirs, lock_parent_directory, config_true_value, \ - unlink_older_than, dump_recon_cache, rsync_ip + unlink_older_than, dump_recon_cache, rsync_module_interpolation, \ + parse_override_options, round_robin_iter, Everything, get_db_files, \ + parse_db_filename, quote, RateLimitedIterator, config_auto_int_value, \ + listdir, unlink_paths_older_than + from swift.common import ring -from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE +from swift.common.ring.utils import is_local_device +from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \ + is_success from swift.common.bufferedhttp import BufferedHTTPConnection -from swift.common.exceptions import DriveNotMounted, ConnectionTimeout +from swift.common.exceptions import DriveNotMounted from swift.common.daemon import Daemon from swift.common.swob import Response, HTTPNotFound, HTTPNoContent, \ - HTTPAccepted, HTTPInsufficientStorage, HTTPBadRequest + HTTPAccepted, HTTPBadRequest +from swift.common.recon import DEFAULT_RECON_CACHE_PATH, \ + server_type_to_recon_file DEBUG_TIMINGS_THRESHOLD = 10 @@ -57,12 +67,23 @@ def quarantine_db(object_file, server_type): os.path.join(object_dir, '..', '..', '..', '..', 'quarantined', server_type + 's', os.path.basename(object_dir))) try: - renamer(object_dir, quarantine_dir) - except OSError, e: + renamer(object_dir, quarantine_dir, fsync=False) + except OSError as e: if e.errno not in (errno.EEXIST, errno.ENOTEMPTY): raise quarantine_dir = "%s-%s" % (quarantine_dir, uuid.uuid4().hex) - renamer(object_dir, quarantine_dir) + renamer(object_dir, quarantine_dir, fsync=False) + + +def looks_like_partition(dir_name): + """ + True if the directory name is a valid partition number, False otherwise. + """ + try: + part = int(dir_name) + return part >= 0 + except ValueError: + return False def roundrobin_datadirs(datadirs): @@ -72,38 +93,59 @@ def roundrobin_datadirs(datadirs): found (in their proper places). The partitions within each data dir are walked randomly, however. - :param datadirs: a list of (path, node_id) to walk - :returns: A generator of (partition, path_to_db_file, node_id) + :param datadirs: a list of tuples of (path, context, partition_filter) to + walk. The context may be any object; the context is not + used by this function but is included with each yielded + tuple. 
+ :returns: A generator of (partition, path_to_db_file, context) """ - def walk_datadir(datadir, node_id): - partitions = os.listdir(datadir) + def walk_datadir(datadir, context, part_filter): + partitions = [pd for pd in os.listdir(datadir) + if looks_like_partition(pd) and part_filter(pd)] random.shuffle(partitions) for partition in partitions: part_dir = os.path.join(datadir, partition) if not os.path.isdir(part_dir): continue suffixes = os.listdir(part_dir) + if not suffixes: + os.rmdir(part_dir) + continue for suffix in suffixes: suff_dir = os.path.join(part_dir, suffix) if not os.path.isdir(suff_dir): continue hashes = os.listdir(suff_dir) + if not hashes: + os.rmdir(suff_dir) + continue for hsh in hashes: hash_dir = os.path.join(suff_dir, hsh) if not os.path.isdir(hash_dir): continue object_file = os.path.join(hash_dir, hsh + '.db') + # common case if os.path.exists(object_file): - yield (partition, object_file, node_id) + yield (partition, object_file, context) + continue + # look for any alternate db filenames + db_files = get_db_files(object_file) + if db_files: + yield (partition, db_files[-1], context) + continue + try: + os.rmdir(hash_dir) + except OSError as e: + if e.errno != errno.ENOTEMPTY: + raise - its = [walk_datadir(datadir, node_id) for datadir, node_id in datadirs] - while its: - for it in its: - try: - yield it.next() - except StopIteration: - its.remove(it) + its = [walk_datadir(datadir, context, filt) + for datadir, context, filt in datadirs] + + rr_its = round_robin_iter(its) + for datadir in rr_its: + yield datadir class ReplConnection(BufferedHTTPConnection): @@ -112,10 +154,10 @@ class ReplConnection(BufferedHTTPConnection): """ def __init__(self, node, partition, hash_, logger): - "" self.logger = logger self.node = node - BufferedHTTPConnection.__init__(self, '%(ip)s:%(port)s' % node) + host = "%s:%s" % (node['replication_ip'], node['replication_port']) + BufferedHTTPConnection.__init__(self, host) self.path = '/%s/%s/%s' % (node['device'], partition, hash_) def replicate(self, *args): @@ -124,90 +166,193 @@ def replicate(self, *args): :param args: list of json-encodable objects - :returns: httplib response object + :returns: bufferedhttp response object """ try: - body = simplejson.dumps(args) + body = json.dumps(args) self.request('REPLICATE', self.path, body, {'Content-Type': 'application/json'}) response = self.getresponse() response.data = response.read() return response except (Exception, Timeout): + self.close() self.logger.exception( - _('ERROR reading HTTP response from %s'), self.node) + 'ERROR reading HTTP response from %s', self.node) return None +class BrokerAnnotatedLogger: + """ + Formats log messages with broker details. + + This class augments messages with the broker's container path and DB + file path so that logs are easier to correlate during replication + and sharding workflows. + """ + def __init__(self, logger): + self.logger = logger + + def _get_broker_details(self, broker): + try: + db_file = broker.db_file + except Exception: # noqa + db_file = '' + try: + path = broker.path + except Exception: # noqa + path = '' + return path, db_file + + def _format_log_msg(self, broker, msg, *args): + # make best effort to include broker properties... 
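# (Illustrative, not part of the patch: for a container broker the formatted
# message ends up looking like
#   "Quarantining DB, path: AUTH_test/c1, db: /srv/node/sdb1/containers/.../<hash>.db"
# where the example account/container and device paths are hypothetical; if
# the broker cannot report a value, the corresponding field is left empty.)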
+ path, db_file = self._get_broker_details(broker) + if args: + if len(args) == 1 and isinstance(args[0], dict): + args = args[0] + msg = msg % args + return '%s, path: %s, db: %s' % (msg, quote(path), db_file) + + def _log(self, level, broker, msg, *args, **kwargs): + if not self.logger.isEnabledFor(level): + return + self.logger.log(level, self._format_log_msg(broker, msg, *args)) + + def debug(self, broker, msg, *args, **kwargs): + self._log(logging.DEBUG, broker, msg, *args, **kwargs) + + def info(self, broker, msg, *args, **kwargs): + self._log(logging.INFO, broker, msg, *args, **kwargs) + + def warning(self, broker, msg, *args, **kwargs): + self._log(logging.WARNING, broker, msg, *args, **kwargs) + + def error(self, broker, msg, *args, **kwargs): + self._log(logging.ERROR, broker, msg, *args, **kwargs) + + def exception(self, broker, msg, *args, **kwargs): + if not self.logger.isEnabledFor(logging.ERROR): + return + self.logger.exception(self._format_log_msg(broker, msg, *args)) + + class Replicator(Daemon): """ Implements the logic for directing db replication. """ - def __init__(self, conf): + def __init__(self, conf, logger=None): self.conf = conf - self.logger = get_logger(conf, log_route='replicator') + self.logger = logger or get_logger(conf, log_route='replicator') self.root = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) + self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.port = int(conf.get('bind_port', self.default_port)) concurrency = int(conf.get('concurrency', 8)) self.cpool = GreenPool(size=concurrency) swift_dir = conf.get('swift_dir', '/etc/swift') self.ring = ring.Ring(swift_dir, ring_name=self.server_type) + self._local_device_ids = {} self.per_diff = int(conf.get('per_diff', 1000)) self.max_diffs = int(conf.get('max_diffs') or 100) - self.interval = int(conf.get('interval') or - conf.get('run_pause') or 30) - self.vm_test_mode = config_true_value(conf.get('vm_test_mode', 'no')) - self.node_timeout = int(conf.get('node_timeout', 10)) + self.interval = float(conf.get('interval') or + conf.get('run_pause') or 30) + if 'run_pause' in conf: + if 'interval' in conf: + self.logger.warning( + 'Option %(type)s-replicator/run_pause is deprecated ' + 'and %(type)s-replicator/interval is already configured. ' + 'You can safely remove run_pause; it is now ignored and ' + 'will be removed in a future version.' + % {'type': self.server_type}) + else: + self.logger.warning( + 'Option %(type)s-replicator/run_pause is deprecated ' + 'and will be removed in a future version. ' + 'Update your configuration to use option ' + '%(type)s-replicator/interval.' 
+ % {'type': self.server_type}) + self.databases_per_second = float( + conf.get('databases_per_second', 50)) + self.node_timeout = float(conf.get('node_timeout', 10)) self.conn_timeout = float(conf.get('conn_timeout', 0.5)) + self.rsync_compress = config_true_value( + conf.get('rsync_compress', 'no')) + self.rsync_module = conf.get('rsync_module', '').rstrip('/') + if not self.rsync_module: + self.rsync_module = '{replication_ip}::%s' % self.server_type self.reclaim_age = float(conf.get('reclaim_age', 86400 * 7)) swift.common.db.DB_PREALLOCATION = \ config_true_value(conf.get('db_preallocation', 'f')) + swift.common.db.QUERY_LOGGING = \ + config_true_value(conf.get('db_query_logging', 'f')) self._zero_stats() self.recon_cache_path = conf.get('recon_cache_path', - '/var/cache/swift') - self.recon_replicator = '%s.recon' % self.server_type + DEFAULT_RECON_CACHE_PATH) + self.recon_replicator = server_type_to_recon_file(self.server_type) self.rcache = os.path.join(self.recon_cache_path, self.recon_replicator) self.extract_device_re = re.compile('%s%s([^%s]+)' % ( self.root, os.path.sep, os.path.sep)) + self.handoffs_only = config_true_value(conf.get('handoffs_only', 'no')) + self.handoff_delete = config_auto_int_value( + conf.get('handoff_delete', 'auto'), 0) + if self.handoff_delete >= self.ring.replica_count: + self.logger.warning( + 'handoff_delete=%d is too high to have an effect on a ring ' + 'with replica count %d. Disabling.', + self.handoff_delete, self.ring.replica_count) + self.handoff_delete = 0 + self.db_logger = BrokerAnnotatedLogger(logger=self.logger) def _zero_stats(self): """Zero out the stats.""" self.stats = {'attempted': 0, 'success': 0, 'failure': 0, 'ts_repl': 0, 'no_change': 0, 'hashmatch': 0, 'rsync': 0, 'diff': 0, 'remove': 0, 'empty': 0, 'remote_merge': 0, - 'start': time.time(), 'diff_capped': 0} + 'start': time.time(), 'diff_capped': 0, 'deferred': 0, + 'failure_nodes': {}} def _report_stats(self): """Report the current stats to the logs.""" + now = time.time() self.logger.info( - _('Attempted to replicate %(count)d dbs in %(time).5f seconds ' - '(%(rate).5f/s)'), + 'Attempted to replicate %(count)d dbs in %(time).5f seconds ' + '(%(rate).5f/s)', {'count': self.stats['attempted'], - 'time': time.time() - self.stats['start'], + 'time': now - self.stats['start'], 'rate': self.stats['attempted'] / - (time.time() - self.stats['start'] + 0.0000001)}) - self.logger.info(_('Removed %(remove)d dbs') % self.stats) - self.logger.info(_('%(success)s successes, %(failure)s failures') - % self.stats) + (now - self.stats['start'] + 0.0000001)}) + self.logger.info('Removed %(remove)d dbs', self.stats) + self.logger.info('%(success)s successes, %(failure)s failures', + self.stats) dump_recon_cache( {'replication_stats': self.stats, - 'replication_time': time.time() - self.stats['start']}, + 'replication_time': now - self.stats['start'], + 'replication_last': now}, self.rcache, self.logger) self.logger.info(' '.join(['%s:%s' % item for item in - self.stats.items() if item[0] in + sorted(self.stats.items()) if item[0] in ('no_change', 'hashmatch', 'rsync', 'diff', 'ts_repl', - 'empty', 'diff_capped')])) + 'empty', 'diff_capped', 'remote_merge')])) + + def _add_failure_stats(self, failure_devs_info): + for node, dev in failure_devs_info: + self.stats['failure'] += 1 + failure_devs = self.stats['failure_nodes'].setdefault(node, {}) + failure_devs.setdefault(dev, 0) + failure_devs[dev] += 1 - def _rsync_file(self, db_file, remote_file, whole_file=True): + def _rsync_file(self, broker, 
remote_file, whole_file=True, + different_region=False): """ Sync a single file using rsync. Used by _rsync_db to handle syncing. - :param db_file: file to be synced + :param broker: DB broker object of DB to be synced :param remote_file: remote location to sync the DB file to :param whole-file: if True, uses rsync's --whole-file flag + :param different_region: if True, the destination node is in a + different region :returns: True if the sync was successful, False otherwise """ @@ -216,16 +361,25 @@ def _rsync_file(self, db_file, remote_file, whole_file=True): '--contimeout=%s' % int(math.ceil(self.conn_timeout))] if whole_file: popen_args.append('--whole-file') - popen_args.extend([db_file, remote_file]) + + if self.rsync_compress and different_region: + # Allow for compression, but only if the remote node is in + # a different region than the local one. + popen_args.append('--compress') + + popen_args.extend([broker.db_file, remote_file]) proc = subprocess.Popen(popen_args) proc.communicate() if proc.returncode != 0: - self.logger.error(_('ERROR rsync failed with %(code)s: %(args)s'), - {'code': proc.returncode, 'args': popen_args}) + self.db_logger.error( + broker, + 'ERROR rsync failed with %s: %r', + proc.returncode, popen_args) return proc.returncode == 0 def _rsync_db(self, broker, device, http, local_id, - replicate_method='complete_rsync', replicate_timeout=None): + replicate_method='complete_rsync', replicate_timeout=None, + different_region=False): """ Sync a whole db using rsync. @@ -235,28 +389,38 @@ def _rsync_db(self, broker, device, http, local_id, :param local_id: unique ID of the local database replica :param replicate_method: remote operation to perform after rsync :param replicate_timeout: timeout to wait in seconds + :param different_region: if True, the destination node is in a + different region """ - device_ip = rsync_ip(device['ip']) - if self.vm_test_mode: - remote_file = '%s::%s%s/%s/tmp/%s' % ( - device_ip, self.server_type, device['port'], device['device'], - local_id) - else: - remote_file = '%s::%s/%s/tmp/%s' % ( - device_ip, self.server_type, device['device'], local_id) + rsync_module = rsync_module_interpolation(self.rsync_module, device) + rsync_path = '%s/tmp/%s' % (device['device'], local_id) + remote_file = '%s/%s' % (rsync_module, rsync_path) mtime = os.path.getmtime(broker.db_file) - if not self._rsync_file(broker.db_file, remote_file): + if not self._rsync_file(broker, remote_file, + different_region=different_region): return False # perform block-level sync if the db was modified during the first sync if os.path.exists(broker.db_file + '-journal') or \ os.path.getmtime(broker.db_file) > mtime: # grab a lock so nobody else can modify it with broker.lock(): - if not self._rsync_file(broker.db_file, remote_file, False): + if not self._rsync_file(broker, remote_file, whole_file=False, + different_region=different_region): return False with Timeout(replicate_timeout or self.node_timeout): - response = http.replicate(replicate_method, local_id) - return response and response.status >= 200 and response.status < 300 + response = http.replicate(replicate_method, local_id, + os.path.basename(broker.db_file)) + return response and 200 <= response.status < 300 + + def _send_replicate_request(self, http, *repl_args): + with Timeout(self.node_timeout): + response = http.replicate(*repl_args) + if not response or not is_success(response.status): + if response: + self.logger.error('ERROR Bad response %s from %s', + response.status, http.host) + return False + 
return True def _usync_db(self, point, broker, http, remote_id, local_id): """ @@ -272,34 +436,43 @@ def _usync_db(self, point, broker, http, remote_id, local_id): """ self.stats['diff'] += 1 self.logger.increment('diffs') - self.logger.debug(_('Syncing chunks with %s'), http.host) + self.db_logger.debug( + broker, + 'usyncing chunks to %s, starting at row %s', + '%(ip)s:%(port)s/%(device)s' % http.node, point) + start = time.time() sync_table = broker.get_syncs() objects = broker.get_items_since(point, self.per_diff) diffs = 0 while len(objects) and diffs < self.max_diffs: diffs += 1 - with Timeout(self.node_timeout): - response = http.replicate('merge_items', objects, local_id) - if not response or response.status >= 300 or response.status < 200: - if response: - self.logger.error(_('ERROR Bad response %(status)s from ' - '%(host)s'), - {'status': response.status, - 'host': http.host}) + if not self._send_replicate_request( + http, 'merge_items', objects, local_id): return False + # replication relies on db order to send the next merge batch in + # order with no gaps point = objects[-1]['ROWID'] objects = broker.get_items_since(point, self.per_diff) + + self.db_logger.debug( + broker, + 'usyncing chunks to %s, finished at row %s (%gs)', + '%(ip)s:%(port)s/%(device)s' % http.node, + point, + time.time() - start + ) + if objects: - self.logger.debug(_( - 'Synchronization for %s has fallen more than ' - '%s rows behind; moving on and will try again next pass.') % - (broker.db_file, self.max_diffs * self.per_diff)) + self.db_logger.debug( + broker, 'Synchronization has fallen more than ' + '%s rows behind; moving on and will try again next pass', + self.max_diffs * self.per_diff) self.stats['diff_capped'] += 1 self.logger.increment('diff_caps') else: with Timeout(self.node_timeout): response = http.replicate('merge_syncs', sync_table) - if response and response.status >= 200 and response.status < 300: + if response and 200 <= response.status < 300: broker.merge_syncs([{'remote_id': remote_id, 'sync_point': point}], incoming=False) @@ -335,16 +508,24 @@ def _http_connect(self, node, partition, db_file): Make an http_connection using ReplConnection :param node: node dictionary from the ring - :param partition: partition partition to send in the url + :param partition: partition to send in the url :param db_file: DB file :returns: ReplConnection object """ - return ReplConnection(node, partition, - os.path.basename(db_file).split('.', 1)[0], - self.logger) + hsh, other, ext = parse_db_filename(db_file) + return ReplConnection(node, partition, hsh, self.logger) - def _repl_to_node(self, node, broker, partition, info): + def _gather_sync_args(self, info): + """ + Convert local replication_info to sync args tuple. + """ + sync_args_order = ('max_row', 'hash', 'id', 'created_at', + 'put_timestamp', 'delete_timestamp', 'metadata') + return tuple(info[key] for key in sync_args_order) + + def _repl_to_node(self, node, broker, partition, info, + different_region=False): """ Replicate a database to a node. 
@@ -354,44 +535,127 @@ def _repl_to_node(self, node, broker, partition, info): :param info: DB info as a dictionary of {'max_row', 'hash', 'id', 'created_at', 'put_timestamp', 'delete_timestamp', 'metadata'} + :param different_region: if True, the destination node is in a + different region :returns: True if successful, False otherwise """ - with ConnectionTimeout(self.conn_timeout): - http = self._http_connect(node, partition, broker.db_file) - if not http: - self.logger.error( - _('ERROR Unable to connect to remote server: %s'), node) - return False + http = self._http_connect(node, partition, broker.db_file) + sync_args = self._gather_sync_args(info) with Timeout(self.node_timeout): - response = http.replicate( - 'sync', info['max_row'], info['hash'], info['id'], - info['created_at'], info['put_timestamp'], - info['delete_timestamp'], info['metadata']) + response = http.replicate('sync', *sync_args) if not response: return False - elif response.status == HTTP_NOT_FOUND: # completely missing, rsync + return self._handle_sync_response(node, response, info, broker, http, + different_region=different_region) + + def _handle_sync_response(self, node, response, info, broker, http, + different_region=False): + if response.status == HTTP_NOT_FOUND: # completely missing, rsync self.stats['rsync'] += 1 self.logger.increment('rsyncs') - return self._rsync_db(broker, node, http, info['id']) + return self._rsync_db(broker, node, http, info['id'], + different_region=different_region) elif response.status == HTTP_INSUFFICIENT_STORAGE: raise DriveNotMounted() - elif response.status >= 200 and response.status < 300: - rinfo = simplejson.loads(response.data) + elif 200 <= response.status < 300: + rinfo = json.loads(response.data) local_sync = broker.get_sync(rinfo['id'], incoming=False) - if self._in_sync(rinfo, info, broker, local_sync): - return True - # if the difference in rowids between the two differs by - # more than 50%, rsync then do a remote merge. - if rinfo['max_row'] / float(info['max_row']) < 0.5: - self.stats['remote_merge'] += 1 - self.logger.increment('remote_merges') - return self._rsync_db(broker, node, http, info['id'], - replicate_method='rsync_then_merge', - replicate_timeout=(info['count'] / 2000)) - # else send diffs over to the remote server - return self._usync_db(max(rinfo['point'], local_sync), - broker, http, rinfo['id'], info['id']) + if rinfo.get('metadata', ''): + broker.update_metadata(json.loads(rinfo['metadata'])) + return self._choose_replication_mode( + node, rinfo, info, local_sync, broker, http, + different_region) + return False + + def _choose_replication_mode(self, node, rinfo, info, local_sync, broker, + http, different_region): + if self._in_sync(rinfo, info, broker, local_sync): + self.db_logger.debug( + broker, + 'in sync with %(ip)s:%(port)s/%(device)s, ' + 'nothing to do', node) + return True + + # if the difference in rowids between the two differs by + # more than 50% and the difference is greater than per_diff, + # rsync then do a remote merge. + # NOTE: difference > per_diff stops us from dropping to rsync + # on smaller containers, who have only a few rows to sync. 
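+        # Illustrative numbers (not from the patch), assuming per_diff=1000:
+        # a local max_row of 100000 against a remote max_row of 40000 gives a
+        # ratio of 0.4 (< 0.5) with a 60000-row gap (> per_diff), so the whole
+        # db is rsynced and merged; a local max_row of 1500 against a remote
+        # 600 also gives ratio 0.4, but the 900-row gap is under per_diff, so
+        # the small db still goes through _usync_db.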
+ if (rinfo['max_row'] / float(info['max_row']) < 0.5 and + info['max_row'] - rinfo['max_row'] > self.per_diff): + self.stats['remote_merge'] += 1 + self.logger.increment('remote_merges') + return self._rsync_db(broker, node, http, info['id'], + replicate_method='rsync_then_merge', + replicate_timeout=(info['count'] / 2000), + different_region=different_region) + # else send diffs over to the remote server + return self._usync_db(max(rinfo['point'], local_sync), + broker, http, rinfo['id'], info['id']) + + def _post_replicate_hook(self, broker, info, responses): + """ + :param broker: broker instance for the database that just replicated + :param info: pre-replication full info dict + :param responses: a list of bools indicating success from nodes + """ + pass + + def cleanup_post_replicate(self, broker, orig_info, responses): + """ + Cleanup non primary database from disk if needed. + + :param broker: the broker for the database we're replicating + :param orig_info: snapshot of the broker replication info dict taken + before replication + :param responses: a list of boolean success values for each replication + request to other nodes + + :return success: returns False if deletion of the database was + attempted but unsuccessful, otherwise returns True. + """ + log_template = 'Not deleting db (%s)' + max_row_delta = broker.get_max_row() - orig_info['max_row'] + if max_row_delta < 0: + reason = 'negative max_row_delta: %s' % max_row_delta + self.db_logger.error(broker, log_template, reason) + return True + if max_row_delta: + reason = '%s new rows' % max_row_delta + self.db_logger.debug(broker, log_template, reason) + return True + if self.handoff_delete: + # delete handoff if we have had handoff_delete successes + successes_count = len([resp for resp in responses if resp]) + delete_handoff = successes_count >= self.handoff_delete + else: + delete_handoff = responses and all(responses) + if not delete_handoff: + reason = '%s/%s success' % (responses.count(True), len(responses)) + self.db_logger.debug(broker, log_template, reason) + return True + # If the db has been successfully synced to all of its peers, it can be + # removed. Callers should have already checked that the db is not on a + # primary node. + if not self.delete_db(broker): + self.db_logger.debug(broker, 'Failed to delete db') + return False + self.db_logger.debug(broker, 'Successfully deleted db') + return True + + def _reclaim_tmp_dbs(self, broker, now): + fnames = listdir(broker.db_dir) + fnames = [os.path.join(broker.db_dir, fname) for fname in fnames + if fname.endswith('.tmp')] + unlink_paths_older_than(fnames, now - self.reclaim_age) + + def _reclaim(self, broker, now=None): + if not now: + now = time.time() + self._reclaim_tmp_dbs(broker, now) + return broker.reclaim(now - self.reclaim_age, + now - (self.reclaim_age * 2)) def _replicate_object(self, partition, object_file, node_id): """ @@ -400,90 +664,163 @@ def _replicate_object(self, partition, object_file, node_id): :param partition: partition to be replicated to :param object_file: DB file name to be replicated - :param node_id: node id of the node to be replicated to + :param node_id: node id of the node to be replicated from + :returns: a tuple (success, responses). ``success`` is a boolean that + is True if the method completed successfully, False otherwise. + ``responses`` is a list of booleans each of which indicates the + success or not of replicating to a peer node if replication has + been attempted. 
``success`` is False if any of ``responses`` is + False; when ``responses`` is empty, ``success`` may be either True + or False. """ - start_time = time.time() - self.logger.debug(_('Replicating db %s'), object_file) + start_time = now = time.time() + self.logger.debug('Replicating db %s', object_file) self.stats['attempted'] += 1 self.logger.increment('attempts') + shouldbehere = True + responses = [] + broker = None try: - broker = self.brokerclass(object_file, pending_timeout=30) - broker.reclaim(time.time() - self.reclaim_age, - time.time() - (self.reclaim_age * 2)) + broker = self.brokerclass(object_file, pending_timeout=30, + logger=self.logger) + self._reclaim(broker, now) info = broker.get_replication_info() - full_info = broker.get_info() - except (Exception, Timeout), e: + bpart = self.ring.get_part( + info['account'], info.get('container')) + if bpart != int(partition): + partition = bpart + # Important to set this false here since the later check only + # checks if it's on the proper device, not partition. + shouldbehere = False + self.db_logger.error( + broker, + 'Found db that should be on partition %s; will ' + 'replicate out and remove' % bpart) + except (Exception, Timeout) as e: if 'no such table' in str(e): - self.logger.error(_('Quarantining DB %s'), object_file) + if broker is None: + self.logger.error('Quarantining DB %s', object_file) + else: + self.db_logger.error(broker, 'Quarantining DB') quarantine_db(broker.db_file, broker.db_type) else: - self.logger.exception(_('ERROR reading db %s'), object_file) - self.stats['failure'] += 1 + if broker is None: + self.logger.exception('ERROR reading db from %s', + object_file) + else: + self.db_logger.exception(broker, 'ERROR reading db') + nodes = self.ring.get_part_nodes(int(partition)) + self._add_failure_stats([(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in nodes]) self.logger.increment('failures') - return - # The db is considered deleted if the delete_timestamp value is greater - # than the put_timestamp, and there are no objects. - delete_timestamp = 0 - try: - delete_timestamp = float(info['delete_timestamp']) - except ValueError: - pass - put_timestamp = 0 - try: - put_timestamp = float(info['put_timestamp']) - except ValueError: - pass - if delete_timestamp < (time.time() - self.reclaim_age) and \ - delete_timestamp > put_timestamp and \ - info['count'] in (None, '', 0, '0'): - if self.report_up_to_date(full_info): - self.delete_db(object_file) + return False, responses + if broker.is_reclaimable(now, self.reclaim_age): + if self.report_up_to_date(info): + self.delete_db(broker) self.logger.timing_since('timing', start_time) - return - responses = [] + return True, responses + failure_devs_info = set() nodes = self.ring.get_part_nodes(int(partition)) - shouldbehere = bool([n for n in nodes if n['id'] == node_id]) + local_dev = None + for node in nodes: + if node['id'] == node_id: + local_dev = node + break + if shouldbehere: + shouldbehere = bool([n for n in nodes if n['id'] == node_id]) # See Footnote [1] for an explanation of the repl_nodes assignment. 
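        # Illustrative example (not from the patch): with primary nodes
        # [n0, n1, n2] and node_id matching n1, the loop below stops at i == 1,
        # so repl_nodes is [n2, n0]: the primaries after the local node,
        # followed by the ones before it.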
- i = 0 - while i < len(nodes) and nodes[i]['id'] != node_id: - i += 1 - repl_nodes = nodes[i + 1:] + nodes[:i] + if len(nodes) > 1: + i = 0 + while i < len(nodes) and nodes[i]['id'] != node_id: + i += 1 + repl_nodes = nodes[i + 1:] + nodes[:i] + else: # Special case if using only a single replica + repl_nodes = nodes more_nodes = self.ring.get_more_nodes(int(partition)) + if not local_dev: + # Check further if local device is a handoff node + for node in self.ring.get_more_nodes(int(partition)): + if node['id'] == node_id: + local_dev = node + break for node in repl_nodes: + different_region = False + if local_dev and local_dev['region'] != node['region']: + # This additional information will help later if we + # want to handle syncing to a node in different + # region with some optimizations. + different_region = True success = False try: - success = self._repl_to_node(node, broker, partition, info) + success = self._repl_to_node(node, broker, partition, info, + different_region) except DriveNotMounted: - repl_nodes.append(more_nodes.next()) - self.logger.error(_('ERROR Remote drive not mounted %s'), node) + try: + repl_nodes.append(next(more_nodes)) + except StopIteration: + self.db_logger.error( + broker, + 'ERROR There are not enough handoff nodes to reach ' + 'replica count for partition %s', + partition) + self.db_logger.error( + broker, + 'ERROR Remote drive not mounted %s', node) except (Exception, Timeout): - self.logger.exception(_('ERROR syncing %(file)s with node' - ' %(node)s'), - {'file': object_file, 'node': node}) - self.stats['success' if success else 'failure'] += 1 + self.db_logger.exception( + broker, "ERROR syncing with %s", node) + + if not success: + failure_devs_info.add((node['replication_ip'], node['device'])) self.logger.increment('successes' if success else 'failures') responses.append(success) - if not shouldbehere and all(responses): - # If the db shouldn't be on this node and has been successfully - # synced to all of its peers, it can be removed. - self.delete_db(object_file) + try: + self._post_replicate_hook(broker, info, responses) + except (Exception, Timeout): + self.db_logger.exception( + broker, 'UNHANDLED EXCEPTION: in post replicate hook') + if not shouldbehere: + if not self.cleanup_post_replicate(broker, info, responses): + failure_devs_info.update( + [(failure_dev['replication_ip'], failure_dev['device']) + for failure_dev in repl_nodes]) + target_devs_info = set([(target_dev['replication_ip'], + target_dev['device']) + for target_dev in repl_nodes]) + self.stats['success'] += len(target_devs_info - failure_devs_info) + self._add_failure_stats(failure_devs_info) + self.logger.timing_since('timing', start_time) + if shouldbehere: + responses.append(True) + return all(responses), responses - def delete_db(self, object_file): + def delete_db(self, broker): + object_file = broker.db_file hash_dir = os.path.dirname(object_file) suf_dir = os.path.dirname(hash_dir) with lock_parent_directory(object_file): shutil.rmtree(hash_dir, True) - try: - os.rmdir(suf_dir) - except OSError, err: - if err.errno not in (errno.ENOENT, errno.ENOTEMPTY): - self.logger.exception( - _('ERROR while trying to clean up %s') % suf_dir) self.stats['remove'] += 1 device_name = self.extract_device(object_file) self.logger.increment('removes.' 
+ device_name) + for parent_dir in (suf_dir, os.path.dirname(suf_dir)): + try: + os.rmdir(parent_dir) + except OSError as err: + if err.errno == errno.ENOTEMPTY: + break + elif err.errno == errno.ENOENT: + continue + else: + self.db_logger.exception( + broker, + 'ERROR while trying to clean up %s', parent_dir) + return False + return True + def extract_device(self, object_file): """ Extract the device name from an object path. Returns "UNKNOWN" if the @@ -496,36 +833,98 @@ def extract_device(self, object_file): return match.groups()[0] return "UNKNOWN" + def _partition_dir_filter(self, device_id, partitions_to_replicate): + + def filt(partition_dir): + partition = int(partition_dir) + if self.handoffs_only: + primary_node_ids = [ + d['id'] for d in self.ring.get_part_nodes(partition)] + if device_id in primary_node_ids: + return False + + if partition not in partitions_to_replicate: + return False + + return True + + return filt + def report_up_to_date(self, full_info): return True + def roundrobin_datadirs(self, dirs): + return RateLimitedIterator( + roundrobin_datadirs(dirs), + elements_per_second=self.databases_per_second) + def run_once(self, *args, **kwargs): """Run a replication pass once.""" + override_options = parse_override_options(once=True, **kwargs) + + devices_to_replicate = override_options.devices or Everything() + partitions_to_replicate = override_options.partitions or Everything() + self._zero_stats() dirs = [] - ips = whataremyips() + ips = whataremyips(self.bind_ip) if not ips: - self.logger.error(_('ERROR Failed to get my own IPs?')) + self.logger.error('ERROR Failed to get my own IPs?') return + + if self.handoffs_only or self.handoff_delete: + self.logger.warning( + 'Starting replication pass with handoffs_only ' + 'and/or handoffs_delete enabled. 
' + 'These modes are not intended for normal ' + 'operation; use these options with care.') + + self._local_device_ids = {} + found_local = False for node in self.ring.devs: - if node and node['ip'] in ips and node['port'] == self.port: - if self.mount_check and not os.path.ismount( - os.path.join(self.root, node['device'])): - self.logger.warn( - _('Skipping %(device)s as it is not mounted') % node) + if node and is_local_device(ips, self.port, + node['replication_ip'], + node['replication_port']): + found_local = True + try: + dev_path = check_drive(self.root, node['device'], + self.mount_check) + except ValueError as err: + self._add_failure_stats( + [(failure_dev['replication_ip'], + failure_dev['device']) + for failure_dev in self.ring.devs if failure_dev]) + self.logger.warning('Skipping: %s', err) + continue + if node['device'] not in devices_to_replicate: + self.logger.debug( + 'Skipping device %s due to given arguments', + node['device']) continue unlink_older_than( - os.path.join(self.root, node['device'], 'tmp'), + os.path.join(dev_path, 'tmp'), time.time() - self.reclaim_age) datadir = os.path.join(self.root, node['device'], self.datadir) if os.path.isdir(datadir): - dirs.append((datadir, node['id'])) - self.logger.info(_('Beginning replication run')) - for part, object_file, node_id in roundrobin_datadirs(dirs): + self._local_device_ids[node['id']] = node + part_filt = self._partition_dir_filter( + node['id'], partitions_to_replicate) + dirs.append((datadir, node['id'], part_filt)) + if not found_local: + self.logger.error("Can't find itself %s with port %s in ring " + "file, not replicating", + ", ".join(ips), self.port) + self.logger.info('Beginning replication run') + for part, object_file, node_id in self.roundrobin_datadirs(dirs): self.cpool.spawn_n( self._replicate_object, part, object_file, node_id) self.cpool.waitall() - self.logger.info(_('Replication run OVER')) + self.logger.info('Replication run OVER') + if self.handoffs_only or self.handoff_delete: + self.logger.warning( + 'Finished replication pass with handoffs_only and/or ' + 'handoffs_delete enabled. 
If these are no longer required, ' + 'disable them.') self._report_stats() def run_forever(self, *args, **kwargs): @@ -538,7 +937,7 @@ def run_forever(self, *args, **kwargs): try: self.run_once() except (Exception, Timeout): - self.logger.exception(_('ERROR trying to replicate')) + self.logger.exception('ERROR trying to replicate') elapsed = time.time() - begin if elapsed < self.interval: sleep(self.interval - elapsed) @@ -554,16 +953,21 @@ def __init__(self, root, datadir, broker_class, mount_check=True, self.broker_class = broker_class self.mount_check = mount_check self.logger = logger or get_logger({}, log_route='replicator-rpc') + self.db_logger = BrokerAnnotatedLogger(logger=self.logger) + + def _db_file_exists(self, db_path): + return os.path.exists(db_path) def dispatch(self, replicate_args, args): if not hasattr(args, 'pop'): return HTTPBadRequest(body='Invalid object type') op = args.pop(0) drive, partition, hsh = replicate_args - if self.mount_check and \ - not os.path.ismount(os.path.join(self.root, drive)): + try: + dev_path = check_drive(self.root, drive, self.mount_check) + except ValueError: return Response(status='507 %s is not mounted' % drive) - db_file = os.path.join(self.root, drive, + db_file = os.path.join(dev_path, storage_directory(self.datadir, partition, hsh), hsh + '.db') if op == 'rsync_then_merge': @@ -574,59 +978,102 @@ def dispatch(self, replicate_args, args): # someone might be about to rsync a db to us, # make sure there's a tmp dir to receive it. mkdirs(os.path.join(self.root, drive, 'tmp')) - if not os.path.exists(db_file): + if not self._db_file_exists(db_file): return HTTPNotFound() - return getattr(self, op)(self.broker_class(db_file), args) + return getattr(self, op)( + self.broker_class(db_file, logger=self.logger), args) - def sync(self, broker, args): - (remote_sync, hash_, id_, created_at, put_timestamp, - delete_timestamp, metadata) = args + @contextmanager + def debug_timing(self, name): timemark = time.time() - try: - info = broker.get_replication_info() - except (Exception, Timeout), e: - if 'no such table' in str(e): - self.logger.error(_("Quarantining DB %s") % broker.db_file) - quarantine_db(broker.db_file, broker.db_type) - return HTTPNotFound() - raise + yield timespan = time.time() - timemark if timespan > DEBUG_TIMINGS_THRESHOLD: - self.logger.debug(_('replicator-rpc-sync time for info: %.02fs') % - timespan) + self.logger.debug( + 'replicator-rpc-sync time for %s: %.02fs' % ( + name, timespan)) + + def _parse_sync_args(self, args): + """ + Convert remote sync args to remote_info dictionary. 
+ """ + (remote_sync, hash_, id_, created_at, put_timestamp, + delete_timestamp, metadata) = args[:7] + remote_metadata = {} if metadata: - timemark = time.time() - broker.update_metadata(simplejson.loads(metadata)) - timespan = time.time() - timemark - if timespan > DEBUG_TIMINGS_THRESHOLD: - self.logger.debug(_('replicator-rpc-sync time for ' - 'update_metadata: %.02fs') % timespan) - if info['put_timestamp'] != put_timestamp or \ - info['created_at'] != created_at or \ - info['delete_timestamp'] != delete_timestamp: - timemark = time.time() - broker.merge_timestamps( - created_at, put_timestamp, delete_timestamp) - timespan = time.time() - timemark - if timespan > DEBUG_TIMINGS_THRESHOLD: - self.logger.debug(_('replicator-rpc-sync time for ' - 'merge_timestamps: %.02fs') % timespan) - timemark = time.time() - info['point'] = broker.get_sync(id_) - timespan = time.time() - timemark - if timespan > DEBUG_TIMINGS_THRESHOLD: - self.logger.debug(_('replicator-rpc-sync time for get_sync: ' - '%.02fs') % timespan) - if hash_ == info['hash'] and info['point'] < remote_sync: - timemark = time.time() - broker.merge_syncs([{'remote_id': id_, - 'sync_point': remote_sync}]) - info['point'] = remote_sync - timespan = time.time() - timemark - if timespan > DEBUG_TIMINGS_THRESHOLD: - self.logger.debug(_('replicator-rpc-sync time for ' - 'merge_syncs: %.02fs') % timespan) - return Response(simplejson.dumps(info)) + try: + remote_metadata = json.loads(metadata) + except ValueError: + self.logger.error("Unable to decode remote metadata %r", + metadata) + remote_info = { + 'point': remote_sync, + 'hash': hash_, + 'id': id_, + 'created_at': created_at, + 'put_timestamp': put_timestamp, + 'delete_timestamp': delete_timestamp, + 'metadata': remote_metadata, + } + return remote_info + + def sync(self, broker, args): + remote_info = self._parse_sync_args(args) + return self._handle_sync_request(broker, remote_info) + + def _get_synced_replication_info(self, broker, remote_info): + """ + Apply any changes to the broker based on remote_info and return the + current replication info. + + :param broker: the database broker + :param remote_info: the remote replication info + + :returns: local broker replication info + """ + return broker.get_replication_info() + + def _handle_sync_request(self, broker, remote_info): + """ + Update metadata, timestamps, sync points. + """ + with self.debug_timing('info'): + try: + info = self._get_synced_replication_info(broker, remote_info) + except (Exception, Timeout) as e: + if 'no such table' in str(e): + self.db_logger.error(broker, "Quarantining DB", ) + quarantine_db(broker.db_file, broker.db_type) + return HTTPNotFound() + raise + # TODO(mattoliverau) At this point in the RPC, we have the callers + # replication info and ours, so it would be cool to be able to make + # an educated guess here on the size of the incoming replication (maybe + # average object table row size * difference in ROWIDs or something) + # and the fallocate_reserve setting so we could return a 507. + # This would make db fallocate_reserve more or less on par with the + # object's. 
+ if remote_info['metadata']: + with self.debug_timing('update_metadata'): + broker.update_metadata(remote_info['metadata']) + sync_timestamps = ('created_at', 'put_timestamp', 'delete_timestamp') + if any(info[ts] != remote_info[ts] for ts in sync_timestamps): + with self.debug_timing('merge_timestamps'): + broker.merge_timestamps(*(remote_info[ts] for ts in + sync_timestamps)) + with self.debug_timing('get_sync'): + info['point'] = broker.get_sync(remote_info['id']) + if remote_info['hash'] == info['hash'] and \ + info['point'] < remote_info['point']: + with self.debug_timing('merge_syncs'): + translate = { + 'remote_id': 'id', + 'sync_point': 'point', + } + data = dict((k, remote_info[v]) for k, v in translate.items()) + broker.merge_syncs([data]) + info['point'] = remote_info['point'] + return Response(json.dumps(info)) def merge_syncs(self, broker, args): broker.merge_syncs(args[0]) @@ -638,21 +1085,32 @@ def merge_items(self, broker, args): def complete_rsync(self, drive, db_file, args): old_filename = os.path.join(self.root, drive, 'tmp', args[0]) + if args[1:]: + db_file = os.path.join(os.path.dirname(db_file), args[1]) if os.path.exists(db_file): return HTTPNotFound() if not os.path.exists(old_filename): return HTTPNotFound() - broker = self.broker_class(old_filename) + broker = self.broker_class(old_filename, logger=self.logger) broker.newid(args[0]) renamer(old_filename, db_file) return HTTPNoContent() + def _abort_rsync_then_merge(self, db_file, tmp_filename): + return not (self._db_file_exists(db_file) and + os.path.exists(tmp_filename)) + + def _post_rsync_then_merge_hook(self, existing_broker, new_broker): + # subclasses may override to make custom changes to the new broker + pass + def rsync_then_merge(self, drive, db_file, args): - old_filename = os.path.join(self.root, drive, 'tmp', args[0]) - if not os.path.exists(db_file) or not os.path.exists(old_filename): + tmp_filename = os.path.join(self.root, drive, 'tmp', args[0]) + if self._abort_rsync_then_merge(db_file, tmp_filename): return HTTPNotFound() - new_broker = self.broker_class(old_filename) - existing_broker = self.broker_class(db_file) + new_broker = self.broker_class(tmp_filename, logger=self.logger) + existing_broker = self.broker_class(db_file, logger=self.logger) + db_file = existing_broker.db_file point = -1 objects = existing_broker.get_items_since(point, 1000) while len(objects): @@ -660,8 +1118,13 @@ def rsync_then_merge(self, drive, db_file, args): point = objects[-1]['ROWID'] objects = existing_broker.get_items_since(point, 1000) sleep() + new_broker.merge_syncs(existing_broker.get_syncs()) + self._post_rsync_then_merge_hook(existing_broker, new_broker) new_broker.newid(args[0]) - renamer(old_filename, db_file) + new_broker.update_metadata(existing_broker.metadata) + if self._abort_rsync_then_merge(db_file, tmp_filename): + return HTTPNotFound() + renamer(tmp_filename, db_file) return HTTPNoContent() # Footnote [1]: diff --git a/swift/common/digest.py b/swift/common/digest.py new file mode 100644 index 0000000000..36bc47a9bd --- /dev/null +++ b/swift/common/digest.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 NVIDIA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import binascii +import hmac + +from swift.common.utils import strict_b64decode + + +DEFAULT_ALLOWED_DIGESTS = 'sha1 sha256 sha512' +DEPRECATED_DIGESTS = {'sha1'} +SUPPORTED_DIGESTS = set(DEFAULT_ALLOWED_DIGESTS.split()) | DEPRECATED_DIGESTS + + +def get_hmac(request_method, path, expires, key, digest="sha1", + ip_range=None): + """ + Returns the hexdigest string of the HMAC (see RFC 2104) for + the request. + + :param request_method: Request method to allow. + :param path: The path to the resource to allow access to. + :param expires: Unix timestamp as an int for when the URL + expires. + :param key: HMAC shared secret. + :param digest: constructor or the string name for the digest to use in + calculating the HMAC + Defaults to SHA1 + :param ip_range: The ip range from which the resource is allowed + to be accessed. We need to put the ip_range as the + first argument to hmac to avoid manipulation of the path + due to newlines being valid in paths + e.g. /v1/a/c/o\\n127.0.0.1 + :returns: hexdigest str of the HMAC for the request using the specified + digest algorithm. + """ + # These are the three mandatory fields. + parts = [request_method, str(expires), path] + formats = [b"%s", b"%s", b"%s"] + + if ip_range: + parts.insert(0, ip_range) + formats.insert(0, b"ip=%s") + + if isinstance(key, str): + key = key.encode('utf8') + + message = b'\n'.join( + fmt % (part if isinstance(part, bytes) + else part.encode("utf-8")) + for fmt, part in zip(formats, parts)) + + return hmac.new(key, message, digest).hexdigest() + + +def get_allowed_digests(conf_digests, logger=None): + """ + Pulls out 'allowed_digests' from the supplied conf. Then compares them with + the list of supported and deprecated digests and returns whatever remain. + + When something is unsupported or deprecated it'll log a warning. + + :param conf_digests: iterable of allowed digests. If empty, defaults to + DEFAULT_ALLOWED_DIGESTS. + :param logger: optional logger; if provided, use it issue deprecation + warnings + :returns: A set of allowed digests that are supported and a set of + deprecated digests. + :raises: ValueError, if there are no digests left to return. + """ + allowed_digests = set(digest.lower() for digest in conf_digests) + if not allowed_digests: + allowed_digests = SUPPORTED_DIGESTS + + not_supported = allowed_digests - SUPPORTED_DIGESTS + if not_supported: + if logger: + logger.warning('The following digest algorithms are configured ' + 'but not supported: %s', ', '.join(not_supported)) + allowed_digests -= not_supported + deprecated = allowed_digests & DEPRECATED_DIGESTS + if deprecated and logger: + if not conf_digests: + logger.warning('The following digest algorithms are allowed by ' + 'default but deprecated: %s. Support will be ' + 'disabled by default in a future release, and ' + 'later removed entirely.', ', '.join(deprecated)) + else: + logger.warning('The following digest algorithms are configured ' + 'but deprecated: %s. 
Support will be removed in a ' + 'future release.', ', '.join(deprecated)) + if not allowed_digests: + raise ValueError('No valid digest algorithms are configured') + + return allowed_digests, deprecated + + +def extract_digest_and_algorithm(value): + """ + Returns a tuple of (digest_algorithm, hex_encoded_digest) + from a client-provided string of the form:: + + + + or:: + + : + + Note that hex-encoded strings must use one of sha1, sha256, or sha512. + + :raises: ValueError on parse failures + """ + if ':' in value: + algo, value = value.split(':', 1) + # accept both standard and url-safe base64 + if ('-' in value or '_' in value) and not ( + '+' in value or '/' in value): + value = value.replace('-', '+').replace('_', '/') + value = binascii.hexlify( + strict_b64decode(value + '==')).decode('ascii') + else: + binascii.unhexlify(value) # make sure it decodes + algo = { + 40: 'sha1', + 64: 'sha256', + 128: 'sha512', + }.get(len(value)) + if not algo: + raise ValueError('Bad digest length') + return algo, value diff --git a/swift/common/direct_client.py b/swift/common/direct_client.py index 4078ec3348..94a4558384 100644 --- a/swift/common/direct_client.py +++ b/swift/common/direct_client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,80 +18,269 @@ through the proxy. """ +import json +import os import socket -from httplib import HTTPException -from time import time -from urllib import quote as _quote from eventlet import sleep, Timeout - -from swift.common.bufferedhttp import http_connect -from swiftclient import ClientException, json_loads -from swift.common.utils import normalize_timestamp +import pickle # nosec: B403 +from http.client import HTTPException + +from swift.common.bufferedhttp import http_connect, http_connect_raw +from swift.common.exceptions import ClientException +from swift.common.request_helpers import USE_REPLICATION_NETWORK_HEADER, \ + get_ip_port +from swift.common.swob import normalize_etag +from swift.common.utils import Timestamp, FileLikeIter, quote from swift.common.http import HTTP_NO_CONTENT, HTTP_INSUFFICIENT_STORAGE, \ is_success, is_server_error +from swift.common.header_key_dict import HeaderKeyDict + + +class DirectClientException(ClientException): + + def __init__(self, stype, method, node, part, path, resp, host=None): + # host can be used to override the node ip and port reported in + # the exception + host = host if host is not None else node + if isinstance(path, bytes): + path = path.decode("utf-8") + full_path = quote('/%s/%s%s' % (node['device'], part, path)) + msg = '%s server %s:%s direct %s %r gave status %s' % ( + stype, host['ip'], host['port'], method, full_path, resp.status) + headers = HeaderKeyDict(resp.getheaders()) + super(DirectClientException, self).__init__( + msg, http_host=host['ip'], http_port=host['port'], + http_device=node['device'], http_status=resp.status, + http_reason=resp.reason, http_headers=headers) -def quote(value, safe='/'): - if isinstance(value, unicode): - value = value.encode('utf8') - return _quote(value, safe) +class DirectClientReconException(ClientException): + def __init__(self, method, node, path, resp): + if isinstance(path, bytes): + path = path.decode("utf-8") + msg = 'server %s:%s direct %s %r gave status %s' % ( + node['ip'], node['port'], method, path, resp.status) + headers = 
HeaderKeyDict(resp.getheaders()) + super(DirectClientReconException, self).__init__( + msg, http_host=node['ip'], http_port=node['port'], + http_status=resp.status, http_reason=resp.reason, + http_headers=headers) -def direct_get_account(node, part, account, marker=None, limit=None, - prefix=None, delimiter=None, conn_timeout=5, - response_timeout=15): + +def _make_path(*components): + return u'/' + u'/'.join( + x.decode('utf-8') if isinstance(x, bytes) else x + for x in components) + + +def _make_req(node, part, method, path, headers, stype, + conn_timeout=5, response_timeout=15, send_timeout=15, + contents=None, content_length=None, chunk_size=65535): """ - Get listings directly from the account server. + Make request to backend storage node. + (i.e. 'Account', 'Container', 'Object') + :param node: a node dict from a ring + :param part: an integer, the partition number + :param method: a string, the HTTP method (e.g. 'PUT', 'DELETE', etc) + :param path: a string, the request path + :param headers: a dict, header name => value + :param stype: a string, describing the type of service + :param conn_timeout: timeout while waiting for connection; default is 5 + seconds + :param response_timeout: timeout while waiting for response; default is 15 + seconds + :param send_timeout: timeout for sending request body; default is 15 + seconds + :param contents: an iterable or string to read object data from + :param content_length: value to send as content-length header + :param chunk_size: if defined, chunk size of data to send + :returns: an HTTPResponse object + :raises DirectClientException: if the response status is not 2xx + :raises eventlet.Timeout: if either conn_timeout or response_timeout is + exceeded + """ + if contents is not None: + if content_length is not None: + headers['Content-Length'] = str(content_length) + else: + for n, v in headers.items(): + if n.lower() == 'content-length': + content_length = int(v) + if not contents: + headers['Content-Length'] = '0' + if isinstance(contents, str): + contents = [contents] + if content_length is None: + headers['Transfer-Encoding'] = 'chunked' + + ip, port = get_ip_port(node, headers) + headers.setdefault('X-Backend-Allow-Reserved-Names', 'true') + with Timeout(conn_timeout): + conn = http_connect(ip, port, node['device'], part, + method, path, headers=headers) + + if contents is not None: + contents_f = FileLikeIter(contents) + + with Timeout(send_timeout): + if content_length is None: + chunk = contents_f.read(chunk_size) + while chunk: + conn.send(b'%x\r\n%s\r\n' % (len(chunk), chunk)) + chunk = contents_f.read(chunk_size) + conn.send(b'0\r\n\r\n') + else: + left = content_length + while left > 0: + size = chunk_size + if size > left: + size = left + chunk = contents_f.read(size) + if not chunk: + break + conn.send(chunk) + left -= len(chunk) - :param node: node dictionary from the ring - :param part: partition the account is on - :param account: account name - :param marker: marker query - :param limit: query limit - :param prefix: prefix query - :param delimeter: delimeter for the query - :param conn_timeout: timeout in seconds for establishing the connection - :param response_timeout: timeout in seconds for getting the response - :returns: a tuple of (response headers, a list of containers) The response - headers will be a dict and all header names will be lowercase. 
+ with Timeout(response_timeout): + resp = conn.getresponse() + resp.read() + if not is_success(resp.status): + raise DirectClientException(stype, method, node, part, path, resp) + return resp + + +def _get_direct_account_container(path, stype, node, part, + marker=None, limit=None, + prefix=None, delimiter=None, + conn_timeout=5, response_timeout=15, + end_marker=None, reverse=None, headers=None, + extra_params=None): + """Base function for get direct account and container. + + Do not use directly use the direct_get_account or + direct_get_container instead. """ - path = '/' + account - qs = 'format=json' + if headers is None: + headers = {} + + params = {'format': 'json'} + if extra_params: + for key, value in extra_params.items(): + if value is not None: + params[key] = value if marker: - qs += '&marker=%s' % quote(marker) + if 'marker' in params: + raise TypeError('duplicate values for keyword arg: marker') + params['marker'] = quote(marker) if limit: - qs += '&limit=%d' % limit + if 'limit' in params: + raise TypeError('duplicate values for keyword arg: limit') + params['limit'] = '%d' % limit if prefix: - qs += '&prefix=%s' % quote(prefix) + if 'prefix' in params: + raise TypeError('duplicate values for keyword arg: prefix') + params['prefix'] = quote(prefix) if delimiter: - qs += '&delimiter=%s' % quote(delimiter) + if 'delimiter' in params: + raise TypeError('duplicate values for keyword arg: delimiter') + params['delimiter'] = quote(delimiter) + if end_marker: + if 'end_marker' in params: + raise TypeError('duplicate values for keyword arg: end_marker') + params['end_marker'] = quote(end_marker) + if reverse: + if 'reverse' in params: + raise TypeError('duplicate values for keyword arg: reverse') + params['reverse'] = quote(reverse) + qs = '&'.join('%s=%s' % (k, v) for k, v in params.items()) + + ip, port = get_ip_port(node, headers) with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'GET', path, query_string=qs) + conn = http_connect(ip, port, node['device'], part, + 'GET', path, query_string=qs, + headers=gen_headers(hdrs_in=headers)) with Timeout(response_timeout): resp = conn.getresponse() if not is_success(resp.status): resp.read() - raise ClientException( - 'Account server %s:%s direct GET %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) - resp_headers = {} + raise DirectClientException(stype, 'GET', node, part, path, resp) + + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): - resp_headers[header.lower()] = value + resp_headers[header] = value if resp.status == HTTP_NO_CONTENT: resp.read() return resp_headers, [] - return resp_headers, json_loads(resp.read()) + return resp_headers, json.loads(resp.read()) + + +def gen_headers(hdrs_in=None, add_ts=True): + """ + Get the headers ready for a request. All requests should have a User-Agent + string, but if one is passed in don't over-write it. Not all requests will + need an X-Timestamp, but if one is passed in do not over-write it. 
+ + :param headers: dict or None, base for HTTP headers + :param add_ts: boolean, should be True for any "unsafe" HTTP request + + :returns: HeaderKeyDict based on headers and ready for the request + """ + hdrs_out = HeaderKeyDict(hdrs_in) if hdrs_in else HeaderKeyDict() + if add_ts and 'X-Timestamp' not in hdrs_out: + hdrs_out['X-Timestamp'] = Timestamp.now().internal + if 'user-agent' not in hdrs_out: + hdrs_out['User-Agent'] = 'direct-client %s' % os.getpid() + hdrs_out.setdefault('X-Backend-Allow-Reserved-Names', 'true') + return hdrs_out + + +def direct_get_account(node, part, account, marker=None, limit=None, + prefix=None, delimiter=None, conn_timeout=5, + response_timeout=15, end_marker=None, reverse=None, + headers=None): + """ + Get listings directly from the account server. + + :param node: node dictionary from the ring + :param part: partition the account is on + :param account: account name + :param marker: marker query + :param limit: query limit + :param prefix: prefix query + :param delimiter: delimiter for the query + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param end_marker: end_marker query + :param reverse: reverse the returned listing + :returns: a tuple of (response headers, a list of containers) The response + headers will HeaderKeyDict. + """ + path = _make_path(account) + return _get_direct_account_container(path, "Account", node, part, + headers=headers, + marker=marker, + limit=limit, prefix=prefix, + delimiter=delimiter, + end_marker=end_marker, + reverse=reverse, + conn_timeout=conn_timeout, + response_timeout=response_timeout) + + +def direct_delete_account(node, part, account, conn_timeout=5, + response_timeout=15, headers=None): + if headers is None: + headers = {} + + path = _make_path(account) + _make_req(node, part, 'DELETE', path, gen_headers(headers, True), + 'Account', conn_timeout, response_timeout) def direct_head_container(node, part, account, container, conn_timeout=5, - response_timeout=15): + response_timeout=15, headers=None): """ Request container information directly from the container server. 
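(Not part of the patch.) A minimal usage sketch of the reworked account helpers above, assuming a SAIO-style account server on 127.0.0.1:6202; the node dict, partition number, account name and prefix are invented for illustration.

from swift.common.direct_client import direct_get_account

node = {'ip': '127.0.0.1', 'port': 6202, 'device': 'sdb1',
        'replication_ip': '127.0.0.1', 'replication_port': 6202}
part = 123  # normally looked up in the account ring

# The new end_marker/reverse/headers keyword args are folded into the query
# string by _get_direct_account_container(), and failures now surface as
# DirectClientException rather than hand-built ClientExceptions.
resp_headers, containers = direct_get_account(
    node, part, 'AUTH_test', prefix='images-', end_marker='images-zzz',
    headers={'X-Trans-Id': 'tx-example'})

print(resp_headers.get('X-Account-Container-Count'))
for entry in containers:
    print(entry['name'])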
@@ -101,34 +290,26 @@ def direct_head_container(node, part, account, container, conn_timeout=5, :param container: container name :param conn_timeout: timeout in seconds for establishing the connection :param response_timeout: timeout in seconds for getting the response - :returns: a dict containing the response's headers (all header names will - be lowercase) + :returns: a dict containing the response's headers in a HeaderKeyDict + :raises ClientException: HTTP HEAD request failed """ - path = '/%s/%s' % (account, container) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'HEAD', path) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise ClientException( - 'Container server %s:%s direct HEAD %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) - resp_headers = {} + if headers is None: + headers = {} + + path = _make_path(account, container) + resp = _make_req(node, part, 'HEAD', path, gen_headers(headers), + 'Container', conn_timeout, response_timeout) + + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): - resp_headers[header.lower()] = value + resp_headers[header] = value return resp_headers def direct_get_container(node, part, account, container, marker=None, limit=None, prefix=None, delimiter=None, - conn_timeout=5, response_timeout=15): + conn_timeout=5, response_timeout=15, end_marker=None, + reverse=None, headers=None, extra_params=None): """ Get container listings directly from the container server. @@ -139,68 +320,142 @@ def direct_get_container(node, part, account, container, marker=None, :param marker: marker query :param limit: query limit :param prefix: prefix query - :param delimeter: delimeter for the query + :param delimiter: delimiter for the query :param conn_timeout: timeout in seconds for establishing the connection :param response_timeout: timeout in seconds for getting the response + :param end_marker: end_marker query + :param reverse: reverse the returned listing + :param headers: headers to be included in the request + :param extra_params: a dict of extra parameters to be included in the + request. It can be used to pass additional parameters, e.g, + {'states':'updating'} can be used with shard_range/namespace listing. + It can also be used to pass the existing keyword args, like 'marker' or + 'limit', but if the same parameter appears twice in both keyword arg + (not None) and extra_params, this function will raise TypeError. :returns: a tuple of (response headers, a list of objects) The response - headers will be a dict and all header names will be lowercase. + headers will be a HeaderKeyDict. 
""" - path = '/%s/%s' % (account, container) - qs = 'format=json' - if marker: - qs += '&marker=%s' % quote(marker) - if limit: - qs += '&limit=%d' % limit - if prefix: - qs += '&prefix=%s' % quote(prefix) - if delimiter: - qs += '&delimiter=%s' % quote(delimiter) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'GET', path, query_string=qs) - with Timeout(response_timeout): - resp = conn.getresponse() - if not is_success(resp.status): - resp.read() - raise ClientException( - 'Container server %s:%s direct GET %s gave stats %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) - resp_headers = {} - for header, value in resp.getheaders(): - resp_headers[header.lower()] = value - if resp.status == HTTP_NO_CONTENT: - resp.read() - return resp_headers, [] - return resp_headers, json_loads(resp.read()) + path = _make_path(account, container) + return _get_direct_account_container(path, "Container", node, + part, marker=marker, + limit=limit, prefix=prefix, + delimiter=delimiter, + end_marker=end_marker, + reverse=reverse, + conn_timeout=conn_timeout, + response_timeout=response_timeout, + headers=headers, + extra_params=extra_params) def direct_delete_container(node, part, account, container, conn_timeout=5, - response_timeout=15, headers={}): - path = '/%s/%s' % (account, container) - headers['X-Timestamp'] = normalize_timestamp(time()) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, headers) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise ClientException( - 'Container server %s:%s direct DELETE %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) + response_timeout=15, headers=None): + """ + Delete container directly from the container server. + + :param node: node dictionary from the ring + :param part: partition the container is on + :param account: account name + :param container: container name + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: dict to be passed into HTTPConnection headers + :raises ClientException: HTTP DELETE request failed + """ + if headers is None: + headers = {} + + path = _make_path(account, container) + add_timestamp = 'x-timestamp' not in (k.lower() for k in headers) + _make_req(node, part, 'DELETE', path, gen_headers(headers, add_timestamp), + 'Container', conn_timeout, response_timeout) + + +def direct_put_container(node, part, account, container, conn_timeout=5, + response_timeout=15, headers=None, contents=None, + content_length=None, chunk_size=65535): + """ + Make a PUT request to a container server. 
+ + :param node: node dictionary from the ring + :param part: partition the container is on + :param account: account name + :param container: container name + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: additional headers to include in the request + :param contents: an iterable or string to send in request body (optional) + :param content_length: value to send as content-length header (optional) + :param chunk_size: chunk size of data to send (optional) + :raises ClientException: HTTP PUT request failed + """ + if headers is None: + headers = {} + + lower_headers = set(k.lower() for k in headers) + headers_out = gen_headers(headers, + add_ts='x-timestamp' not in lower_headers) + path = _make_path(account, container) + _make_req(node, part, 'PUT', path, headers_out, 'Container', conn_timeout, + response_timeout, contents=contents, + content_length=content_length, chunk_size=chunk_size) + + +def direct_post_container(node, part, account, container, conn_timeout=5, + response_timeout=15, headers=None): + """ + Make a POST request to a container server. + + :param node: node dictionary from the ring + :param part: partition the container is on + :param account: account name + :param container: container name + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: additional headers to include in the request + :raises ClientException: HTTP PUT request failed + """ + if headers is None: + headers = {} + + lower_headers = set(k.lower() for k in headers) + headers_out = gen_headers(headers, + add_ts='x-timestamp' not in lower_headers) + path = _make_path(account, container) + return _make_req(node, part, 'POST', path, headers_out, 'Container', + conn_timeout, response_timeout) + + +def direct_put_container_object(node, part, account, container, obj, + conn_timeout=5, response_timeout=15, + headers=None): + if headers is None: + headers = {} + + have_x_timestamp = 'x-timestamp' in (k.lower() for k in headers) + + path = _make_path(account, container, obj) + _make_req(node, part, 'PUT', path, + gen_headers(headers, add_ts=(not have_x_timestamp)), + 'Container', conn_timeout, response_timeout) + + +def direct_delete_container_object(node, part, account, container, obj, + conn_timeout=5, response_timeout=15, + headers=None): + if headers is None: + headers = {} + + headers = gen_headers(headers, add_ts='x-timestamp' not in ( + k.lower() for k in headers)) + + path = _make_path(account, container, obj) + _make_req(node, part, 'DELETE', path, headers, + 'Container', conn_timeout, response_timeout) def direct_head_object(node, part, account, container, obj, conn_timeout=5, - response_timeout=15): + response_timeout=15, headers=None): """ Request object information directly from the object server. 
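(Not part of the patch.) A sketch of how the X-Timestamp handling in the new container helpers above behaves, assuming a SAIO-style container server; the node dict, partition, account/container names and the metadata header are invented for the example.

from swift.common.direct_client import (
    direct_put_container, direct_delete_container)
from swift.common.utils import Timestamp

node = {'ip': '127.0.0.1', 'port': 6201, 'device': 'sdb1'}
part = 42  # normally looked up in the container ring

# No X-Timestamp supplied, so gen_headers() stamps the request with "now".
direct_put_container(node, part, 'AUTH_test', 'backups',
                     headers={'X-Container-Meta-Color': 'blue'})

# An explicit X-Timestamp is preserved (add_timestamp ends up False), which
# lets the caller control where the operation lands in timestamp ordering.
direct_delete_container(
    node, part, 'AUTH_test', 'backups',
    headers={'X-Timestamp': Timestamp(1700000001).internal})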
@@ -211,33 +466,27 @@ def direct_head_object(node, part, account, container, obj, conn_timeout=5, :param obj: object name :param conn_timeout: timeout in seconds for establishing the connection :param response_timeout: timeout in seconds for getting the response - :returns: a dict containing the response's headers (all header names will - be lowercase) + :param headers: dict to be passed into HTTPConnection headers + :returns: a dict containing the response's headers in a HeaderKeyDict + :raises ClientException: HTTP HEAD request failed """ - path = '/%s/%s/%s' % (account, container, obj) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'HEAD', path) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise ClientException( - 'Object server %s:%s direct HEAD %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) - resp_headers = {} + if headers is None: + headers = {} + + headers = gen_headers(headers) + + path = _make_path(account, container, obj) + resp = _make_req(node, part, 'HEAD', path, headers, + 'Object', conn_timeout, response_timeout) + + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): - resp_headers[header.lower()] = value + resp_headers[header] = value return resp_headers def direct_get_object(node, part, account, container, obj, conn_timeout=5, - response_timeout=15, resp_chunk_size=None, headers={}): + response_timeout=15, resp_chunk_size=None, headers=None): """ Get object directly from the object server. @@ -251,23 +500,23 @@ def direct_get_object(node, part, account, container, obj, conn_timeout=5, :param resp_chunk_size: if defined, chunk size of data to read. :param headers: dict to be passed into HTTPConnection headers :returns: a tuple of (response headers, the object's contents) The response - headers will be a dict and all header names will be lowercase. + headers will be a HeaderKeyDict. 
+ :raises ClientException: HTTP GET request failed """ - path = '/%s/%s/%s' % (account, container, obj) + if headers is None: + headers = {} + + ip, port = get_ip_port(node, headers) + path = _make_path(account, container, obj) with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'GET', path, headers=headers) + conn = http_connect(ip, port, node['device'], part, + 'GET', path, headers=gen_headers(headers)) with Timeout(response_timeout): resp = conn.getresponse() if not is_success(resp.status): resp.read() - raise ClientException( - 'Object server %s:%s direct GET %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) + raise DirectClientException('Object', 'GET', node, part, path, resp) + if resp_chunk_size: def _object_body(): @@ -278,16 +527,16 @@ def _object_body(): object_body = _object_body() else: object_body = resp.read() - resp_headers = {} + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): - resp_headers[header.lower()] = value + resp_headers[header] = value return resp_headers, object_body def direct_put_object(node, part, account, container, name, contents, content_length=None, etag=None, content_type=None, headers=None, conn_timeout=5, response_timeout=15, - resp_chunk_size=None): + chunk_size=65535): """ Put object directly from the object server. @@ -305,42 +554,27 @@ def direct_put_object(node, part, account, container, name, contents, :param response_timeout: timeout in seconds for getting the response :param chunk_size: if defined, chunk size of data to send. :returns: etag from the server response + :raises ClientException: HTTP PUT request failed """ - # TODO: Add chunked puts - path = '/%s/%s/%s' % (account, container, name) + + path = _make_path(account, container, name) if headers is None: headers = {} if etag: - headers['ETag'] = etag.strip('"') - if content_length is not None: - headers['Content-Length'] = str(content_length) + headers['ETag'] = normalize_etag(etag) if content_type is not None: headers['Content-Type'] = content_type else: headers['Content-Type'] = 'application/octet-stream' - if not contents: - headers['Content-Length'] = '0' - if isinstance(contents, basestring): - contents = [contents] - headers['X-Timestamp'] = normalize_timestamp(time()) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'PUT', path, headers=headers) - for chunk in contents: - conn.send(chunk) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise ClientException( - 'Object server %s:%s direct PUT %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) - return resp.getheader('etag').strip('"') + # Incase the caller want to insert an object with specific age + add_ts = 'X-Timestamp' not in headers + + resp = _make_req( + node, part, 'PUT', path, gen_headers(headers, add_ts=add_ts), + 'Object', conn_timeout, response_timeout, contents=contents, + content_length=content_length, chunk_size=chunk_size) + + return normalize_etag(resp.getheader('etag')) def direct_post_object(node, part, account, container, name, headers, @@ 
-358,27 +592,13 @@ def direct_post_object(node, part, account, container, name, headers, :param response_timeout: timeout in seconds for getting the response :raises ClientException: HTTP POST request failed """ - path = '/%s/%s/%s' % (account, container, name) - headers['X-Timestamp'] = normalize_timestamp(time()) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'POST', path, headers=headers) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise ClientException( - 'Object server %s:%s direct POST %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) + path = _make_path(account, container, name) + _make_req(node, part, 'POST', path, gen_headers(headers, True), + 'Object', conn_timeout, response_timeout) def direct_delete_object(node, part, account, container, obj, - conn_timeout=5, response_timeout=15, headers={}): + conn_timeout=5, response_timeout=15, headers=None): """ Delete object directly from the object server. @@ -389,25 +609,54 @@ def direct_delete_object(node, part, account, container, obj, :param obj: object name :param conn_timeout: timeout in seconds for establishing the connection :param response_timeout: timeout in seconds for getting the response - :returns: response from server + :raises ClientException: HTTP DELETE request failed """ - path = '/%s/%s/%s' % (account, container, obj) - headers['X-Timestamp'] = normalize_timestamp(time()) + if headers is None: + headers = {} + + headers = gen_headers(headers, add_ts='x-timestamp' not in ( + k.lower() for k in headers)) + + path = _make_path(account, container, obj) + _make_req(node, part, 'DELETE', path, headers, + 'Object', conn_timeout, response_timeout) + + +def direct_get_suffix_hashes(node, part, suffixes, conn_timeout=5, + response_timeout=15, headers=None): + """ + Get suffix hashes directly from the object server. + + Note that unlike other ``direct_client`` functions, this one defaults + to using the replication network to make requests. 
+ + :param node: node dictionary from the ring + :param part: partition the container is on + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: dict to be passed into HTTPConnection headers + :returns: dict of suffix hashes + :raises ClientException: HTTP REPLICATE request failed + """ + if headers is None: + headers = {} + + headers.setdefault(USE_REPLICATION_NETWORK_HEADER, 'true') + ip, port = get_ip_port(node, headers) + path = '/%s' % '-'.join(suffixes) with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, headers) + conn = http_connect(ip, port, + node['device'], part, 'REPLICATE', path, + headers=gen_headers(headers)) with Timeout(response_timeout): resp = conn.getresponse() - resp.read() if not is_success(resp.status): - raise ClientException( - 'Object server %s:%s direct DELETE %s gave status %s' % - (node['ip'], node['port'], - repr('/%s/%s%s' % (node['device'], part, path)), - resp.status), - http_host=node['ip'], http_port=node['port'], - http_device=node['device'], http_status=resp.status, - http_reason=resp.reason) + raise DirectClientException('Object', 'REPLICATE', + node, part, path, resp, + host={'ip': node['replication_ip'], + 'port': node['replication_port']} + ) + return pickle.loads(resp.read()) # nosec: B301 def retry(func, *args, **kwargs): @@ -421,28 +670,23 @@ def retry(func, *args, **kwargs): :param kwargs: keyward arguments to send to func (if retries or error_log are sent, they will be deleted from kwargs before sending on to func) - :returns: restult of func - """ - retries = 5 - if 'retries' in kwargs: - retries = kwargs['retries'] - del kwargs['retries'] - error_log = None - if 'error_log' in kwargs: - error_log = kwargs['error_log'] - del kwargs['error_log'] + :returns: result of func + :raises ClientException: all retries failed + """ + retries = kwargs.pop('retries', 5) + error_log = kwargs.pop('error_log', None) attempts = 0 backoff = 1 while attempts <= retries: attempts += 1 try: return attempts, func(*args, **kwargs) - except (socket.error, HTTPException, Timeout), err: + except (socket.error, HTTPException, Timeout) as err: if error_log: error_log(err) if attempts > retries: raise - except ClientException, err: + except ClientException as err: if error_log: error_log(err) if attempts > retries or not is_server_error(err.http_status) or \ @@ -453,8 +697,36 @@ def retry(func, *args, **kwargs): # Shouldn't actually get down here, but just in case. if args and 'ip' in args[0]: raise ClientException('Raise too many retries', - http_host=args[ - 0]['ip'], http_port=args[0]['port'], + http_host=args[0]['ip'], + http_port=args[0]['port'], http_device=args[0]['device']) else: raise ClientException('Raise too many retries') + + +def direct_get_recon(node, recon_command, conn_timeout=5, response_timeout=15, + headers=None): + """ + Get recon json directly from the storage server. 
+ + :param node: node dictionary from the ring + :param recon_command: recon string (post /recon/) + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: dict to be passed into HTTPConnection headers + :returns: deserialized json response + :raises DirectClientReconException: HTTP GET request failed + """ + if headers is None: + headers = {} + + ip, port = get_ip_port(node, headers) + path = '/recon/%s' % recon_command + with Timeout(conn_timeout): + conn = http_connect_raw(ip, port, 'GET', path, + headers=gen_headers(headers)) + with Timeout(response_timeout): + resp = conn.getresponse() + if not is_success(resp.status): + raise DirectClientReconException('GET', node, path, resp) + return json.loads(resp.read()) diff --git a/swift/common/error_limiter.py b/swift/common/error_limiter.py new file mode 100644 index 0000000000..715c326847 --- /dev/null +++ b/swift/common/error_limiter.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 NVIDIA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +from time import time + +from swift.common.utils import node_to_string + + +class ErrorLimiter(object): + """ + Tracks the number of errors that have occurred for nodes. A node will be + considered to be error-limited for a given interval of time after it has + accumulated more errors than a given limit. + + :param suppression_interval: The number of seconds for which a node is + error-limited once it has accumulated more than ``suppression_limit`` + errors. Should be a float value. + :param suppression_limit: The number of errors that a node must accumulate + before it is considered to be error-limited. Should be an int value. + """ + def __init__(self, suppression_interval, suppression_limit): + self.suppression_interval = float(suppression_interval) + self.suppression_limit = int(suppression_limit) + self.stats = collections.defaultdict(dict) + + def node_key(self, node): + """ + Get the key under which a node's error stats will be stored. + + :param node: dictionary describing a node. + :return: string key. + """ + return node_to_string(node) + + def is_limited(self, node): + """ + Check if the node is currently error limited. + + :param node: dictionary of node to check + :returns: True if error limited, False otherwise + """ + now = time() + node_key = self.node_key(node) + error_stats = self.stats.get(node_key) + + if error_stats is None or 'errors' not in error_stats: + return False + + if 'last_error' in error_stats and error_stats['last_error'] < \ + now - self.suppression_interval: + self.stats.pop(node_key) + return False + return error_stats['errors'] > self.suppression_limit + + def limit(self, node): + """ + Mark a node as error limited. This immediately pretends the + node received enough errors to trigger error suppression. Use + this for errors like Insufficient Storage. For other errors + use :func:`increment`. 
+ + :param node: dictionary of node to error limit + """ + node_key = self.node_key(node) + error_stats = self.stats[node_key] + error_stats['errors'] = self.suppression_limit + 1 + error_stats['last_error'] = time() + + def increment(self, node): + """ + Increment the error count and update the time of the last error for + the given ``node``. + + :param node: dictionary describing a node. + :returns: True if suppression_limit is exceeded, False otherwise + """ + node_key = self.node_key(node) + error_stats = self.stats[node_key] + error_stats['errors'] = error_stats.get('errors', 0) + 1 + error_stats['last_error'] = time() + return error_stats['errors'] > self.suppression_limit diff --git a/swift/common/exceptions.py b/swift/common/exceptions.py index d377009698..74d1540c0d 100644 --- a/swift/common/exceptions.py +++ b/swift/common/exceptions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ # limitations under the License. from eventlet import Timeout +from swift.common.utils.timestamp import Timestamp class MessageTimeout(Timeout): @@ -30,11 +31,33 @@ class SwiftException(Exception): pass -class SwiftConfigurationError(SwiftException): +class PutterConnectError(Exception): + + def __init__(self, status=None): + self.status = status + + +class InvalidTimestamp(SwiftException): + pass + + +class InsufficientStorage(SwiftException): + pass + + +class FooterNotSupported(SwiftException): + pass + + +class MultiphasePUTNotSupported(SwiftException): + pass + + +class SuffixSyncError(SwiftException): pass -class AuditException(SwiftException): +class RangeAlreadyComplete(SwiftException): pass @@ -42,7 +65,67 @@ class DiskFileError(SwiftException): pass -class DiskFileNotExist(SwiftException): +class DiskFileNotOpen(DiskFileError): + pass + + +class DiskFileQuarantined(DiskFileError): + pass + + +class DiskFileCollision(DiskFileError): + pass + + +class DiskFileNotExist(DiskFileError): + pass + + +class DiskFileStateChanged(DiskFileError): + """ + Raised when state of a DiskFile changes while it is being opened (e.g. the + list of on-disk files changes). Retrying the DiskFile interface may + succeed. 
+ """ + + +class DiskFileDeleted(DiskFileNotExist): + + def __init__(self, metadata=None): + self.metadata = metadata or {} + self.timestamp = Timestamp( + self.metadata.get('X-Timestamp', Timestamp.zero())) + + +class DiskFileExpired(DiskFileDeleted): + pass + + +class DiskFileNoSpace(DiskFileError): + pass + + +class DiskFileDeviceUnavailable(DiskFileError): + pass + + +class DiskFileXattrNotSupported(DiskFileError): + pass + + +class DiskFileBadMetadataChecksum(DiskFileError): + pass + + +class DeviceUnavailable(SwiftException): + pass + + +class DatabaseAuditorException(SwiftException): + pass + + +class InvalidAccountInfo(DatabaseAuditorException): pass @@ -50,7 +133,15 @@ class PathNotDir(OSError): pass -class AuthException(SwiftException): +class DevIdBytesTooSmall(ValueError): + pass + + +class ChunkReadError(SwiftException): + pass + + +class ShortReadError(SwiftException): pass @@ -66,6 +157,10 @@ class ConnectionTimeout(Timeout): pass +class ResponseTimeout(Timeout): + pass + + class DriveNotMounted(SwiftException): pass @@ -74,6 +169,10 @@ class LockTimeout(MessageTimeout): pass +class RingLoadError(SwiftException): + pass + + class RingBuilderError(SwiftException): pass @@ -90,6 +189,18 @@ class DuplicateDeviceError(RingBuilderError): pass +class UnPicklingError(SwiftException): + pass + + +class FileNotFoundError(SwiftException): + pass + + +class PermissionError(SwiftException): + pass + + class ListingIterError(SwiftException): pass @@ -102,3 +213,114 @@ class ListingIterNotAuthorized(ListingIterError): def __init__(self, aresp): self.aresp = aresp + + +class SegmentError(SwiftException): + pass + + +class LinkIterError(SwiftException): + pass + + +class ReplicationException(Exception): + pass + + +class ReplicationLockTimeout(LockTimeout): + pass + + +class PartitionLockTimeout(LockTimeout): + pass + + +class MimeInvalid(SwiftException): + pass + + +class APIVersionError(SwiftException): + pass + + +class EncryptionException(SwiftException): + pass + + +class UnknownSecretIdError(EncryptionException): + pass + + +class QuarantineRequest(SwiftException): + pass + + +class MemcacheConnectionError(Exception): + pass + + +class MemcacheIncrNotFoundError(MemcacheConnectionError): + pass + + +class MemcachePoolTimeout(Timeout): + pass + + +class ClientException(Exception): + + def __init__(self, msg, http_scheme='', http_host='', http_port='', + http_path='', http_query='', http_status=None, http_reason='', + http_device='', http_response_content='', http_headers=None): + super(ClientException, self).__init__(msg) + self.msg = msg + self.http_scheme = http_scheme + self.http_host = http_host + self.http_port = http_port + self.http_path = http_path + self.http_query = http_query + self.http_status = http_status + self.http_reason = http_reason + self.http_device = http_device + self.http_response_content = http_response_content + self.http_headers = http_headers or {} + + def __str__(self): + a = self.msg + b = '' + if self.http_scheme: + b += '%s://' % self.http_scheme + if self.http_host: + b += self.http_host + if self.http_port: + b += ':%s' % self.http_port + if self.http_path: + b += self.http_path + if self.http_query: + b += '?%s' % self.http_query + if self.http_status: + if b: + b = '%s %s' % (b, self.http_status) + else: + b = str(self.http_status) + if self.http_reason: + if b: + b = '%s %s' % (b, self.http_reason) + else: + b = '- %s' % self.http_reason + if self.http_device: + if b: + b = '%s: device %s' % (b, self.http_device) + else: + b = 'device %s' % 
self.http_device + if self.http_response_content: + if len(self.http_response_content) <= 60: + b += ' %s' % self.http_response_content + else: + b += ' [first 60 chars of response] %s' \ + % self.http_response_content[:60] + return b and '%s: %s' % (a, b) or a + + +class InvalidPidFileException(Exception): + pass diff --git a/swift/common/header_key_dict.py b/swift/common/header_key_dict.py new file mode 100644 index 0000000000..f6b906a15e --- /dev/null +++ b/swift/common/header_key_dict.py @@ -0,0 +1,69 @@ +# Copyright (c) 2010-2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class HeaderKeyDict(dict): + """ + A dict that title-cases all keys on the way in, so as to be + case-insensitive. + + Note that all keys and values are expected to be wsgi strings, + though some allowances are made when setting values. + """ + def __init__(self, base_headers=None, **kwargs): + if base_headers: + self.update(base_headers) + self.update(kwargs) + + @staticmethod + def _title(s): + return s.encode('latin1').title().decode('latin1') + + def update(self, other): + if hasattr(other, 'keys'): + for key in other.keys(): + self[self._title(key)] = other[key] + else: + for key, value in other: + self[self._title(key)] = value + + def __getitem__(self, key): + return dict.get(self, self._title(key)) + + def __setitem__(self, key, value): + key = self._title(key) + if value is None: + self.pop(key, None) + elif isinstance(value, bytes): + return dict.__setitem__(self, key, value.decode('latin-1')) + else: + return dict.__setitem__(self, key, str(value)) + + def __contains__(self, key): + return dict.__contains__(self, self._title(key)) + + def __delitem__(self, key): + return dict.__delitem__(self, self._title(key)) + + def get(self, key, default=None): + return dict.get(self, self._title(key), default) + + def setdefault(self, key, value=None): + if key not in self: + self[key] = value + return self[key] + + def pop(self, key, default=None): + return dict.pop(self, self._title(key), default) diff --git a/swift/common/http.py b/swift/common/http.py index cc4bd6fdcb..6dc396062d 100644 --- a/swift/common/http.py +++ b/swift/common/http.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
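(Not part of the patch.) A small illustration of the HeaderKeyDict semantics introduced above: keys are title-cased so lookups are case-insensitive, values are coerced to wsgi strings, and assigning None removes a key.

from swift.common.header_key_dict import HeaderKeyDict

headers = HeaderKeyDict({'content-length': 0})
headers['X-Object-Meta-Color'] = b'blue'   # bytes are stored as latin-1 str

assert headers['CONTENT-LENGTH'] == '0'    # lookups are case-insensitive
assert headers.get('x-object-meta-color') == 'blue'

headers['X-Object-Meta-Color'] = None      # assigning None deletes the key
assert 'X-Object-Meta-Color' not in headers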
@@ -21,7 +21,7 @@ def is_informational(status): :param status: http status code :returns: True if status is successful, else False """ - return 100 <= status and status <= 199 + return 100 <= status <= 199 def is_success(status): @@ -31,7 +31,7 @@ def is_success(status): :param status: http status code :returns: True if status is successful, else False """ - return 200 <= status and status <= 299 + return 200 <= status <= 299 def is_redirection(status): @@ -41,7 +41,7 @@ def is_redirection(status): :param status: http status code :returns: True if status is redirection, else False """ - return 300 <= status and status <= 399 + return 300 <= status <= 399 def is_client_error(status): @@ -51,7 +51,7 @@ def is_client_error(status): :param status: http status code :returns: True if status is client error, else False """ - return 400 <= status and status <= 499 + return 400 <= status <= 499 def is_server_error(status): @@ -61,13 +61,13 @@ def is_server_error(status): :param status: http status code :returns: True if status is server error, else False """ - return 500 <= status and status <= 599 + return 500 <= status <= 599 # List of HTTP status codes ############################################################################### -## 1xx Informational +# 1xx Informational ############################################################################### HTTP_CONTINUE = 100 @@ -77,7 +77,7 @@ def is_server_error(status): HTTP_REQUEST_URI_TOO_LONG = 122 ############################################################################### -## 2xx Success +# 2xx Success ############################################################################### HTTP_OK = 200 @@ -91,7 +91,7 @@ def is_server_error(status): HTTP_IM_USED = 226 ############################################################################### -## 3xx Redirection +# 3xx Redirection ############################################################################### HTTP_MULTIPLE_CHOICES = 300 @@ -105,7 +105,7 @@ def is_server_error(status): HTTP_RESUME_INCOMPLETE = 308 ############################################################################### -## 4xx Client Error +# 4xx Client Error ############################################################################### HTTP_BAD_REQUEST = 400 @@ -138,10 +138,11 @@ def is_server_error(status): HTTP_NO_RESPONSE = 444 HTTP_RETRY_WITH = 449 HTTP_BLOCKED_BY_WINDOWS_PARENTAL_CONTROLS = 450 +HTTP_RATE_LIMITED = 498 HTTP_CLIENT_CLOSED_REQUEST = 499 ############################################################################### -## 5xx Server Error +# 5xx Server Error ############################################################################### HTTP_INTERNAL_SERVER_ERROR = 500 diff --git a/swift/common/http_protocol.py b/swift/common/http_protocol.py new file mode 100644 index 0000000000..cce7a9ebd4 --- /dev/null +++ b/swift/common/http_protocol.py @@ -0,0 +1,373 @@ +# Copyright (c) 2010-2022 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
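(Not part of the patch.) A quick, illustrative check that the simplified chained comparisons in swift/common/http.py above classify statuses as before, including the newly added HTTP_RATE_LIMITED constant; the particular constants exercised here are just examples.

from swift.common.http import (
    HTTP_OK, HTTP_NOT_FOUND, HTTP_RATE_LIMITED, HTTP_INSUFFICIENT_STORAGE,
    is_success, is_client_error, is_server_error)

assert is_success(HTTP_OK)                         # 200 <= 200 <= 299
assert is_client_error(HTTP_NOT_FOUND)
assert is_client_error(HTTP_RATE_LIMITED)          # 498 counts as a 4xx
assert is_server_error(HTTP_INSUFFICIENT_STORAGE)  # 507
assert not is_success(HTTP_INSUFFICIENT_STORAGE)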
+ +from eventlet import wsgi, websocket + +from swift.common.utils import generate_trans_id +from swift.common.http import HTTP_NO_CONTENT, HTTP_RESET_CONTENT, \ + HTTP_NOT_MODIFIED + +from eventlet.green.http import client as http_client +from html import escape + + +class SwiftHttpProtocol(wsgi.HttpProtocol): + default_request_version = "HTTP/1.0" + reject_bad_requests = False + + def __init__(self, *args, **kwargs): + # See https://github.com/eventlet/eventlet/pull/590 + self.pre_shutdown_bugfix_eventlet = not getattr( + websocket.WebSocketWSGI, '_WSGI_APP_ALWAYS_IDLE', None) + super().__init__(*args, **kwargs) + + def log_request(self, *a): + """ + Turn off logging requests by the underlying WSGI software. + """ + pass + + def log_message(self, f, *a): + """ + Redirect logging other messages by the underlying WSGI software. + """ + logger = getattr(self.server.app, 'logger', None) + if logger: + logger.error('ERROR WSGI: ' + f, *a) + else: + # eventlet<=0.17.4 doesn't have an error method, and in newer + # versions the output from error is same as info anyway + self.server.log.info('ERROR WSGI: ' + f, *a) + + class MessageClass(wsgi.HttpProtocol.MessageClass): + """Subclass to see when the client didn't provide a Content-Type""" + def get_default_type(self): + """If the client didn't provide a content type, leave it blank.""" + return '' + + def parse_request(self): + """Parse a request (inlined from cpython@7e293984). + + The request should be stored in self.raw_requestline; the results + are in self.command, self.path, self.request_version and + self.headers. + + Return True for success, False for failure; on failure, any relevant + error response has already been sent back. + + """ + self.command = None # set in case of error on the first line + self.request_version = version = self.default_request_version + self.close_connection = True + requestline = self.raw_requestline.decode('iso-8859-1') + requestline = requestline.rstrip('\r\n') + self.requestline = requestline + # Split off \x20 explicitly (see https://bugs.python.org/issue33973) + words = requestline.split(' ') + if len(words) == 0: + return False + + if len(words) >= 3: # Enough to determine protocol version + version = words[-1] + try: + if not version.startswith('HTTP/'): + raise ValueError + base_version_number = version.split('/', 1)[1] + version_number = base_version_number.split(".") + # RFC 2145 section 3.1 says there can be only one "." and + # - major and minor numbers MUST be treated as + # separate integers; + # - HTTP/2.4 is a lower version than HTTP/2.13, which in + # turn is lower than HTTP/12.3; + # - Leading zeros MUST be ignored by recipients. 
+ if len(version_number) != 2: + raise ValueError + version_number = int(version_number[0]), int(version_number[1]) + except (ValueError, IndexError): + self.send_error( + 400, + "Bad request version (%r)" % version) + return False + if version_number >= (1, 1) and \ + self.protocol_version >= "HTTP/1.1": + self.close_connection = False + if version_number >= (2, 0): + self.send_error( + 505, + "Invalid HTTP version (%s)" % base_version_number) + return False + self.request_version = version + + if not 2 <= len(words) <= 3: + self.send_error( + 400, + "Bad request syntax (%r)" % requestline) + return False + command, path = words[:2] + if len(words) == 2: + self.close_connection = True + if command != 'GET': + self.send_error( + 400, + "Bad HTTP/0.9 request type (%r)" % command) + return False + + if path.startswith(('http://', 'https://')): + host, sep, rest = path.partition('//')[2].partition('/') + if sep: + path = '/' + rest + + self.command, self.path = command, path + + # Examine the headers and look for a Connection directive. + try: + self.headers = http_client.parse_headers( + self.rfile, + _class=self.MessageClass) + except http_client.LineTooLong as err: + self.send_error( + 431, + "Line too long", + str(err)) + return False + except http_client.HTTPException as err: + self.send_error( + 431, + "Too many headers", + str(err) + ) + return False + + conntype = self.headers.get('Connection', "") + if conntype.lower() == 'close': + self.close_connection = True + elif (conntype.lower() == 'keep-alive' and + self.protocol_version >= "HTTP/1.1"): + self.close_connection = False + # Examine the headers and look for an Expect directive + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and + self.protocol_version >= "HTTP/1.1" and + self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return True + + def get_environ(self, *args, **kwargs): + environ = wsgi.HttpProtocol.get_environ(self, *args, **kwargs) + header_payload = self.headers.get_payload() + if isinstance(header_payload, list) and len(header_payload) == 1: + header_payload = header_payload[0].get_payload() + if header_payload: + # This shouldn't be here. We must've bumped up against + # https://bugs.python.org/issue37093 + headers_raw = list(environ['headers_raw']) + for line in header_payload.rstrip('\r\n').split('\n'): + if ':' not in line or line[:1] in ' \t': + # Well, we're no more broken than we were before... + # Should we support line folding? + # Should we 400 a bad header line? 
+ break + header, value = line.split(':', 1) + value = value.strip(' \t\n\r') + # NB: Eventlet looks at the headers obj to figure out + # whether the client said the connection should close; + # see https://github.com/eventlet/eventlet/blob/v0.25.0/ + # eventlet/wsgi.py#L504 + self.headers.add_header(header, value) + headers_raw.append((header, value)) + wsgi_key = 'HTTP_' + header.replace('-', '_').encode( + 'latin1').upper().decode('latin1') + if wsgi_key in ('HTTP_CONTENT_LENGTH', + 'HTTP_CONTENT_TYPE'): + wsgi_key = wsgi_key[5:] + environ[wsgi_key] = value + environ['headers_raw'] = tuple(headers_raw) + # Since we parsed some more headers, check to see if they + # change how our wsgi.input should behave + te = environ.get('HTTP_TRANSFER_ENCODING', '').lower() + if te.rsplit(',', 1)[-1].strip() == 'chunked': + environ['wsgi.input'].chunked_input = True + else: + length = environ.get('CONTENT_LENGTH') + if length: + length = int(length) + environ['wsgi.input'].content_length = length + if environ.get('HTTP_EXPECT', '').lower() == '100-continue': + environ['wsgi.input'].wfile = self.wfile + environ['wsgi.input'].wfile_line = \ + b'HTTP/1.1 100 Continue\r\n' + return environ + + def _read_request_line(self): + got = super()._read_request_line() + # See https://github.com/eventlet/eventlet/pull/590 + if self.pre_shutdown_bugfix_eventlet: + self.conn_state[2] = wsgi.STATE_REQUEST + return got + + def handle_one_request(self): + got = super().handle_one_request() + # See https://github.com/eventlet/eventlet/pull/590 + if self.pre_shutdown_bugfix_eventlet: + if self.conn_state[2] != wsgi.STATE_CLOSE: + self.conn_state[2] = wsgi.STATE_IDLE + return got + + def send_error(self, code, message=None, explain=None): + """Send and log an error reply, we are overriding the cpython parent + class method, so we can have logger generate txn_id's for error + response from wsgi since we are at the edge of the proxy server. + This sends an error response (so it must be called before any output + has been generated), logs the error, and finally sends a piece of HTML + explaining the error to the user. + + :param code: an HTTP error code + 3 digits + :param message: a simple optional 1 line reason phrase. + *( HTAB / SP / VCHAR / %x80-FF ) + defaults to short entry matching the response code + :param explain: a detailed message defaults to the long entry + matching the response code. + """ + + try: + shortmsg, longmsg = self.responses[code] + except KeyError: + shortmsg, longmsg = '???', '???' + if message is None: + message = shortmsg + if explain is None: + explain = longmsg + + try: + # assume we have a LogAdapter + txn_id = self.server.app.logger.txn_id # just in case it was set + except AttributeError: + # turns out we don't have a LogAdapter, so go direct + txn_id = generate_trans_id('') + self.log_error("code %d, message %s, (txn: %s)", code, + message, txn_id) + else: + # we do have a LogAdapter, but likely not yet a txn_id + txn_id = txn_id or generate_trans_id('') + self.server.app.logger.txn_id = txn_id + self.log_error("code %d, message %s", code, message) + self.send_response(code, message) + self.send_header('Connection', 'close') + + # Message body is omitted for cases described in: + # - RFC7230: 3.3. 1xx, 204(No Content), 304(Not Modified) + # - RFC7231: 6.3.6. 
205(Reset Content) + body = None + exclude_status = (HTTP_NO_CONTENT, + HTTP_RESET_CONTENT, + HTTP_NOT_MODIFIED) + if (code >= 200 and + code not in exclude_status): + # HTML encode to prevent Cross Site Scripting attacks + # (see bug https://bugs.python.org/issue1100201) + content = (self.error_message_format % { + 'code': code, + 'message': escape(message, quote=False), + 'explain': escape(explain, quote=False) + }) + body = content.encode('UTF-8', 'replace') + self.send_header("Content-Type", self.error_content_type) + self.send_header('Content-Length', str(len(body))) + self.send_header('X-Trans-Id', txn_id) + self.send_header('X-Openstack-Request-Id', txn_id) + self.end_headers() + + if self.command != 'HEAD' and body: + self.wfile.write(body) + + +class SwiftHttpProxiedProtocol(SwiftHttpProtocol): + """ + Protocol object that speaks HTTP, including multiple requests, but with + a single PROXY line as the very first thing coming in over the socket. + This is so we can learn what the client's IP address is when Swift is + behind a TLS terminator, like hitch, that does not understand HTTP and + so cannot add X-Forwarded-For or other similar headers. + + See http://www.haproxy.org/download/1.7/doc/proxy-protocol.txt for + protocol details. + """ + def __init__(self, *a, **kw): + self.proxy_address = None + SwiftHttpProtocol.__init__(self, *a, **kw) + + def handle_error(self, connection_line): + connection_line = connection_line.decode('latin-1') + + # No further processing will proceed on this connection under any + # circumstances. We always send the request into the superclass to + # handle any cleanup - this ensures that the request will not be + # processed. + self.rfile.close() + # We don't really have any confidence that an HTTP Error will be + # processable by the client as our transmission broken down between + # ourselves and our gateway proxy before processing the client + # protocol request. Hopefully the operator will know what to do! + msg = 'Invalid PROXY line %r' % connection_line + # Even assuming HTTP we don't even known what version of HTTP the + # client is sending? This entire endeavor seems questionable. + self.request_version = self.default_request_version + # appease http.server + self.command = 'PROXY' + self.send_error(400, msg) + + def handle(self): + """Handle multiple requests if necessary.""" + # ensure the opening line for the connection is a valid PROXY protcol + # line; this is the only IO we do on this connection before any + # additional wrapping further pollutes the raw socket. + connection_line = self.rfile.readline(self.server.url_length_limit) + + if not connection_line.startswith(b'PROXY '): + return self.handle_error(connection_line) + + proxy_parts = connection_line.strip(b'\r\n').split(b' ') + if proxy_parts[1].startswith(b'UNKNOWN'): + # "UNKNOWN", in PROXY protocol version 1, means "not + # TCP4 or TCP6". This includes completely legitimate + # things like QUIC or Unix domain sockets. The PROXY + # protocol (section 2.1) states that the receiver + # (that's us) MUST ignore anything after "UNKNOWN" and + # before the CRLF, essentially discarding the first + # line. 
+ pass + elif proxy_parts[1] in (b'TCP4', b'TCP6') and len(proxy_parts) == 6: + self.client_address = ( + proxy_parts[2].decode('latin-1'), + proxy_parts[4].decode('latin-1')) + self.proxy_address = ( + proxy_parts[3].decode('latin-1'), + proxy_parts[5].decode('latin-1')) + else: + self.handle_error(connection_line) + + return SwiftHttpProtocol.handle(self) + + def get_environ(self, *args, **kwargs): + environ = SwiftHttpProtocol.get_environ(self, *args, **kwargs) + if self.proxy_address: + environ['SERVER_ADDR'] = self.proxy_address[0] + environ['SERVER_PORT'] = self.proxy_address[1] + if self.proxy_address[1] == '443': + environ['wsgi.url_scheme'] = 'https' + environ['HTTPS'] = 'on' + return environ diff --git a/swift/common/internal_client.py b/swift/common/internal_client.py index f861412cae..883cd4e16e 100644 --- a/swift/common/internal_client.py +++ b/swift/common/internal_client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from eventlet import sleep, Timeout +from eventlet import sleep, Timeout, spawn +from eventlet.green import socket +from eventlet.green.http import client as http_client +from eventlet.green.urllib import request as urllib_request import json -from paste.deploy import loadapp +import urllib import struct -from sys import exc_info -from urllib import quote +from sys import exit import zlib +from time import gmtime, strftime, time from zlib import compressobj -from swift.common.http import HTTP_NOT_FOUND -from swift.common.swob import Request +from swift.common.exceptions import ClientException +from swift.common.http import (HTTP_NOT_FOUND, HTTP_MULTIPLE_CHOICES, + is_client_error, is_server_error) +from swift.common.middleware.gatekeeper import GatekeeperMiddleware +from swift.common.request_helpers import USE_REPLICATION_NETWORK_HEADER +from swift.common.swob import Request, bytes_to_wsgi +from swift.common.utils import quote, close_if_possible, drain_and_close +from swift.common.wsgi import loadapp class UnexpectedResponse(Exception): @@ -35,7 +44,7 @@ class UnexpectedResponse(Exception): """ def __init__(self, message, resp): - super(UnexpectedResponse, self).__init__(self, message) + super(UnexpectedResponse, self).__init__(message) self.resp = resp @@ -49,13 +58,25 @@ class CompressingFileReader(object): :param file_obj: File object to wrap. :param compresslevel: Compression level, defaults to 9. + :param chunk_size: Size of chunks read when iterating using object, + defaults to 4096. """ - def __init__(self, file_obj, compresslevel=9): + def __init__(self, file_obj, compresslevel=9, chunk_size=4096): self._f = file_obj + self.compresslevel = compresslevel + self.chunk_size = chunk_size + self.set_initial_state() + + def set_initial_state(self): + """ + Sets the object to the state needed for the first read. 
+ """ + + self._f.seek(0) self._compressor = compressobj( - compresslevel, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, - 0) + self.compresslevel, zlib.DEFLATED, -zlib.MAX_WBITS, + zlib.DEF_MEM_LEVEL, 0) self.done = False self.first = True self.crc32 = 0 @@ -71,36 +92,41 @@ def read(self, *a, **kw): """ if self.done: - return '' + return b'' x = self._f.read(*a, **kw) if x: - self.crc32 = zlib.crc32(x, self.crc32) & 0xffffffffL + self.crc32 = zlib.crc32(x, self.crc32) & 0xffffffff self.total_size += len(x) compressed = self._compressor.compress(x) if not compressed: compressed = self._compressor.flush(zlib.Z_SYNC_FLUSH) else: compressed = self._compressor.flush(zlib.Z_FINISH) - crc32 = struct.pack("= HTTP_MULTIPLE_CHOICES: + b''.join(resp.app_iter) break data = json.loads(resp.body) if not data: @@ -245,15 +350,6 @@ def make_path(self, account, container=None, obj=None): not. """ - if isinstance(account, unicode): - account = account.encode('utf-8') - - if isinstance(container, unicode): - container = container.encode('utf-8') - - if isinstance(obj, unicode): - obj = obj.encode('utf-8') - path = '/v1/%s' % quote(account) if container: path += '/%s' % quote(container) @@ -286,17 +382,17 @@ def _set_metadata( """ headers = {} - for k, v in metadata.iteritems(): + for k, v in metadata.items(): if k.lower().startswith(metadata_prefix): headers[k] = v else: headers['%s%s' % (metadata_prefix, k)] = v - self.make_request('POST', path, headers, acceptable_statuses) + self.handle_request('POST', path, headers, acceptable_statuses) # account methods def iter_containers( - self, account, marker='', end_marker='', + self, account, marker='', end_marker='', prefix='', acceptable_statuses=(2, HTTP_NOT_FOUND)): """ Returns an iterator of containers dicts from an account. @@ -305,6 +401,7 @@ def iter_containers( :param marker: Prefix of first desired item, defaults to ''. :param end_marker: Last item returned will be 'less' than this, defaults to ''. + :param prefix: Prefix of containers :param acceptable_statuses: List of status for valid responses, defaults to (2, HTTP_NOT_FOUND). @@ -315,7 +412,36 @@ def iter_containers( """ path = self.make_path(account) - return self._iter_items(path, marker, end_marker, acceptable_statuses) + return self._iter_items(path, marker, end_marker, prefix, + acceptable_statuses) + + def create_account(self, account): + """ + Creates an account. + + :param account: Account to create. + :raises UnexpectedResponse: Exception raised when requests fail + to get a response with an acceptable status + :raises Exception: Exception is raised when code fails in an + unexpected way. + """ + path = self.make_path(account) + self.handle_request('PUT', path, {}, (201, 202)) + + def delete_account(self, account, acceptable_statuses=(2, HTTP_NOT_FOUND)): + """ + Deletes an account. + + :param account: Account to delete. + :param acceptable_statuses: List of status for valid responses, + defaults to (2, HTTP_NOT_FOUND). + :raises UnexpectedResponse: Exception raised when requests fail + to get a response with an acceptable status + :raises Exception: Exception is raised when code fails in an + unexpected way. 
+ """ + path = self.make_path(account) + self.handle_request('DELETE', path, {}, acceptable_statuses) def get_account_info( self, account, acceptable_statuses=(2, HTTP_NOT_FOUND)): @@ -334,13 +460,15 @@ def get_account_info( path = self.make_path(account) resp = self.make_request('HEAD', path, {}, acceptable_statuses) + if not resp.status_int // 100 == 2: + return (0, 0) return (int(resp.headers.get('x-account-container-count', 0)), int(resp.headers.get('x-account-object-count', 0))) def get_account_metadata( - self, account, metadata_prefix='', acceptable_statuses=(2,)): - """ - Gets account metadata. + self, account, metadata_prefix='', acceptable_statuses=(2,), + params=None): + """Gets account metadata. :param account: Account on which to get the metadata. :param metadata_prefix: Used to filter values from the headers @@ -349,7 +477,7 @@ def get_account_metadata( :param acceptable_statuses: List of status for valid responses, defaults to (2,). - :returns : Returns dict of account metadata. + :returns: Returns dict of account metadata. Keys will be lowercase. :raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status @@ -358,7 +486,8 @@ def get_account_metadata( """ path = self.make_path(account) - return self._get_metadata(path, metadata_prefix, acceptable_statuses) + return self._get_metadata(path, metadata_prefix, acceptable_statuses, + headers=None, params=params) def set_account_metadata( self, account, metadata, metadata_prefix='', @@ -390,23 +519,22 @@ def set_account_metadata( # container methods def container_exists(self, account, container): - """ - Checks to see if a container exists. + """Checks to see if a container exists. :param account: The container's account. :param container: Container to check. - :returns : True if container exists, false otherwise. - :raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status :raises Exception: Exception is raised when code fails in an unexpected way. + + :returns: True if container exists, false otherwise. """ path = self.make_path(account, container) resp = self.make_request('HEAD', path, {}, (2, HTTP_NOT_FOUND)) - return resp.status_int != HTTP_NOT_FOUND + return not resp.status_int == HTTP_NOT_FOUND def create_container( self, account, container, headers=None, acceptable_statuses=(2,)): @@ -427,10 +555,11 @@ def create_container( headers = headers or {} path = self.make_path(account, container) - self.make_request('PUT', path, headers, acceptable_statuses) + self.handle_request('PUT', path, headers, acceptable_statuses) def delete_container( - self, account, container, acceptable_statuses=(2, HTTP_NOT_FOUND)): + self, account, container, headers=None, + acceptable_statuses=(2, HTTP_NOT_FOUND)): """ Deletes a container. @@ -445,14 +574,14 @@ def delete_container( unexpected way. """ + headers = headers or {} path = self.make_path(account, container) - self.make_request('DELETE', path, {}, acceptable_statuses) + self.handle_request('DELETE', path, headers, acceptable_statuses) def get_container_metadata( self, account, container, metadata_prefix='', - acceptable_statuses=(2,)): - """ - Gets container metadata. + acceptable_statuses=(2,), params=None): + """Gets container metadata. :param account: The container's account. :param container: Container to get metadata on. @@ -462,7 +591,7 @@ def get_container_metadata( :param acceptable_statuses: List of status for valid responses, defaults to (2,). 
- :returns : Returns dict of container metadata. + :returns: Returns dict of container metadata. Keys will be lowercase. :raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status @@ -471,10 +600,11 @@ def get_container_metadata( """ path = self.make_path(account, container) - return self._get_metadata(path, metadata_prefix, acceptable_statuses) + return self._get_metadata(path, metadata_prefix, acceptable_statuses, + params=params) def iter_objects( - self, account, container, marker='', end_marker='', + self, account, container, marker='', end_marker='', prefix='', acceptable_statuses=(2, HTTP_NOT_FOUND)): """ Returns an iterator of object dicts from a container. @@ -484,6 +614,7 @@ def iter_objects( :param marker: Prefix of first desired item, defaults to ''. :param end_marker: Last item returned will be 'less' than this, defaults to ''. + :param prefix: Prefix of objects :param acceptable_statuses: List of status for valid responses, defaults to (2, HTTP_NOT_FOUND). @@ -494,7 +625,8 @@ def iter_objects( """ path = self.make_path(account, container) - return self._iter_items(path, marker, end_marker, acceptable_statuses) + return self._iter_items(path, marker, end_marker, prefix, + acceptable_statuses) def set_container_metadata( self, account, container, metadata, metadata_prefix='', @@ -528,7 +660,8 @@ def set_container_metadata( def delete_object( self, account, container, obj, - acceptable_statuses=(2, HTTP_NOT_FOUND)): + acceptable_statuses=(2, HTTP_NOT_FOUND), + headers=None): """ Deletes an object. @@ -537,6 +670,7 @@ def delete_object( :param obj: The object. :param acceptable_statuses: List of status for valid responses, defaults to (2, HTTP_NOT_FOUND). + :param headers: extra headers to send with request :raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status @@ -545,13 +679,13 @@ def delete_object( """ path = self.make_path(account, container, obj) - self.make_request('DELETE', path, {}, acceptable_statuses) + self.handle_request('DELETE', path, (headers or {}), + acceptable_statuses) def get_object_metadata( self, account, container, obj, metadata_prefix='', - acceptable_statuses=(2,)): - """ - Gets object metadata. + acceptable_statuses=(2,), headers=None, params=None): + """Gets object metadata. :param account: The object's account. :param container: The object's container. @@ -561,8 +695,9 @@ def get_object_metadata( keys in the dict returned. Defaults to ''. :param acceptable_statuses: List of status for valid responses, defaults to (2,). + :param headers: extra headers to send with request - :returns : Dict of object metadata. + :returns: Dict of object metadata. :raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status @@ -571,7 +706,35 @@ def get_object_metadata( """ path = self.make_path(account, container, obj) - return self._get_metadata(path, metadata_prefix, acceptable_statuses) + return self._get_metadata(path, metadata_prefix, acceptable_statuses, + headers=headers, params=params) + + def get_object(self, account, container, obj, headers=None, + acceptable_statuses=(2,), params=None): + """ + Gets an object. + + :param account: The object's account. + :param container: The object's container. + :param obj: The object name. + :param headers: Headers to send with request, defaults to empty dict. + :param acceptable_statuses: List of status for valid responses, + defaults to (2,). 
+ :param params: A dict of params to be set in request query string, + defaults to None. + + :raises UnexpectedResponse: Exception raised when requests fail + to get a response with an acceptable status + :raises Exception: Exception is raised when code fails in an + unexpected way. + :returns: A 3-tuple (status, headers, iterator of object body) + """ + + headers = headers or {} + path = self.make_path(account, container, obj) + resp = self.make_request( + 'GET', path, headers, acceptable_statuses, params=params) + return (resp.status_int, resp.headers, resp.app_iter) def iter_object_lines( self, account, container, obj, headers=None, @@ -584,7 +747,7 @@ def iter_object_lines( :param account: The object's account. :param container: The object's container. - :param objec_namet: The object. + :param obj: The object. :param acceptable_statuses: List of status for valid responses, defaults to (2,). @@ -596,10 +759,11 @@ def iter_object_lines( headers = headers or {} path = self.make_path(account, container, obj) - resp = self.make_request('GET', path, headers, acceptable_statuses) + if not resp.status_int // 100 == 2: + return - last_part = '' + last_part = b'' compressed = obj.endswith('.gz') # magic in the following zlib.decompressobj argument is courtesy of # Python decompressing gzip chunk-by-chunk @@ -608,7 +772,7 @@ def iter_object_lines( for chunk in resp.app_iter: if compressed: chunk = d.decompress(chunk) - parts = chunk.split('\n') + parts = chunk.split(b'\n') if len(parts) == 1: last_part = last_part + parts[0] else: @@ -647,13 +811,17 @@ def set_object_metadata( path, metadata, metadata_prefix, acceptable_statuses) def upload_object( - self, fobj, account, container, obj, headers=None): + self, fobj, account, container, obj, headers=None, + acceptable_statuses=(2,), params=None): """ :param fobj: File object to read object's content from. :param account: The object's account. :param container: The object's container. :param obj: The object. - :param headers: Headers to send with request, defaults ot empty dict. + :param headers: Headers to send with request, defaults to empty dict. + :param acceptable_statuses: List of acceptable statuses for request. + :param params: A dict of params to be set in request query string, + defaults to None. 
:raises UnexpectedResponse: Exception raised when requests fail to get a response with an acceptable status @@ -662,6 +830,174 @@ def upload_object( """ headers = dict(headers or {}) - headers['Transfer-Encoding'] = 'chunked' + if 'Content-Length' not in headers: + headers['Transfer-Encoding'] = 'chunked' path = self.make_path(account, container, obj) - self.make_request('PUT', path, headers, (2,), fobj) + self.handle_request('PUT', path, headers, acceptable_statuses, fobj, + params=params) + + +def get_auth(url, user, key, auth_version='1.0', **kwargs): + if auth_version != '1.0': + exit('ERROR: swiftclient missing, only auth v1.0 supported') + req = urllib_request.Request(url) + req.add_header('X-Auth-User', user) + req.add_header('X-Auth-Key', key) + conn = urllib_request.urlopen(req) + headers = conn.info() + return ( + headers.getheader('X-Storage-Url'), + headers.getheader('X-Auth-Token')) + + +class SimpleClient(object): + """ + Simple client that is used in bin/swift-dispersion-* and container sync + """ + def __init__(self, url=None, token=None, starting_backoff=1, + max_backoff=5, retries=5): + self.url = url + self.token = token + self.attempts = 0 # needed in swif-dispersion-populate + self.starting_backoff = starting_backoff + self.max_backoff = max_backoff + self.retries = retries + + def base_request(self, method, container=None, name=None, prefix=None, + headers=None, proxy=None, contents=None, + full_listing=None, logger=None, additional_info=None, + timeout=None, marker=None): + # Common request method + trans_start = time() + url = self.url + + if full_listing: + info, body_data = self.base_request( + method, container, name, prefix, headers, proxy, + timeout=timeout, marker=marker) + listing = body_data + while listing: + marker = listing[-1]['name'] + info, listing = self.base_request( + method, container, name, prefix, headers, proxy, + timeout=timeout, marker=marker) + if listing: + body_data.extend(listing) + return [info, body_data] + + if headers is None: + headers = {} + + if self.token: + headers['X-Auth-Token'] = self.token + + if container: + url = '%s/%s' % (url.rstrip('/'), quote(container)) + + if name: + url = '%s/%s' % (url.rstrip('/'), quote(name)) + else: + params = ['format=json'] + if prefix: + params.append('prefix=%s' % prefix) + + if marker: + params.append('marker=%s' % quote(marker)) + + url += '?' 
+ '&'.join(params) + + req = urllib_request.Request(url, headers=headers, data=contents) + if proxy: + proxy = urllib.parse.urlparse(proxy) + req.set_proxy(proxy.netloc, proxy.scheme) + req.get_method = lambda: method + conn = urllib_request.urlopen(req, timeout=timeout) + body = conn.read() + info = conn.info() + try: + body_data = json.loads(body) + except ValueError: + body_data = None + trans_stop = time() + if logger: + sent_content_length = 0 + for n, v in headers.items(): + nl = n.lower() + if nl == 'content-length': + try: + sent_content_length = int(v) + break + except ValueError: + pass + logger.debug("-> " + " ".join( + quote(str(x) if x else "-", ":/") + for x in ( + strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)), + method, + url, + conn.getcode(), + sent_content_length, + info['content-length'], + trans_start, + trans_stop, + trans_stop - trans_start, + additional_info + ))) + return [info, body_data] + + def retry_request(self, method, **kwargs): + retries = kwargs.pop('retries', self.retries) + self.attempts = 0 + backoff = self.starting_backoff + while self.attempts <= retries: + self.attempts += 1 + try: + return self.base_request(method, **kwargs) + except urllib_request.HTTPError as err: + if is_client_error(err.getcode() or 500): + raise ClientException('Client error', + http_status=err.getcode()) + elif self.attempts > retries: + raise ClientException('Raise too many retries', + http_status=err.getcode()) + except (socket.error, http_client.HTTPException, + urllib_request.URLError): + if self.attempts > retries: + raise + sleep(backoff) + backoff = min(backoff * 2, self.max_backoff) + + def get_account(self, *args, **kwargs): + # Used in swift-dispersion-populate + return self.retry_request('GET', **kwargs) + + def put_container(self, container, **kwargs): + # Used in swift-dispersion-populate + return self.retry_request('PUT', container=container, **kwargs) + + def get_container(self, container, **kwargs): + # Used in swift-dispersion-populate + return self.retry_request('GET', container=container, **kwargs) + + def put_object(self, container, name, contents, **kwargs): + # Used in swift-dispersion-populate + return self.retry_request('PUT', container=container, name=name, + contents=contents.read(), **kwargs) + + +def head_object(url, **kwargs): + """For usage with container sync """ + client = SimpleClient(url=url) + return client.retry_request('HEAD', **kwargs) + + +def put_object(url, **kwargs): + """For usage with container sync """ + client = SimpleClient(url=url) + client.retry_request('PUT', **kwargs) + + +def delete_object(url, **kwargs): + """For usage with container sync """ + client = SimpleClient(url=url) + client.retry_request('DELETE', **kwargs) diff --git a/swift/common/linkat.py b/swift/common/linkat.py new file mode 100644 index 0000000000..a85463a325 --- /dev/null +++ b/swift/common/linkat.py @@ -0,0 +1,82 @@ +# Copyright (c) 2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
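Aside (not part of the patch): SimpleClient.retry_request above retries base_request with exponential backoff capped at max_backoff. A condensed sketch of that loop, using a hypothetical helper name and eventlet's green sleep as in the module imports:

from eventlet import sleep

def call_with_backoff(func, retries=5, starting_backoff=1, max_backoff=5):
    # mirrors the retry/backoff shape of SimpleClient.retry_request
    attempts = 0
    backoff = starting_backoff
    while attempts <= retries:
        attempts += 1
        try:
            return func()
        except Exception:
            # the real client treats HTTP client errors as non-retryable;
            # this sketch simply retries until attempts are exhausted
            if attempts > retries:
                raise
        sleep(backoff)
        backoff = min(backoff * 2, max_backoff)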
+ +import os +import ctypes +from ctypes.util import find_library + +__all__ = ['linkat'] + + +class Linkat(object): + + # From include/uapi/linux/fcntl.h + AT_FDCWD = -100 + AT_SYMLINK_FOLLOW = 0x400 + + __slots__ = '_c_linkat' + + def __init__(self): + libc = ctypes.CDLL(find_library('c'), use_errno=True) + + try: + c_linkat = libc.linkat + except AttributeError: + self._c_linkat = None + return + + c_linkat.argtypes = [ctypes.c_int, ctypes.c_char_p, + ctypes.c_int, ctypes.c_char_p, + ctypes.c_int] + c_linkat.restype = ctypes.c_int + + def errcheck(result, func, arguments): + if result == -1: + errno = ctypes.set_errno(0) + raise IOError(errno, 'linkat: %s' % os.strerror(errno)) + else: + return result + + c_linkat.errcheck = errcheck + + self._c_linkat = c_linkat + + @property + def available(self): + return self._c_linkat is not None + + def __call__(self, olddirfd, oldpath, newdirfd, newpath, flags): + """ + linkat() creates a new link (also known as a hard link) + to an existing file. + + See `man 2 linkat` for more info. + """ + if not self.available: + raise EnvironmentError('linkat not available') + + if not isinstance(olddirfd, int) or not isinstance(newdirfd, int): + raise TypeError("fd must be an integer.") + + if isinstance(oldpath, str): + oldpath = oldpath.encode('utf8') + if isinstance(newpath, str): + newpath = newpath.encode('utf8') + + return self._c_linkat(olddirfd, oldpath, newdirfd, newpath, flags) + + +linkat = Linkat() +del Linkat diff --git a/swift/common/manager.py b/swift/common/manager.py index 855a965d09..68dd2089a9 100644 --- a/swift/common/manager.py +++ b/swift/common/manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,41 +13,52 @@ # See the License for the specific language governing permissions and # limitations under the License. 
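Aside (not part of the patch): a rough usage sketch for the module-level linkat wrapper defined above. The file names are illustrative; AT_FDCWD resolves both paths relative to the current working directory.

from swift.common.linkat import linkat

if linkat.available:
    # hard-link 'orig.dat' to 'copy.dat' (illustrative names), following
    # symlinks on the source path as AT_SYMLINK_FOLLOW requests
    linkat(linkat.AT_FDCWD, 'orig.dat', linkat.AT_FDCWD, 'copy.dat',
           linkat.AT_SYMLINK_FOLLOW)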
-from __future__ import with_statement + import functools import errno +from optparse import OptionParser import os import resource import signal import time import subprocess import re +import sys +import tempfile +from shutil import which -from swift.common.utils import search_tree, remove_file, write_file +from swift.common.utils import search_tree, remove_file, write_file, readconf +from swift.common.exceptions import InvalidPidFileException SWIFT_DIR = '/etc/swift' RUN_DIR = '/var/run/swift' +PROC_DIR = '/proc' -# auth-server has been removed from ALL_SERVERS, start it explicitly ALL_SERVERS = ['account-auditor', 'account-server', 'container-auditor', - 'container-replicator', 'container-server', 'container-sync', + 'container-replicator', 'container-reconciler', + 'container-server', 'container-sharder', 'container-sync', 'container-updater', 'object-auditor', 'object-server', - 'object-expirer', 'object-replicator', 'object-updater', + 'object-expirer', 'object-replicator', + 'object-reconstructor', 'object-updater', 'proxy-server', 'account-replicator', 'account-reaper'] MAIN_SERVERS = ['proxy-server', 'account-server', 'container-server', 'object-server'] REST_SERVERS = [s for s in ALL_SERVERS if s not in MAIN_SERVERS] -GRACEFUL_SHUTDOWN_SERVERS = MAIN_SERVERS + ['auth-server'] +# aliases mapping +ALIASES = {'all': ALL_SERVERS, 'main': MAIN_SERVERS, 'rest': REST_SERVERS} +GRACEFUL_SHUTDOWN_SERVERS = MAIN_SERVERS +SEAMLESS_SHUTDOWN_SERVERS = MAIN_SERVERS START_ONCE_SERVERS = REST_SERVERS # These are servers that match a type (account-*, container-*, object-*) but # don't use that type-server.conf file and instead use their own. -STANDALONE_SERVERS = ['object-expirer'] +STANDALONE_SERVERS = ['container-reconciler'] -KILL_WAIT = 15 # seconds to wait for servers to die +KILL_WAIT = 15 # seconds to wait for servers to die (by default) WARNING_WAIT = 3 # seconds to wait after message that may just be a warning MAX_DESCRIPTORS = 32768 MAX_MEMORY = (1024 * 1024 * 1024) * 2 # 2 GB +MAX_PROCS = 8192 # workers * disks, can get high def setup_env(): @@ -56,13 +67,26 @@ def setup_env(): try: resource.setrlimit(resource.RLIMIT_NOFILE, (MAX_DESCRIPTORS, MAX_DESCRIPTORS)) + except ValueError: + print("WARNING: Unable to modify file descriptor limit. " + "Running as non-root?") + + try: resource.setrlimit(resource.RLIMIT_DATA, (MAX_MEMORY, MAX_MEMORY)) except ValueError: - print _("WARNING: Unable to increase file descriptor limit. " - "Running as non-root?") + print("WARNING: Unable to modify memory limit. " + "Running as non-root?") + + try: + resource.setrlimit(resource.RLIMIT_NPROC, + (MAX_PROCS, MAX_PROCS)) + except ValueError: + print("WARNING: Unable to modify max process limit. " + "Running as non-root?") - os.environ['PYTHON_EGG_CACHE'] = '/tmp' + # Set PYTHON_EGG_CACHE if it isn't already set + os.environ.setdefault('PYTHON_EGG_CACHE', tempfile.gettempdir()) def command(func): @@ -75,14 +99,16 @@ def command(func): func.publicly_accessible = True @functools.wraps(func) - def wrapped(*a, **kw): - rv = func(*a, **kw) + def wrapped(self, *a, **kw): + rv = func(self, *a, **kw) + if len(self.servers) == 0: + return 1 return 1 if rv else 0 return wrapped def watch_server_pids(server_pids, interval=1, **kwargs): - """Monitor a collection of server pids yeilding back those pids that + """Monitor a collection of server pids yielding back those pids that aren't responding to signals. :param server_pids: a dict, lists of pids [int,...] 
keyed on @@ -98,7 +124,7 @@ def watch_server_pids(server_pids, interval=1, **kwargs): try: # let pid stop if it wants to os.waitpid(pid, os.WNOHANG) - except OSError, e: + except OSError as e: if e.errno not in (errno.ECHILD, errno.ESRCH): raise # else no such child/process # check running pids for server @@ -118,36 +144,117 @@ def watch_server_pids(server_pids, interval=1, **kwargs): time.sleep(0.1) +def safe_kill(pid, sig, name): + """Send signal to process and check process name + + : param pid: process id + : param sig: signal to send + : param name: name to ensure target process + """ + + # check process name for SIG_DFL + if sig == signal.SIG_DFL: + try: + proc_file = '%s/%d/cmdline' % (PROC_DIR, pid) + if os.path.exists(proc_file): + with open(proc_file, 'r') as fd: + if name not in fd.read(): + # unknown process is using the pid + raise InvalidPidFileException() + except IOError: + pass + + os.kill(pid, sig) + + +def kill_group(pid, sig): + """Send signal to process group + + : param pid: process id + : param sig: signal to send + """ + # Negative PID means process group + os.kill(-pid, sig) + + +def get_child_pids(pid): + """ + Get the current set of all child PIDs for a PID. + + :param pid: process id + """ + output = subprocess.check_output( + ["ps", "--ppid", str(pid), "--no-headers", "-o", "pid"]) + return {int(pid) for pid in output.split()} + + +def format_server_name(servername): + """ + Formats server name as swift compatible server names + E.g. swift-object-server + + :param servername: server name + :returns: swift compatible server name and its binary name + """ + if '.' in servername: + servername = servername.split('.', 1)[0] + if '-' not in servername: + servername = '%s-server' % servername + cmd = 'swift-%s' % servername + return servername, cmd + + +def verify_server(server): + """ + Check whether the server is among swift servers or not, and also + checks whether the server's binaries are installed or not. + + :param server: name of the server + :returns: True, when the server name is valid and its binaries are found. + False, otherwise. + """ + if not server: + return False + _, cmd = format_server_name(server) + if which(cmd) is None: + return False + return True + + class UnknownCommandError(Exception): pass -class Manager(): +class Manager(object): """Main class for performing commands on groups of servers. 
:param servers: list of server names as strings """ - def __init__(self, servers): - server_names = set() + def __init__(self, servers, run_dir=RUN_DIR): + self.server_names = set() + self._default_strict = True for server in servers: - if server == 'all': - server_names.update(ALL_SERVERS) - elif server == 'main': - server_names.update(MAIN_SERVERS) - elif server == 'rest': - server_names.update(REST_SERVERS) + if server in ALIASES: + self.server_names.update(ALIASES[server]) + self._default_strict = False elif '*' in server: # convert glob to regex - server_names.update([s for s in ALL_SERVERS if - re.match(server.replace('*', '.*'), s)]) + self.server_names.update([ + s for s in ALL_SERVERS if + re.match(server.replace('*', '.*'), s)]) + self._default_strict = False else: - server_names.add(server) + self.server_names.add(server) self.servers = set() - for name in server_names: - self.servers.add(Server(name)) + for name in self.server_names: + if verify_server(name): + self.servers.add(Server(name, run_dir)) + + def __iter__(self): + return iter(self.servers) @command def status(self, **kwargs): @@ -165,14 +272,23 @@ def start(self, **kwargs): setup_env() status = 0 + strict = kwargs.get('strict') + # if strict not set explicitly + if strict is None: + strict = self._default_strict + for server in self.servers: - server.launch(**kwargs) + status += 0 if server.launch(**kwargs) else 1 + + if not strict: + status = 0 + if not kwargs.get('daemon', True): for server in self.servers: try: status += server.interact(**kwargs) except KeyboardInterrupt: - print _('\nuser quit') + print('\nuser quit') self.stop(**kwargs) break elif kwargs.get('wait', True): @@ -209,7 +325,7 @@ def stop(self, **kwargs): for server in self.servers: signaled_pids = server.stop(**kwargs) if not signaled_pids: - print _('No %s running') % server + print('No %s running' % server) else: server_pids[server] = signaled_pids @@ -218,23 +334,56 @@ def stop(self, **kwargs): for p in pids] # keep track of the pids yeiled back as killed for all servers killed_pids = set() + kill_wait = kwargs.get('kill_wait', KILL_WAIT) for server, killed_pid in watch_server_pids(server_pids, - interval=KILL_WAIT, + interval=kill_wait, **kwargs): - print _("%s (%s) appears to have stopped") % (server, killed_pid) + print("%(server)s (%(pid)s) appears to have stopped" % + {'server': server, 'pid': killed_pid}) killed_pids.add(killed_pid) if not killed_pids.symmetric_difference(signaled_pids): - # all proccesses have been stopped + # all processes have been stopped return 0 # reached interval n watch_pids w/o killing all servers + kill_after_timeout = kwargs.get('kill_after_timeout', False) for server, pids in server_pids.items(): if not killed_pids.issuperset(pids): # some pids of this server were not killed - print _('Waited %s seconds for %s to die; giving up') % ( - KILL_WAIT, server) + if kill_after_timeout: + print('Waited %(kill_wait)s seconds for %(server)s ' + 'to die; killing' % + {'kill_wait': kill_wait, 'server': server}) + # Send SIGKILL to all remaining pids + for pid in set(pids.keys()) - killed_pids: + print('Signal %(server)s pid: %(pid)s signal: ' + '%(signal)s' % {'server': server, + 'pid': pid, + 'signal': signal.SIGKILL}) + # Send SIGKILL to process group + try: + kill_group(pid, signal.SIGKILL) + except OSError as e: + # PID died before kill_group can take action? 
+ if e.errno != errno.ESRCH: + raise + else: + print('Waited %(kill_wait)s seconds for %(server)s ' + 'to die; giving up' % + {'kill_wait': kill_wait, 'server': server}) return 1 + @command + def kill(self, **kwargs): + """stop a server (no error if not running) + """ + status = self.stop(**kwargs) + kwargs['quiet'] = True + if status and not self.status(**kwargs): + # only exit error if the server is still running + return status + return 0 + @command def shutdown(self, **kwargs): """allow current requests to finish on supporting servers @@ -259,12 +408,37 @@ def reload(self, **kwargs): """ kwargs['graceful'] = True status = 0 - for server in self.servers: - m = Manager([server.server]) + for server in self.server_names: + m = Manager([server]) status += m.stop(**kwargs) status += m.start(**kwargs) return status + @command + def reload_seamless(self, **kwargs): + """seamlessly re-exec, then shutdown of old listen sockets on + supporting servers + """ + kwargs.pop('graceful', None) + kwargs['seamless'] = True + status = 0 + for server in self.servers: + signaled_pids = server.stop(**kwargs) + if not signaled_pids: + print('No %s running' % server) + status += 1 + return status + + def kill_child_pids(self, **kwargs): + """kill child pids, optionally servicing accepted connections""" + status = 0 + for server in self.servers: + signaled_pids = server.kill_child_pids(**kwargs) + if not signaled_pids: + print('No %s running' % server) + status += 1 + return status + @command def force_reload(self, **kwargs): """alias for reload @@ -279,9 +453,8 @@ def get_command(self, cmd): """ cmd = cmd.lower().replace('-', '_') - try: - f = getattr(self, cmd) - except AttributeError: + f = getattr(self, cmd, None) + if f is None: raise UnknownCommandError(cmd) if not hasattr(f, 'publicly_accessible'): raise UnknownCommandError(cmd) @@ -309,19 +482,22 @@ def run_command(self, cmd, **kwargs): return f(**kwargs) -class Server(): +class Server(object): """Manage operations on a server or group of servers of similar type :param server: name of server """ - def __init__(self, server): - if '-' not in server: - server = '%s-server' % server + def __init__(self, server, run_dir=RUN_DIR): self.server = server.lower() - self.type = server.rsplit('-', 1)[0] - self.cmd = 'swift-%s' % server + if '.' 
in self.server: + self.server, self.conf = self.server.rsplit('.', 1) + else: + self.conf = None + self.server, self.cmd = format_server_name(self.server) + self.type = self.server.rsplit('-', 1)[0] self.procs = [] + self.run_dir = run_dir def __str__(self): return self.server @@ -338,6 +514,9 @@ def __eq__(self, other): except AttributeError: return False + def __ne__(self, other): + return not self.__eq__(other) + def get_pid_file_name(self, conf_file): """Translate conf_file to a corresponding pid_file @@ -347,9 +526,9 @@ def get_pid_file_name(self, conf_file): """ return conf_file.replace( - os.path.normpath(SWIFT_DIR), RUN_DIR, 1).replace( - '%s-server' % self.type, self.server, 1).rsplit( - '.conf', 1)[0] + '.pid' + os.path.normpath(SWIFT_DIR), self.run_dir, 1).replace( + '%s-server' % self.type, self.server, 1).replace( + '.conf', '.pid', 1) def get_conf_file_name(self, pid_file): """Translate pid_file to a corresponding conf_file @@ -361,27 +540,50 @@ def get_conf_file_name(self, pid_file): """ if self.server in STANDALONE_SERVERS: return pid_file.replace( - os.path.normpath(RUN_DIR), SWIFT_DIR, 1)\ - .rsplit('.pid', 1)[0] + '.conf' + os.path.normpath(self.run_dir), SWIFT_DIR, 1).replace( + '.pid', '.conf', 1) else: return pid_file.replace( - os.path.normpath(RUN_DIR), SWIFT_DIR, 1).replace( - self.server, '%s-server' % self.type, 1).rsplit( - '.pid', 1)[0] + '.conf' + os.path.normpath(self.run_dir), SWIFT_DIR, 1).replace( + self.server, '%s-server' % self.type, 1).replace( + '.pid', '.conf', 1) + + def _find_conf_files(self, server_search): + if self.conf is not None: + return search_tree(SWIFT_DIR, server_search, self.conf + '.conf', + dir_ext=self.conf + '.conf.d') + else: + return search_tree(SWIFT_DIR, server_search + '*', '.conf', + dir_ext='.conf.d') def conf_files(self, **kwargs): """Get conf files for this server - :param: number, if supplied will only lookup the nth server + :param number: if supplied will only lookup the nth server :returns: list of conf files """ - if self.server in STANDALONE_SERVERS: - found_conf_files = search_tree(SWIFT_DIR, self.server + '*', - '.conf') + if self.server == 'object-expirer': + def has_expirer_section(conf_path): + try: + readconf(conf_path, section_name="object-expirer") + except ValueError: + return False + else: + return True + + # config of expirer is preferentially read from object-server + # section. If all object-server.conf doesn't have object-expirer + # section, object-expirer.conf is used. + found_conf_files = [ + conf for conf in self._find_conf_files("object-server") + if has_expirer_section(conf) + ] or self._find_conf_files("object-expirer") + elif self.server in STANDALONE_SERVERS: + found_conf_files = self._find_conf_files(self.server) else: - found_conf_files = search_tree(SWIFT_DIR, '%s-server*' % self.type, - '.conf') + found_conf_files = self._find_conf_files("%s-server" % self.type) + number = kwargs.get('number') if number: try: @@ -390,27 +592,47 @@ def conf_files(self, **kwargs): conf_files = [] else: conf_files = found_conf_files + + def dump_found_configs(): + if found_conf_files: + print('Found configs:') + for i, conf_file in enumerate(found_conf_files): + print(' %d) %s' % (i + 1, conf_file)) + if not conf_files: # maybe there's a config file(s) out there, but I couldn't find it! 
if not kwargs.get('quiet'): - print _('Unable to locate config %sfor %s') % ( - ('number %s ' % number if number else ''), self.server) + if number: + print('Unable to locate config number %(number)s for' + ' %(server)s' % + {'number': number, 'server': self.server}) + else: + print('Unable to locate config for %s' % self.server) if kwargs.get('verbose') and not kwargs.get('quiet'): - if found_conf_files: - print _('Found configs:') - for i, conf_file in enumerate(found_conf_files): - print ' %d) %s' % (i + 1, conf_file) + dump_found_configs() + elif any(["object-expirer" in name for name in conf_files]) and \ + not kwargs.get('quiet'): + print("WARNING: object-expirer.conf is deprecated. " + "Move object-expirers' configuration into " + "object-server.conf.") + if kwargs.get('verbose'): + dump_found_configs() return conf_files def pid_files(self, **kwargs): """Get pid files for this server - :param: number, if supplied will only lookup the nth server + :param number: if supplied will only lookup the nth server :returns: list of pid files """ - pid_files = search_tree(RUN_DIR, '%s*' % self.server, '.pid') + if self.conf is not None: + pid_files = search_tree(self.run_dir, '%s*' % self.server, + exts=[self.conf + '.pid', + self.conf + '.pid.d']) + else: + pid_files = search_tree(self.run_dir, '%s*' % self.server) if kwargs.get('number', 0): conf_files = self.conf_files(**kwargs) # filter pid_files to match the index of numbered conf_file @@ -422,7 +644,37 @@ def iter_pid_files(self, **kwargs): """Generator, yields (pid_file, pids) """ for pid_file in self.pid_files(**kwargs): - yield pid_file, int(open(pid_file).read().strip()) + try: + pid = int(open(pid_file).read().strip()) + except ValueError: + pid = None + yield pid_file, pid + + def _signal_pid(self, sig, pid, pid_file, verbose): + try: + if sig != signal.SIG_DFL: + print('Signal %(server)s pid: %(pid)s signal: ' + '%(signal)s' % + {'server': self.server, 'pid': pid, 'signal': sig}) + safe_kill(pid, sig, 'swift-%s' % self.server) + except InvalidPidFileException: + if verbose: + print('Removing pid file %(pid_file)s with wrong pid ' + '%(pid)d' % {'pid_file': pid_file, 'pid': pid}) + remove_file(pid_file) + return False + except OSError as e: + if e.errno == errno.ESRCH: + # pid does not exist + if verbose: + print("Removing stale pid file %s" % pid_file) + remove_file(pid_file) + elif e.errno == errno.EPERM: + print("No permission to signal PID %d" % pid) + return False + else: + # process exists + return True def signal_pids(self, sig, **kwargs): """Send a signal to pids for this server @@ -434,24 +686,33 @@ def signal_pids(self, sig, **kwargs): """ pids = {} for pid_file, pid in self.iter_pid_files(**kwargs): - try: - if sig != signal.SIG_DFL: - print _('Signal %s pid: %s signal: %s') % (self.server, - pid, sig) - os.kill(pid, sig) - except OSError, e: - if e.errno == errno.ESRCH: - # pid does not exist - if kwargs.get('verbose'): - print _("Removing stale pid file %s") % pid_file - remove_file(pid_file) - elif e.errno == errno.EPERM: - print _("No permission to signal PID %d") % pid - else: - # process exists + if not pid: # Catches None and 0 + print('Removing pid file %s with invalid pid' % pid_file) + remove_file(pid_file) + continue + if self._signal_pid(sig, pid, pid_file, kwargs.get('verbose')): pids[pid] = pid_file return pids + def signal_children(self, sig, **kwargs): + """Send a signal to child pids for this server + + :param sig: signal to send + + :returns: a dict mapping pids (ints) to pid_files (paths) + + """ + 
pids = {} + for pid_file, pid in self.iter_pid_files(**kwargs): + if not pid: # Catches None and 0 + print('Removing pid file %s with invalid pid' % pid_file) + remove_file(pid_file) + continue + for pid in get_child_pids(pid): + if self._signal_pid(sig, pid, pid_file, kwargs.get('verbose')): + pids[pid] = pid_file + return pids + def get_running_pids(self, **kwargs): """Get running pids @@ -464,22 +725,45 @@ def kill_running_pids(self, **kwargs): """Kill running pids :param graceful: if True, attempt SIGHUP on supporting servers + :param seamless: if True, attempt SIGUSR1 on supporting servers :returns: a dict mapping pids (ints) to pid_files (paths) """ graceful = kwargs.get('graceful') + seamless = kwargs.get('seamless') if graceful and self.server in GRACEFUL_SHUTDOWN_SERVERS: sig = signal.SIGHUP + elif seamless and self.server in SEAMLESS_SHUTDOWN_SERVERS: + sig = signal.SIGUSR1 else: sig = signal.SIGTERM return self.signal_pids(sig, **kwargs) + def kill_child_pids(self, **kwargs): + """Kill child pids, leaving server overseer to respawn them + + :param graceful: if True, attempt SIGHUP on supporting servers + :param seamless: if True, attempt SIGUSR1 on supporting servers + + :returns: a dict mapping pids (ints) to pid_files (paths) + + """ + graceful = kwargs.get('graceful') + seamless = kwargs.get('seamless') + if graceful and self.server in GRACEFUL_SHUTDOWN_SERVERS: + sig = signal.SIGHUP + elif seamless and self.server in SEAMLESS_SHUTDOWN_SERVERS: + sig = signal.SIGUSR1 + else: + sig = signal.SIGTERM + return self.signal_children(sig, **kwargs) + def status(self, pids=None, **kwargs): """Display status of server - :param: pids, if not supplied pids will be populated automatically - :param: number, if supplied will only lookup the nth server + :param pids: if not supplied pids will be populated automatically + :param number: if supplied will only lookup the nth server :returns: 1 if server is not running, 0 otherwise """ @@ -491,25 +775,30 @@ def status(self, pids=None, **kwargs): kwargs['quiet'] = True conf_files = self.conf_files(**kwargs) if conf_files: - print _("%s #%d not running (%s)") % (self.server, number, - conf_files[0]) + print("%(server)s #%(number)d not running (%(conf)s)" % + {'server': self.server, 'number': number, + 'conf': conf_files[0]}) else: - print _("No %s running") % self.server + print("No %s running" % self.server) return 1 for pid, pid_file in pids.items(): conf_file = self.get_conf_file_name(pid_file) - print _("%s running (%s - %s)") % (self.server, pid, conf_file) + print("%(server)s running (%(pid)s - %(conf)s)" % + {'server': self.server, 'pid': pid, 'conf': conf_file}) return 0 - def spawn(self, conf_file, once=False, wait=True, daemon=True, **kwargs): + def spawn(self, conf_file, once=False, wait=True, daemon=True, + additional_args=None, **kwargs): """Launch a subprocess for this server. 
:param conf_file: path to conf_file to use as first arg :param once: boolean, add once argument to command :param wait: boolean, if true capture stdout with a pipe - :param daemon: boolean, if true ask server to log to console + :param daemon: boolean, if false ask server to log to console + :param additional_args: list of additional arguments to pass + on the command line - :returns : the pid of the spawned process + :returns: the pid of the spawned process """ args = [self.cmd, conf_file] if once: @@ -517,6 +806,10 @@ def spawn(self, conf_file, once=False, wait=True, daemon=True, **kwargs): if not daemon: # ask the server to log to console args.append('verbose') + if additional_args: + if isinstance(additional_args, str): + additional_args = [additional_args] + args.extend(additional_args) # figure out what we're going to do with stdio if not daemon: @@ -542,10 +835,20 @@ def wait(self, **kwargs): """ status = 0 for proc in self.procs: - # wait for process to close its stdout - output = proc.stdout.read() + # wait for process to close its stdout (if we haven't done that) + if proc.stdout.closed: + output = '' + else: + output = proc.stdout.read().decode('utf8', 'backslashreplace') + proc.stdout.close() + + if kwargs.get('once', False): + # if you don't want once to wait you can send it to the + # background on the command line, I generally just run with + # no-daemon anyway, but this is quieter + proc.wait() if output: - print output + print(output) start = time.time() # wait for process to die (output may just be a warning) while time.time() - start < WARNING_WAIT: @@ -562,7 +865,7 @@ def interact(self, **kwargs): status = 0 for proc in self.procs: # wait for process to terminate - proc.communicate() + proc.communicate() # should handle closing pipes if proc.returncode: status += 1 return status @@ -573,7 +876,7 @@ def launch(self, **kwargs): """ conf_files = self.conf_files(**kwargs) if not conf_files: - return [] + return {} pids = self.get_running_pids(**kwargs) @@ -586,14 +889,17 @@ def launch(self, **kwargs): # any unstarted instances if conf_file in conf_files: already_started = True - print _("%s running (%s - %s)") % (self.server, pid, conf_file) + print("%(server)s running (%(pid)s - %(conf)s)" % + {'server': self.server, 'pid': pid, 'conf': conf_file}) elif not kwargs.get('number', 0): already_started = True - print _("%s running (%s - %s)") % (self.server, pid, pid_file) + print("%(server)s running (%(pid)s - %(pid_file)s)" % + {'server': self.server, 'pid': pid, + 'pid_file': pid_file}) if already_started: - print _("%s already started...") % self.server - return [] + print("%s already started..." % self.server) + return {} if self.server not in START_ONCE_SERVERS: kwargs['once'] = False @@ -601,17 +907,19 @@ def launch(self, **kwargs): pids = {} for conf_file in conf_files: if kwargs.get('once'): - msg = _('Running %s once') % self.server + msg = 'Running %s once' % self.server else: - msg = _('Starting %s') % self.server - print '%s...(%s)' % (msg, conf_file) + msg = 'Starting %s' % self.server + print('%s...(%s)' % (msg, conf_file)) try: pid = self.spawn(conf_file, **kwargs) - except OSError, e: + except OSError as e: if e.errno == errno.ENOENT: - # TODO: should I check if self.cmd exists earlier? - print _("%s does not exist") % self.cmd + # TODO(clayg): should I check if self.cmd exists earlier? 
+ print("%s does not exist" % self.cmd) break + else: + raise pids[pid] = conf_file return pids @@ -623,3 +931,102 @@ def stop(self, **kwargs): """ return self.kill_running_pids(**kwargs) + + +USAGE = \ + """%prog [.] [[.] ...] [options] + +where: + is the name of a swift service e.g. proxy-server. + The '-server' part of the name may be omitted. + 'all', 'main' and 'rest' are reserved words that represent a + group of services. + all: Expands to all swift daemons. + main: Expands to main swift daemons. + (proxy, container, account, object) + rest: Expands to all remaining background daemons (beyond + "main"). + (updater, replicator, auditor, etc) + is an explicit configuration filename without the + .conf extension. If is specified then should + refer to a directory containing the configuration file, e.g.: + + swift-init object.1 start + + will start an object-server using the configuration file + /etc/swift/object-server/1.conf + is a command from the list below. + +Commands: +""" + '\n'.join(["%16s: %s" % x for x in Manager.list_commands()]) + + +def main(): + parser = OptionParser(USAGE) + parser.add_option('-v', '--verbose', action="store_true", + default=False, help="display verbose output") + parser.add_option('-w', '--no-wait', action="store_false", dest="wait", + default=True, help="won't wait for server to start " + "before returning") + parser.add_option('-o', '--once', action="store_true", + default=False, help="only run one pass of daemon") + # this is a negative option, default is options.daemon = True + parser.add_option('-n', '--no-daemon', action="store_false", dest="daemon", + default=True, help="start server interactively") + parser.add_option('-g', '--graceful', action="store_true", + default=False, help="send SIGHUP to supporting servers") + parser.add_option('-c', '--config-num', metavar="N", type="int", + dest="number", default=0, + help="send command to the Nth server only") + parser.add_option('-k', '--kill-wait', metavar="N", type="int", + dest="kill_wait", default=KILL_WAIT, + help="wait N seconds for processes to die (default 15)") + parser.add_option('-r', '--run-dir', type="str", + dest="run_dir", default=RUN_DIR, + help="alternative directory to store running pid files " + "default: %s" % RUN_DIR) + # Changing behaviour if missing config + parser.add_option('--strict', dest='strict', action='store_true', + help="Return non-zero status code if some config is " + "missing. Default mode if all servers are " + "explicitly named.") + # a negative option for strict + parser.add_option('--non-strict', dest='strict', action='store_false', + help="Return zero status code even if some config is " + "missing. 
Default mode if any server is a glob or " + "one of aliases `all`, `main` or `rest`.") + # SIGKILL daemon after kill_wait period + parser.add_option('--kill-after-timeout', dest='kill_after_timeout', + action='store_true', + help="Kill daemon and all children after kill-wait " + "period.") + + options, args = parser.parse_args() + + if len(args) < 2: + parser.print_help() + print('ERROR: specify server(s) and command') + return 1 + + command = args[-1] + servers = args[:-1] + + # this is just a silly swap for me cause I always try to "start main" + commands = dict(Manager.list_commands()).keys() + if command not in commands and servers[0] in commands: + servers.append(command) + command = servers.pop(0) + + manager = Manager(servers, run_dir=options.run_dir) + try: + status = manager.run_command(command, **options.__dict__) + except UnknownCommandError: + parser.print_help() + print('ERROR: unknown command, %s' % command) + status = 1 + + return 1 if status else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/swift/common/memcached.py b/swift/common/memcached.py index 42e9b34efd..9ffd861f34 100644 --- a/swift/common/memcached.py +++ b/swift/common/memcached.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,43 +14,83 @@ # limitations under the License. """ +Why our own memcache client? +By Michael Barton + +python-memcached doesn't use consistent hashing, so adding or +removing a memcache server from the pool invalidates a huge +percentage of cached items. + +If you keep a pool of python-memcached client objects, each client +object has its own connection to every memcached server, only one of +which is ever in use. So you wind up with n * m open sockets and +almost all of them idle. This client effectively has a pool for each +server, so the number of backend connections is hopefully greatly +reduced. + +python-memcache uses pickle to store things, and there was already a +huge stink about Swift using pickles in memcache +(http://osvdb.org/show/osvdb/86581). That seemed sort of unfair, +since nova and keystone and everyone else use pickles for memcache +too, but it's hidden behind a "standard" library. But changing would +be a security regression at this point. + +Also, pylibmc wouldn't work for us because it needs to use python +sockets in order to play nice with eventlet. + Lucid comes with memcached: v1.4.2. Protocol documentation for that version is at: http://github.com/memcached/memcached/blob/1.4.2/doc/protocol.txt """ -import cPickle as pickle +import os +import json import logging -import socket -import time +# the name of 'time' module is changed to 'tm', to avoid changing the +# signatures of member functions in this file. 
+import time as tm from bisect import bisect -from hashlib import md5 -try: - import simplejson as json -except ImportError: - import json +from eventlet.green import socket, ssl +from eventlet.pools import Pool +from eventlet import Timeout +from configparser import ConfigParser, NoSectionError, NoOptionError +from swift.common import utils +from swift.common.exceptions import MemcacheConnectionError, \ + MemcacheIncrNotFoundError, MemcachePoolTimeout +from swift.common.utils import md5, human_readable, config_true_value, \ + memcached_timing_stats DEFAULT_MEMCACHED_PORT = 11211 CONN_TIMEOUT = 0.3 +POOL_TIMEOUT = 1.0 # WAG IO_TIMEOUT = 2.0 PICKLE_FLAG = 1 JSON_FLAG = 2 NODE_WEIGHT = 50 -PICKLE_PROTOCOL = 2 TRY_COUNT = 3 # if ERROR_LIMIT_COUNT errors occur in ERROR_LIMIT_TIME seconds, the server # will be considered failed for ERROR_LIMIT_DURATION seconds. ERROR_LIMIT_COUNT = 10 -ERROR_LIMIT_TIME = 60 -ERROR_LIMIT_DURATION = 60 +ERROR_LIMIT_TIME = ERROR_LIMIT_DURATION = 60 +DEFAULT_ITEM_SIZE_WARNING_THRESHOLD = -1 + +# Different sample rates for emitting Memcached timing stats. +TIMING_SAMPLE_RATE_HIGH = 0.1 +TIMING_SAMPLE_RATE_MEDIUM = 0.01 +TIMING_SAMPLE_RATE_LOW = 0.001 + +# The max value of a delta expiration time. +EXPTIME_MAXDELTA = 30 * 24 * 60 * 60 def md5hash(key): - return md5(key).hexdigest() + if not isinstance(key, bytes): + key = key.encode('utf-8', errors='surrogateescape') + return md5(key, usedforsecurity=False).hexdigest().encode('ascii') def sanitize_timeout(timeout): @@ -60,13 +100,95 @@ def sanitize_timeout(timeout): translates negative values to mean a delta of 30 days in seconds (and 1 additional second), client beware. """ - if timeout > (30 * 24 * 60 * 60): - timeout += time.time() - return timeout + if timeout > EXPTIME_MAXDELTA: + timeout += tm.time() + return int(timeout) + + +def set_msg(key, flags, timeout, value): + if not isinstance(key, bytes): + raise TypeError('key must be bytes') + if not isinstance(value, bytes): + raise TypeError('value must be bytes') + return b' '.join([ + b'set', + key, + str(flags).encode('ascii'), + str(timeout).encode('ascii'), + str(len(value)).encode('ascii'), + ]) + (b'\r\n' + value + b'\r\n') + + +class MemcacheConnPool(Pool): + """ + Connection pool for Memcache Connections + + The *server* parameter can be a hostname, an IPv4 address, or an IPv6 + address with an optional port. See + :func:`swift.common.utils.parse_socket_string` for details. 
+ """ + + def __init__(self, server, size, connect_timeout, tls_context=None): + Pool.__init__(self, max_size=size) + self.host, self.port = utils.parse_socket_string( + server, DEFAULT_MEMCACHED_PORT) + self._connect_timeout = connect_timeout + self._tls_context = tls_context + + def create(self): + addrs = socket.getaddrinfo(self.host, self.port, socket.AF_UNSPEC, + socket.SOCK_STREAM) + family, socktype, proto, canonname, sockaddr = addrs[0] + sock = socket.socket(family, socket.SOCK_STREAM) + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + try: + with Timeout(self._connect_timeout): + sock.connect(sockaddr) + if self._tls_context: + sock = self._tls_context.wrap_socket(sock, + server_hostname=self.host) + except (Exception, Timeout): + sock.close() + raise + return (sock.makefile('rwb'), sock) + + def get(self): + fp, sock = super(MemcacheConnPool, self).get() + try: + if fp is None: + # An error happened previously, so we need a new connection + fp, sock = self.create() + return fp, sock + except MemcachePoolTimeout: + # This is the only place that knows an item was successfully taken + # from the pool, so it has to be responsible for repopulating it. + # Any other errors should get handled in _get_conns(); see the + # comment about timeouts during create() there. + self.put((None, None)) + raise + + +class MemcacheCommand(object): + """ + Helper class that encapsulates common parameters of a command. + + :param method: the name of the MemcacheRing method that was called. + :param key: the memcached key. + """ + __slots__ = ('method', 'key', 'command', 'hash_key') + def __init__(self, method, key): + self.method = method + self.key = key + self.command = method.encode() + self.hash_key = md5hash(key) -class MemcacheConnectionError(Exception): - pass + @property + def key_prefix(self): + # get the prefix of a user provided memcache key by removing the + # content after the last '/', all current usages within swift are using + # prefix, such as "shard-updating-v2", "nvratelimit" and etc. + return self.key.rsplit('/', 1)[0] class MemcacheRing(object): @@ -74,140 +196,299 @@ class MemcacheRing(object): Simple, consistent-hashed memcache client. 
""" - def __init__(self, servers, connect_timeout=CONN_TIMEOUT, - io_timeout=IO_TIMEOUT, tries=TRY_COUNT, - allow_pickle=False, allow_unpickle=False): + def __init__( + self, servers, connect_timeout=CONN_TIMEOUT, + io_timeout=IO_TIMEOUT, pool_timeout=POOL_TIMEOUT, + tries=TRY_COUNT, + max_conns=2, tls_context=None, logger=None, + error_limit_count=ERROR_LIMIT_COUNT, + error_limit_time=ERROR_LIMIT_TIME, + error_limit_duration=ERROR_LIMIT_DURATION, + item_size_warning_threshold=DEFAULT_ITEM_SIZE_WARNING_THRESHOLD): self._ring = {} self._errors = dict(((serv, []) for serv in servers)) self._error_limited = dict(((serv, 0) for serv in servers)) + self._error_limit_count = error_limit_count + self._error_limit_time = error_limit_time + self._error_limit_duration = error_limit_duration for server in sorted(servers): - for i in xrange(NODE_WEIGHT): + for i in range(NODE_WEIGHT): self._ring[md5hash('%s-%s' % (server, i))] = server self._tries = tries if tries <= len(servers) else len(servers) - self._sorted = sorted(self._ring.keys()) - self._client_cache = dict(((server, []) for server in servers)) + self._sorted = sorted(self._ring) + self._client_cache = dict(( + (server, MemcacheConnPool(server, max_conns, connect_timeout, + tls_context=tls_context)) + for server in servers)) self._connect_timeout = connect_timeout self._io_timeout = io_timeout - self._allow_pickle = allow_pickle - self._allow_unpickle = allow_unpickle or allow_pickle + self._pool_timeout = pool_timeout + if logger is None: + self.logger = logging.getLogger() + else: + self.logger = logger + self.item_size_warning_threshold = item_size_warning_threshold + + @property + def memcache_servers(self): + return list(self._client_cache.keys()) + + def _log_error(self, server, cmd, action, msg): + self.logger.error( + "Error %(action)s to memcached: %(server)s" + ": with key_prefix %(key_prefix)s, method %(method)s: %(msg)s", + {'action': action, 'server': server, 'key_prefix': cmd.key_prefix, + 'method': cmd.method, 'msg': msg}) - def _exception_occurred(self, server, e, action='talking'): - if isinstance(e, socket.timeout): - logging.error(_("Timeout %(action)s to memcached: %(server)s"), - {'action': action, 'server': server}) + """ + Handles exceptions. + + :param server: a server. + :param e: an exception. + :param cmd: an instance of MemcacheCommand. + :param conn_start_time: the time at which the failed operation started. + :param action: a verb describing the operation. + :param sock: an optional socket that needs to be closed by this method. + :param fp: an optional file pointer that needs to be closed by this method. + :param got_connection: if ``True``, the server's connection will be reset + in the cached connection pool. + """ + def _exception_occurred(self, server, e, cmd, conn_start_time, + action='talking', sock=None, + fp=None, got_connection=True): + if isinstance(e, Timeout): + self.logger.error( + "Timeout %(action)s to memcached: %(server)s" + ": with key_prefix %(key_prefix)s, method %(method)s, " + "config_timeout %(config_timeout)s, time_spent %(time_spent)s", + {'action': action, 'server': server, + 'key_prefix': cmd.key_prefix, 'method': cmd.method, + 'config_timeout': e.seconds, + 'time_spent': tm.time() - conn_start_time}) + self.logger.timing_since( + 'memcached.' 
+ cmd.method + '.timeout.timing', + conn_start_time) + elif isinstance(e, (socket.error, MemcacheConnectionError)): + self.logger.error( + "Error %(action)s to memcached: %(server)s: " + "with key_prefix %(key_prefix)s, method %(method)s, " + "time_spent %(time_spent)s, %(err)s", + {'action': action, 'server': server, + 'key_prefix': cmd.key_prefix, 'method': cmd.method, + 'time_spent': tm.time() - conn_start_time, 'err': e}) + self.logger.timing_since( + 'memcached.' + cmd.method + '.conn_err.timing', + conn_start_time) else: - logging.exception(_("Error %(action)s to memcached: %(server)s"), - {'action': action, 'server': server}) - now = time.time() - self._errors[server].append(time.time()) - if len(self._errors[server]) > ERROR_LIMIT_COUNT: + self.logger.exception( + "Error %(action)s to memcached: %(server)s" + ": with key_prefix %(key_prefix)s, method %(method)s, " + "time_spent %(time_spent)s", + {'action': action, 'server': server, + 'key_prefix': cmd.key_prefix, 'method': cmd.method, + 'time_spent': tm.time() - conn_start_time}) + self.logger.timing_since( + 'memcached.' + cmd.method + '.errors.timing', conn_start_time) + + try: + if fp: + fp.close() + del fp + except Exception: + pass + try: + if sock: + sock.close() + del sock + except Exception: + pass + if got_connection: + # We need to return something to the pool + # A new connection will be created the next time it is retrieved + self._return_conn(server, None, None) + + if isinstance(e, MemcacheIncrNotFoundError): + # these errors can be caused by other greenthreads not yielding to + # the incr greenthread often enough, rather than a server problem, + # so don't error limit the server + return + + if self._error_limit_time <= 0 or self._error_limit_duration <= 0: + return + + now = tm.time() + self._errors[server].append(now) + if len(self._errors[server]) > self._error_limit_count: self._errors[server] = [err for err in self._errors[server] - if err > now - ERROR_LIMIT_TIME] - if len(self._errors[server]) > ERROR_LIMIT_COUNT: - self._error_limited[server] = now + ERROR_LIMIT_DURATION - logging.error(_('Error limiting server %s'), server) + if err > now - self._error_limit_time] + if len(self._errors[server]) > self._error_limit_count: + self._error_limited[server] = now + self._error_limit_duration + self.logger.error('Error limiting server %s', server) - def _get_conns(self, key): + def _get_conns(self, cmd): """ Retrieves a server conn from the pool, or connects a new one. Chooses the server based on a consistent hash of "key". + + :param cmd: an instance of MemcacheCommand. 
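# Illustrative stand-alone sketch of the server selection described above:
# every server is hashed NODE_WEIGHT times onto a ring of md5 values, and a
# key's candidate servers are found with bisect, walking clockwise until
# `tries` distinct servers are collected (Swift caps tries at the server count;
# NODE_WEIGHT below is an assumed stand-in for the module constant).
import hashlib
from bisect import bisect

NODE_WEIGHT = 50

def build_ring(servers):
    ring = {}
    for server in sorted(servers):
        for i in range(NODE_WEIGHT):
            point = hashlib.md5(('%s-%s' % (server, i)).encode()).hexdigest()
            ring[point] = server
    return ring, sorted(ring)

def servers_for_key(ring, sorted_points, hash_key, tries):
    pos = bisect(sorted_points, hash_key)
    served = []
    while len(served) < tries:
        pos = (pos + 1) % len(sorted_points)
        server = ring[sorted_points[pos]]
        if server not in served:
            served.append(server)
    return served

ring, points = build_ring(['10.0.0.1:11211', '10.0.0.2:11211'])
key = hashlib.md5(b'some/cache/key').hexdigest()
print(servers_for_key(ring, points, key, tries=2))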
+ :return: generator to serve memcached connection """ - pos = bisect(self._sorted, key) + pos = bisect(self._sorted, cmd.hash_key) served = [] + any_yielded = False while len(served) < self._tries: pos = (pos + 1) % len(self._sorted) server = self._ring[self._sorted[pos]] if server in served: continue served.append(server) - if self._error_limited[server] > time.time(): + pool_start_time = tm.time() + if self._error_limited[server] > pool_start_time: continue + sock = None try: - fp, sock = self._client_cache[server].pop() + with MemcachePoolTimeout(self._pool_timeout): + fp, sock = self._client_cache[server].get() + any_yielded = True yield server, fp, sock - except IndexError: - try: - if ':' in server: - host, port = server.split(':') - else: - host = server - port = DEFAULT_MEMCACHED_PORT - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) - sock.settimeout(self._connect_timeout) - sock.connect((host, int(port))) - sock.settimeout(self._io_timeout) - yield server, sock.makefile(), sock - except Exception, e: - self._exception_occurred(server, e, 'connecting') + except MemcachePoolTimeout as e: + self._exception_occurred(server, e, cmd, pool_start_time, + action='getting a connection', + got_connection=False) + except (Exception, Timeout) as e: + # Typically a Timeout exception caught here is the one raised + # by the create() method of this server's MemcacheConnPool + # object. + self._exception_occurred(server, e, cmd, pool_start_time, + action='connecting', sock=sock) + if not any_yielded: + self._log_error('ALL', cmd, 'connecting', + 'No more memcached servers to try') def _return_conn(self, server, fp, sock): - """ Returns a server connection to the pool """ - self._client_cache[server].append((fp, sock)) - - def set(self, key, value, serialize=True, timeout=0): + """Returns a server connection to the pool.""" + self._client_cache[server].put((fp, sock)) + + # Sample rates of different memcached operations are based on generic + # swift usage patterns. + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_HIGH) + def set(self, key, value, serialize=True, time=0, + min_compress_len=0, raise_on_error=False): """ Set a key/value pair in memcache :param key: key :param value: value :param serialize: if True, value is serialized with JSON before sending - to memcache, or with pickle if configured to use - pickle instead of JSON (to avoid cache poisoning) - :param timeout: ttl in memcache + to memcache + :param time: the time to live + :param min_compress_len: minimum compress length, this parameter was + added to keep the signature compatible with + python-memcached interface. This + implementation ignores it. + :param raise_on_error: if True, propagate Timeouts and other errors. + By default, errors are ignored. 
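# Illustrative sketch of the memcached ASCII "set" frame that the method above
# sends; this module builds it via set_msg(), and the helper below is only an
# assumed equivalent, not the actual implementation:
#   set <hashed key> <flags> <expiry> <byte count>\r\n<value>\r\n
def sketch_set_msg(hash_key, flags, timeout, value):
    return b' '.join([
        b'set', hash_key,
        str(flags).encode('ascii'),
        str(timeout).encode('ascii'),
        str(len(value)).encode('ascii'),
    ]) + b'\r\n' + value + b'\r\n'

# e.g. a JSON-serialized value (the flags bit is assumed to be the JSON flag)
print(sketch_set_msg(b'0f3a' * 8, 2, 300, b'"cached-value"'))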
""" - key = md5hash(key) - timeout = sanitize_timeout(timeout) + cmd = MemcacheCommand('set', key) + timeout = sanitize_timeout(time) flags = 0 - if serialize and self._allow_pickle: - value = pickle.dumps(value, PICKLE_PROTOCOL) - flags |= PICKLE_FLAG - elif serialize: - value = json.dumps(value) + if serialize: + if isinstance(value, bytes): + value = value.decode('utf8') + value = json.dumps(value).encode('ascii') flags |= JSON_FLAG - for (server, fp, sock) in self._get_conns(key): + elif not isinstance(value, bytes): + value = str(value).encode('utf-8') + + if 0 <= self.item_size_warning_threshold <= len(value): + self.logger.warning( + "Item size larger than warning threshold: " + "%d (%s) >= %d (%s)", len(value), + human_readable(len(value)), + self.item_size_warning_threshold, + human_readable(self.item_size_warning_threshold)) + + for (server, fp, sock) in self._get_conns(cmd): + conn_start_time = tm.time() try: - sock.sendall('set %s %d %d %s noreply\r\n%s\r\n' % - (key, flags, timeout, len(value), value)) - self._return_conn(server, fp, sock) - return - except Exception, e: - self._exception_occurred(server, e) - - def get(self, key): + with Timeout(self._io_timeout): + sock.sendall(set_msg(cmd.hash_key, flags, timeout, value)) + # Wait for the set to complete + msg = fp.readline().strip() + if msg != b'STORED': + msg = msg.decode('ascii') + raise MemcacheConnectionError('failed set: %s' % msg) + self._return_conn(server, fp, sock) + return + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + if raise_on_error: + raise MemcacheConnectionError( + "No memcached connections succeeded.") + + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_MEDIUM) + def get(self, key, raise_on_error=False): """ Gets the object specified by key. It will also unserialize the object - before returning if it is serialized in memcache with JSON, or if it - is pickled and unpickling is allowed. + before returning if it is serialized in memcache with JSON. :param key: key + :param raise_on_error: if True, propagate Timeouts and other errors. + By default, errors are treated as cache misses. 
:returns: value of the key in memcache """ - key = md5hash(key) + cmd = MemcacheCommand('get', key) value = None - for (server, fp, sock) in self._get_conns(key): + for (server, fp, sock) in self._get_conns(cmd): + conn_start_time = tm.time() try: - sock.sendall('get %s\r\n' % key) - line = fp.readline().strip().split() - while line[0].upper() != 'END': - if line[0].upper() == 'VALUE' and line[1] == key: - size = int(line[3]) - value = fp.read(size) - if int(line[2]) & PICKLE_FLAG: - if self._allow_unpickle: - value = pickle.loads(value) - else: - value = None - elif int(line[2]) & JSON_FLAG: - value = json.loads(value) - fp.readline() + with Timeout(self._io_timeout): + sock.sendall(b'get ' + cmd.hash_key + b'\r\n') line = fp.readline().strip().split() - self._return_conn(server, fp, sock) - return value - except Exception, e: - self._exception_occurred(server, e) - - def incr(self, key, delta=1, timeout=0): + while True: + if not line: + raise MemcacheConnectionError('incomplete read') + if line[0].upper() == b'END': + break + if (line[0].upper() == b'VALUE' and + line[1] == cmd.hash_key): + size = int(line[3]) + value = fp.read(size) + if int(line[2]) & PICKLE_FLAG: + value = None + if int(line[2]) & JSON_FLAG: + value = json.loads(value) + fp.readline() + line = fp.readline().strip().split() + self._return_conn(server, fp, sock) + return value + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + if raise_on_error: + raise MemcacheConnectionError( + "No memcached connections succeeded.") + + def _incr_or_decr(self, fp, sock, cmd, delta): + sock.sendall(b' '.join([cmd.command, cmd.hash_key, delta]) + b'\r\n') + line = fp.readline().strip().split() + if not line: + raise MemcacheConnectionError('incomplete read') + if line[0].upper() == b'NOT_FOUND': + return None + return int(line[0].strip()) + + def _add(self, fp, sock, cmd, add_val, timeout): + sock.sendall(b' '.join([ + b'add', cmd.hash_key, b'0', str(timeout).encode('ascii'), + str(len(add_val)).encode('ascii') + ]) + b'\r\n' + add_val + b'\r\n') + line = fp.readline().strip().split() + return None if line[0].upper() == b'NOT_STORED' else int(add_val) + + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_LOW) + def incr(self, key, delta=1, time=0): """ Increments a key which has a numeric value by delta. If the key can't be found, it's added as delta or 0 if delta < 0. 
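# Illustrative sketch of the incr fallback described above, written against a
# connected memcached file/socket pair (fp and sock here are assumptions, and
# only positive deltas / "incr" are shown; "decr" is symmetrical): try incr,
# on NOT_FOUND add the starting value, and if that add loses a race to another
# client (NOT_STORED) retry the incr once.
def sketch_incr(fp, sock, hash_key, delta, ttl):
    def try_incr():
        sock.sendall(b'incr %s %d\r\n' % (hash_key, delta))
        line = fp.readline().strip()
        return None if line.upper() == b'NOT_FOUND' else int(line)

    new_val = try_incr()
    if new_val is None:
        start = b'%d' % delta
        sock.sendall(b'add %s 0 %d %d\r\n%s\r\n'
                     % (hash_key, ttl, len(start), start))
        if fp.readline().strip().upper() == b'NOT_STORED':
            new_val = try_incr()   # another client added the key first
        else:
            new_val = delta
    return new_val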
@@ -220,41 +501,40 @@ def incr(self, key, delta=1, timeout=0): :param key: key :param delta: amount to add to the value of key (or set as the value if the key is not found) will be cast to an int - :param timeout: ttl in memcache + :param time: the time to live + :returns: result of incrementing :raises MemcacheConnectionError: """ - key = md5hash(key) - command = 'incr' - if delta < 0: - command = 'decr' - delta = str(abs(int(delta))) - timeout = sanitize_timeout(timeout) - for (server, fp, sock) in self._get_conns(key): + cmd = MemcacheCommand('incr' if delta >= 0 else 'decr', key) + delta_val = str(abs(int(delta))).encode('ascii') + timeout = sanitize_timeout(time) + for (server, fp, sock) in self._get_conns(cmd): + conn_start_time = tm.time() try: - sock.sendall('%s %s %s\r\n' % (command, key, delta)) - line = fp.readline().strip().split() - if line[0].upper() == 'NOT_FOUND': - add_val = delta - if command == 'decr': - add_val = '0' - sock.sendall('add %s %d %d %s\r\n%s\r\n' % - (key, 0, timeout, len(add_val), add_val)) - line = fp.readline().strip().split() - if line[0].upper() == 'NOT_STORED': - sock.sendall('%s %s %s\r\n' % (command, key, delta)) - line = fp.readline().strip().split() - ret = int(line[0].strip()) - else: - ret = int(add_val) - else: - ret = int(line[0].strip()) - self._return_conn(server, fp, sock) - return ret - except Exception, e: - self._exception_occurred(server, e) - raise MemcacheConnectionError("No Memcached connections succeeded.") - - def decr(self, key, delta=1, timeout=0): + with Timeout(self._io_timeout): + new_val = self._incr_or_decr(fp, sock, cmd, delta_val) + if new_val is None: + add_val = b'0' if cmd.method == 'decr' else delta_val + new_val = self._add(fp, sock, cmd, add_val, timeout) + if new_val is None: + new_val = self._incr_or_decr( + fp, sock, cmd, delta_val) + if new_val is None: + # This can happen if this thread takes more + # than the TTL to get from the first failed + # incr to the second incr, during which time + # the key was concurrently added and expired. + raise MemcacheIncrNotFoundError( + 'expired ttl=%s' % time) + self._return_conn(server, fp, sock) + return new_val + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + raise MemcacheConnectionError("No memcached connections succeeded.") + + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_LOW) + def decr(self, key, delta=1, time=0): """ Decrements a key which has a numeric value by delta. Calls incr with -delta. @@ -263,97 +543,215 @@ def decr(self, key, delta=1, timeout=0): :param delta: amount to subtract to the value of key (or set the value to 0 if the key is not found) will be cast to an int - :param timeout: ttl in memcache + :param time: the time to live + :returns: result of decrementing :raises MemcacheConnectionError: """ - self.incr(key, delta=-delta, timeout=timeout) + return self.incr(key, delta=-delta, time=time) - def delete(self, key): + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_HIGH) + def delete(self, key, server_key=None): """ Deletes a key/value pair from memcache. 
:param key: key to be deleted + :param server_key: key to use in determining which server in the ring + is used """ - key = md5hash(key) - for (server, fp, sock) in self._get_conns(key): + cmd = server_cmd = MemcacheCommand('delete', key) + if server_key: + server_cmd = MemcacheCommand('delete', server_key) + for (server, fp, sock) in self._get_conns(server_cmd): + conn_start_time = tm.time() try: - sock.sendall('delete %s noreply\r\n' % key) - self._return_conn(server, fp, sock) - return - except Exception, e: - self._exception_occurred(server, e) - - def set_multi(self, mapping, server_key, serialize=True, timeout=0): + with Timeout(self._io_timeout): + sock.sendall(b'delete ' + cmd.hash_key + b'\r\n') + # Wait for the delete to complete + fp.readline() + self._return_conn(server, fp, sock) + return + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_HIGH) + def set_multi(self, mapping, server_key, serialize=True, time=0, + min_compress_len=0): """ Sets multiple key/value pairs in memcache. - :param mapping: dictonary of keys and values to be set in memcache - :param servery_key: key to use in determining which server in the ring + :param mapping: dictionary of keys and values to be set in memcache + :param server_key: key to use in determining which server in the ring is used :param serialize: if True, value is serialized with JSON before sending - to memcache, or with pickle if configured to use - pickle instead of JSON (to avoid cache poisoning) - :param timeout: ttl for memcache + to memcache. + :param time: the time to live + :min_compress_len: minimum compress length, this parameter was added + to keep the signature compatible with + python-memcached interface. This implementation + ignores it """ - server_key = md5hash(server_key) - timeout = sanitize_timeout(timeout) - msg = '' - for key, value in mapping.iteritems(): + cmd = MemcacheCommand('set_multi', server_key) + timeout = sanitize_timeout(time) + msg = [] + for key, value in mapping.items(): key = md5hash(key) flags = 0 - if serialize and self._allow_pickle: - value = pickle.dumps(value, PICKLE_PROTOCOL) - flags |= PICKLE_FLAG - elif serialize: - value = json.dumps(value) + if serialize: + if isinstance(value, bytes): + value = value.decode('utf8') + value = json.dumps(value).encode('ascii') flags |= JSON_FLAG - msg += ('set %s %d %d %s noreply\r\n%s\r\n' % - (key, flags, timeout, len(value), value)) - for (server, fp, sock) in self._get_conns(server_key): + msg.append(set_msg(key, flags, timeout, value)) + for (server, fp, sock) in self._get_conns(cmd): + conn_start_time = tm.time() try: - sock.sendall(msg) - self._return_conn(server, fp, sock) - return - except Exception, e: - self._exception_occurred(server, e) + with Timeout(self._io_timeout): + sock.sendall(b''.join(msg)) + # Wait for the set to complete + for line in range(len(mapping)): + fp.readline() + self._return_conn(server, fp, sock) + return + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + @memcached_timing_stats(sample_rate=TIMING_SAMPLE_RATE_HIGH) def get_multi(self, keys, server_key): """ Gets multiple values from memcache for the given keys. 
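# Illustrative sketch of the multi-key lookup described above: a single
# "get k1 k2 ...\r\n" round trip, with results re-ordered to match the request
# and misses returned as None (the dict stands in for the parsed VALUE
# records; keys and values are made up).
hash_keys = [b'k1', b'k2', b'k3']
request_line = b'get ' + b' '.join(hash_keys) + b'\r\n'
parsed_responses = {b'k1': 'alpha', b'k3': 'gamma'}   # k2 was a miss
values = [parsed_responses.get(k) for k in hash_keys]
print(values)   # ['alpha', None, 'gamma']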
:param keys: keys for values to be retrieved from memcache - :param servery_key: key to use in determining which server in the ring - is used + :param server_key: key to use in determining which server in the ring + is used :returns: list of values """ - server_key = md5hash(server_key) - keys = [md5hash(key) for key in keys] - for (server, fp, sock) in self._get_conns(server_key): + cmd = MemcacheCommand('get_multi', server_key) + hash_keys = [md5hash(key) for key in keys] + for (server, fp, sock) in self._get_conns(cmd): + conn_start_time = tm.time() try: - sock.sendall('get %s\r\n' % ' '.join(keys)) - line = fp.readline().strip().split() - responses = {} - while line[0].upper() != 'END': - if line[0].upper() == 'VALUE': - size = int(line[3]) - value = fp.read(size) - if int(line[2]) & PICKLE_FLAG: - if self._allow_unpickle: - value = pickle.loads(value) - else: - value = None - elif int(line[2]) & JSON_FLAG: - value = json.loads(value) - responses[line[1]] = value - fp.readline() + with Timeout(self._io_timeout): + sock.sendall(b'get ' + b' '.join(hash_keys) + b'\r\n') line = fp.readline().strip().split() - values = [] - for key in keys: - if key in responses: - values.append(responses[key]) - else: - values.append(None) - self._return_conn(server, fp, sock) - return values - except Exception, e: - self._exception_occurred(server, e) + responses = {} + while True: + if not line: + raise MemcacheConnectionError('incomplete read') + if line[0].upper() == b'END': + break + if line[0].upper() == b'VALUE': + size = int(line[3]) + value = fp.read(size) + if int(line[2]) & PICKLE_FLAG: + value = None + elif int(line[2]) & JSON_FLAG: + value = json.loads(value) + responses[line[1]] = value + fp.readline() + line = fp.readline().strip().split() + values = [] + for key in hash_keys: + if key in responses: + values.append(responses[key]) + else: + values.append(None) + self._return_conn(server, fp, sock) + return values + except (Exception, Timeout) as e: + self._exception_occurred(server, e, cmd, conn_start_time, + sock=sock, fp=fp) + + +def load_memcache(conf, logger): + """ + Build a MemcacheRing object from the given config. It will also use the + passed in logger. + + :param conf: a dict, the config options + :param logger: a logger + """ + memcache_servers = conf.get('memcache_servers') + try: + # Originally, while we documented using memcache_max_connections + # we only accepted max_connections + max_conns = int(conf.get('memcache_max_connections', + conf.get('max_connections', 0))) + except ValueError: + max_conns = 0 + + memcache_options = {} + if (not memcache_servers + or max_conns <= 0): + path = os.path.join(conf.get('swift_dir', '/etc/swift'), + 'memcache.conf') + memcache_conf = ConfigParser() + if memcache_conf.read(path): + # if memcache.conf exists we'll start with those base options + try: + memcache_options = dict(memcache_conf.items('memcache')) + except NoSectionError: + pass + + if not memcache_servers: + try: + memcache_servers = \ + memcache_conf.get('memcache', 'memcache_servers') + except (NoSectionError, NoOptionError): + pass + if max_conns <= 0: + try: + new_max_conns = \ + memcache_conf.get('memcache', + 'memcache_max_connections') + max_conns = int(new_max_conns) + except (NoSectionError, NoOptionError, ValueError): + pass + + # while memcache.conf options are the base for the memcache + # middleware, if you set the same option also in the filter + # section of the proxy config it is more specific. 
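# Illustrative sketch of the precedence described in the comment above, with
# made-up option values: options read from memcache.conf form the base, and
# anything also set in the proxy's filter section (passed in as ``conf``)
# takes precedence.
base_from_memcache_conf = {'memcache_servers': '10.0.0.1:11211',
                           'connect_timeout': '0.3'}
filter_section_conf = {'connect_timeout': '0.5'}

sketch_options = dict(base_from_memcache_conf)
sketch_options.update(filter_section_conf)
print(sketch_options['connect_timeout'])   # '0.5' -- the filter value wins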
+ memcache_options.update(conf) + connect_timeout = float(memcache_options.get( + 'connect_timeout', CONN_TIMEOUT)) + pool_timeout = float(memcache_options.get( + 'pool_timeout', POOL_TIMEOUT)) + tries = int(memcache_options.get('tries', TRY_COUNT)) + io_timeout = float(memcache_options.get('io_timeout', IO_TIMEOUT)) + if config_true_value(memcache_options.get('tls_enabled', 'false')): + tls_cafile = memcache_options.get('tls_cafile') + tls_certfile = memcache_options.get('tls_certfile') + tls_keyfile = memcache_options.get('tls_keyfile') + tls_context = ssl.create_default_context( + cafile=tls_cafile) + if tls_certfile: + tls_context.load_cert_chain(tls_certfile, tls_keyfile) + else: + tls_context = None + error_suppression_interval = float(memcache_options.get( + 'error_suppression_interval', ERROR_LIMIT_TIME)) + error_suppression_limit = float(memcache_options.get( + 'error_suppression_limit', ERROR_LIMIT_COUNT)) + item_size_warning_threshold = int(memcache_options.get( + 'item_size_warning_threshold', DEFAULT_ITEM_SIZE_WARNING_THRESHOLD)) + + if not memcache_servers: + memcache_servers = '127.0.0.1:11211' + if max_conns <= 0: + max_conns = 2 + + return MemcacheRing( + [s.strip() for s in memcache_servers.split(',') + if s.strip()], + connect_timeout=connect_timeout, + pool_timeout=pool_timeout, + tries=tries, + io_timeout=io_timeout, + max_conns=max_conns, + tls_context=tls_context, + logger=logger, + error_limit_count=error_suppression_limit, + error_limit_time=error_suppression_interval, + error_limit_duration=error_suppression_interval, + item_size_warning_threshold=item_size_warning_threshold) diff --git a/swift/common/middleware/__init__.py b/swift/common/middleware/__init__.py index e69de29bb2..c465a5caf3 100644 --- a/swift/common/middleware/__init__.py +++ b/swift/common/middleware/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) 2010-2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from swift.common.wsgi import WSGIContext + + +def app_property(name): + return property(lambda self: getattr(self.app, name)) + + +class RewriteContext(WSGIContext): + base_re = None + + def __init__(self, app, requested, rewritten): + super(RewriteContext, self).__init__(app) + self.requested = requested + self.rewritten_re = re.compile(self.base_re % re.escape(rewritten)) + + def handle_request(self, env, start_response): + resp_iter = self._app_call(env) + for i, (header, value) in enumerate(self._response_headers): + if header.lower() in ('location', 'content-location'): + self._response_headers[i] = (header, self.rewritten_re.sub( + r'\1%s\2' % self.requested, value)) + start_response(self._response_status, self._response_headers, + self._response_exc_info) + return resp_iter diff --git a/swift/common/middleware/account_quotas.py b/swift/common/middleware/account_quotas.py new file mode 100644 index 0000000000..ca8d1f0d7a --- /dev/null +++ b/swift/common/middleware/account_quotas.py @@ -0,0 +1,283 @@ +# Copyright (c) 2013 OpenStack Foundation. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``account_quotas`` is a middleware which blocks write requests (PUT, POST) if a +given account quota (in bytes) is exceeded while DELETE requests are still +allowed. + +``account_quotas`` uses the following metadata entries to store the account +quota + ++---------------------------------------------+-------------------------------+ +|Metadata | Use | ++=============================================+===============================+ +| X-Account-Meta-Quota-Bytes (obsoleted) | Maximum overall bytes stored | +| | in account across containers. | ++---------------------------------------------+-------------------------------+ +| X-Account-Quota-Bytes | Maximum overall bytes stored | +| | in account across containers. | ++---------------------------------------------+-------------------------------+ +| X-Account-Quota-Bytes-Policy- | Maximum overall bytes stored | +| | in account across containers, | +| | for the given policy. | ++---------------------------------------------+-------------------------------+ +| X-Account-Quota-Count | Maximum object count under | +| | account. | ++---------------------------------------------+-------------------------------+ +| X-Account-Quota-Count-Policy- | Maximum object count under | +| | account, for the given policy.| ++---------------------------------------------+-------------------------------+ + + +Write requests to those metadata entries are only permitted for resellers. +There is no overall byte or object count limit set if the corresponding +metadata entries are not set. + +Additionally, account quotas, of type quota-bytes or quota-count, may be set +for each storage policy, using metadata of the form ``x-account--\ +policy-``. Again, only resellers may update these metadata, and +there will be no limit for a particular policy if the corresponding metadata +is not set. + +.. note:: + Per-policy quotas need not sum to the overall account quota, and the sum of + all :ref:`container_quotas` for a given policy need not sum to the account's + policy quota. + +The ``account_quotas`` middleware should be added to the pipeline in your +``/etc/swift/proxy-server.conf`` file just after any auth middleware. +For example:: + + [pipeline:main] + pipeline = catch_errors cache tempauth account_quotas proxy-server + + [filter:account_quotas] + use = egg:swift#account_quotas + +To set the quota on an account:: + + swift -A http://127.0.0.1:8080/auth/v1.0 -U account:reseller -K secret \ +post -m quota-bytes:10000 + +Remove the quota:: + + swift -A http://127.0.0.1:8080/auth/v1.0 -U account:reseller -K secret \ +post -m quota-bytes: + +The same limitations apply for the account quotas as for the container quotas. + +For example, when uploading an object without a content-length header the proxy +server doesn't know the final size of the currently uploaded object and the +upload will be allowed if the current account size is within the quota. 
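# Illustrative sketch of the byte-quota check this middleware applies to an
# object PUT, with made-up numbers (the quota and current usage really come
# from account sysmeta and account stats): the request is rejected when the
# account's stored bytes plus the incoming content length exceed the quota.
def exceeds_quota(quota_bytes, account_bytes_used, content_length):
    if quota_bytes < 0:            # -1 means no quota configured
        return False
    return quota_bytes < account_bytes_used + content_length

print(exceeds_quota(10000, 9500, 400))   # False: 9900 fits
print(exceeds_quota(10000, 9500, 600))   # True: 10100 exceeds the quota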
+Due to the eventual consistency further uploads might be possible until the +account size has been updated. +""" + +from swift.common.swob import HTTPForbidden, HTTPBadRequest, \ + HTTPRequestEntityTooLarge, wsgify +from swift.common.registry import register_swift_info +from swift.common.storage_policy import POLICIES +from swift.proxy.controllers.base import get_account_info, get_container_info + + +class AccountQuotaMiddleware(object): + """Account quota middleware + + See above for a full description. + + """ + def __init__(self, app, *args, **kwargs): + self.app = app + + def quota_exceeded(self, request, body): + # request.environ['swift.authorize'](req) is delayed and not called + # immediately to support container acls. However, the middleware should + # still return immediately if any quota is exceeded. + resp = HTTPRequestEntityTooLarge(body=body) + if 'swift.authorize' in request.environ: + orig_authorize = request.environ['swift.authorize'] + + def reject_authorize(*args, **kwargs): + aresp = orig_authorize(*args, **kwargs) + if aresp: + return aresp + return resp + request.environ['swift.authorize'] = reject_authorize + return self.app + else: + return resp + + def validate_and_translate_quotas(self, request, quota_type): + new_quotas = {} + new_quotas[None] = request.headers.get( + 'X-Account-%s' % quota_type) + if request.headers.get( + 'X-Remove-Account-%s' % quota_type): + new_quotas[None] = '' # X-Remove dominates if both are present + + for policy in POLICIES: + tail = 'Account-%s-Policy-%s' % (quota_type, policy.name) + if request.headers.get('X-Remove-' + tail): + new_quotas[policy.idx] = '' + else: + quota = request.headers.pop('X-' + tail, None) + new_quotas[policy.idx] = quota + + if request.environ.get('reseller_request') is True: + if any(quota and not quota.isdigit() + for quota in new_quotas.values()): + raise HTTPBadRequest() + for idx, quota in new_quotas.items(): + if idx is None: + hdr = 'X-Account-Sysmeta-%s' % quota_type + else: + hdr = 'X-Account-Sysmeta-%s-Policy-%d' % (quota_type, idx) + request.headers[hdr] = quota + elif any(quota is not None for quota in new_quotas.values()): + # deny quota set for non-reseller + raise HTTPForbidden() + + def handle_account(self, request): + if request.method in ("POST", "PUT"): + # Support old meta format + for legacy_header in [ + 'X-Account-Meta-Quota-Bytes', + 'X-Remove-Account-Meta-Quota-Bytes', + ]: + new_header = legacy_header.replace('-Meta-', '-') + legacy_value = request.headers.get(legacy_header) + if legacy_value is not None and not \ + request.headers.get(new_header): + request.headers[new_header] = legacy_value + # account request, so we pay attention to the quotas + self.validate_and_translate_quotas(request, "Quota-Bytes") + self.validate_and_translate_quotas(request, "Quota-Count") + resp = request.get_response(self.app) + # Non-resellers can't update quotas, but they *can* see them + # Global quotas + postfixes = ('Quota-Bytes', 'Quota-Count') + for postfix in postfixes: + value = resp.headers.get('X-Account-Sysmeta-%s' % postfix) + if value: + resp.headers['X-Account-%s' % postfix] = value + + # Per policy quotas + for policy in POLICIES: + infixes = ('Quota-Bytes-Policy', 'Quota-Count-Policy') + for infix in infixes: + value = resp.headers.get('X-Account-Sysmeta-%s-%d' % ( + infix, policy.idx)) + if value: + resp.headers['X-Account-%s-%s' % ( + infix, policy.name)] = value + return resp + + @wsgify + def __call__(self, request): + + try: + ver, account, container, obj = request.split_path( + 2, 4, 
rest_with_last=True) + except ValueError: + return self.app + + if not container: + return self.handle_account(request) + # container or object request; even if the quota headers are set + # in the request, they're meaningless + + if not (request.method == "PUT" and obj): + return self.app + # OK, object PUT + + if request.environ.get('reseller_request') is True: + # but resellers aren't constrained by quotas :-) + return self.app + + # Object PUT request + content_length = (request.content_length or 0) + + account_info = get_account_info(request.environ, self.app, + swift_source='AQ') + if not account_info: + return self.app + + # Check for quota byte violation + try: + quota = int( + account_info["sysmeta"].get( + "quota-bytes", account_info["meta"].get("quota-bytes", -1) + ) + ) + except ValueError: + quota = -1 + if quota >= 0: + new_size = int(account_info['bytes']) + content_length + if quota < new_size: + return self.quota_exceeded(request, "Upload exceeds quota.") + + # Check for quota count violation + try: + quota = int(account_info['sysmeta'].get('quota-count', -1)) + except ValueError: + quota = -1 + if quota >= 0: + new_count = int(account_info['total_object_count']) + 1 + if quota < new_count: + return self.quota_exceeded(request, "Upload exceeds quota.") + + container_info = get_container_info(request.environ, self.app, + swift_source='AQ') + if not container_info: + return self.app + policy_idx = container_info['storage_policy'] + + # Check quota-byte per policy + sysmeta_key = 'quota-bytes-policy-%s' % policy_idx + try: + policy_quota = int(account_info['sysmeta'].get(sysmeta_key, -1)) + except ValueError: + policy_quota = -1 + if policy_quota >= 0: + policy_stats = account_info['storage_policies'].get(policy_idx, {}) + new_size = int(policy_stats.get('bytes', 0)) + content_length + if policy_quota < new_size: + return self.quota_exceeded( + request, "Upload exceeds policy quota.") + + # Check quota-count per policy + sysmeta_key = 'quota-count-policy-%s' % policy_idx + try: + policy_quota = int(account_info['sysmeta'].get(sysmeta_key, -1)) + except ValueError: + policy_quota = -1 + if policy_quota >= 0: + policy_stats = account_info['storage_policies'].get(policy_idx, {}) + new_size = int(policy_stats.get('object_count', 0)) + 1 + if policy_quota < new_size: + return self.quota_exceeded( + request, "Upload exceeds policy quota.") + + return self.app + + +def filter_factory(global_conf, **local_conf): + """Returns a WSGI filter app for use with paste.deploy.""" + register_swift_info('account_quotas') + + def account_quota_filter(app): + return AccountQuotaMiddleware(app) + return account_quota_filter diff --git a/swift/common/middleware/acl.py b/swift/common/middleware/acl.py index b542493fff..a8c0e04bce 100644 --- a/swift/common/middleware/acl.py +++ b/swift/common/middleware/acl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from swift.common.utils import urlparse +import json +from urllib.parse import unquote, urlparse def clean_acl(name, value): @@ -89,35 +90,98 @@ def clean_acl(name, value): values = [] for raw_value in value.split(','): raw_value = raw_value.strip() - if raw_value: - if ':' not in raw_value: - values.append(raw_value) - else: - first, second = (v.strip() for v in raw_value.split(':', 1)) - if not first or first[0] != '.': - values.append(raw_value) - elif first in ('.r', '.ref', '.referer', '.referrer'): - if 'write' in name: - raise ValueError('Referrers not allowed in write ACL: ' - '%s' % repr(raw_value)) - negate = False - if second and second[0] == '-': - negate = True - second = second[1:].strip() - if second and second != '*' and second[0] == '*': - second = second[1:].strip() - if not second or second == '.': - raise ValueError('No host/domain value after referrer ' - 'designation in ACL: %s' % - repr(raw_value)) - values.append('.r:%s%s' % (negate and '-' or '', second)) - else: - raise ValueError('Unknown designator %s in ACL: %s' % - (repr(first), repr(raw_value))) + if not raw_value: + continue + if ':' not in raw_value: + values.append(raw_value) + continue + first, second = (v.strip() for v in raw_value.split(':', 1)) + if not first or not first.startswith('.'): + values.append(raw_value) + elif first in ('.r', '.ref', '.referer', '.referrer'): + if 'write' in name: + raise ValueError('Referrers not allowed in write ACL: ' + '%s' % repr(raw_value)) + negate = False + if second and second.startswith('-'): + negate = True + second = second[1:].strip() + if second and second != '*' and second.startswith('*'): + second = second[1:].strip() + if not second or second == '.': + raise ValueError('No host/domain value after referrer ' + 'designation in ACL: %s' % repr(raw_value)) + values.append('.r:%s%s' % ('-' if negate else '', second)) + else: + raise ValueError('Unknown designator %s in ACL: %s' % + (repr(first), repr(raw_value))) return ','.join(values) -def parse_acl(acl_string): +def format_acl_v1(groups=None, referrers=None, header_name=None): + """ + Returns a standard Swift ACL string for the given inputs. + + Caller is responsible for ensuring that :referrers: parameter is only given + if the ACL is being generated for X-Container-Read. (X-Container-Write + and the account ACL headers don't support referrers.) + + :param groups: a list of groups (and/or members in most auth systems) to + grant access + :param referrers: a list of referrer designations (without the leading .r:) + :param header_name: (optional) header name of the ACL we're preparing, for + clean_acl; if None, returned ACL won't be cleaned + :returns: a Swift ACL string for use in X-Container-{Read,Write}, + X-Account-Access-Control, etc. + """ + groups, referrers = groups or [], referrers or [] + referrers = ['.r:%s' % r for r in referrers] + result = ','.join(groups + referrers) + return (clean_acl(header_name, result) if header_name else result) + + +def format_acl_v2(acl_dict): + r""" + Returns a version-2 Swift ACL JSON string. + + HTTP headers for Version 2 ACLs have the following form: + Header-Name: {"arbitrary":"json","encoded":"string"} + + JSON will be forced ASCII (containing six-char \uNNNN sequences rather + than UTF-8; UTF-8 is valid JSON but clients vary in their support for + UTF-8 headers), and without extraneous whitespace. 
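# Illustrative example of a version-2 account ACL serialized as described
# above: compact, ASCII-only JSON with sorted keys (member names made up).
import json

acl_dict = {'admin': ['acct:adm'],
            'read-write': [],
            'read-only': ['acct:viewer']}
header_value = json.dumps(acl_dict, ensure_ascii=True,
                          separators=(',', ':'), sort_keys=True)
print(header_value)
# {"admin":["acct:adm"],"read-only":["acct:viewer"],"read-write":[]}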
+ + Advantages over V1: forward compatibility (new keys don't cause parsing + exceptions); Unicode support; no reserved words (you can have a user + named .rlistings if you want). + + :param acl_dict: dict of arbitrary data to put in the ACL; see specific + auth systems such as tempauth for supported values + :returns: a JSON string which encodes the ACL + """ + return json.dumps(acl_dict, ensure_ascii=True, separators=(',', ':'), + sort_keys=True) + + +def format_acl(version=1, **kwargs): + """ + Compatibility wrapper to help migrate ACL syntax from version 1 to 2. + Delegates to the appropriate version-specific format_acl method, defaulting + to version 1 for backward compatibility. + + :param kwargs: keyword args appropriate for the selected ACL syntax version + (see :func:`format_acl_v1` or :func:`format_acl_v2`) + """ + if version == 1: + return format_acl_v1( + groups=kwargs.get('groups'), referrers=kwargs.get('referrers'), + header_name=kwargs.get('header_name')) + elif version == 2: + return format_acl_v2(kwargs.get('acl_dict')) + raise ValueError("Invalid ACL version: %r" % version) + + +def parse_acl_v1(acl_string): """ Parses a standard Swift ACL string into a referrers list and groups list. @@ -135,10 +199,53 @@ def parse_acl(acl_string): if value.startswith('.r:'): referrers.append(value[len('.r:'):]) else: - groups.append(value) + groups.append(unquote(value)) return referrers, groups +def parse_acl_v2(data): + """ + Parses a version-2 Swift ACL string and returns a dict of ACL info. + + :param data: string containing the ACL data in JSON format + :returns: A dict (possibly empty) containing ACL info, e.g.: + {"groups": [...], "referrers": [...]} + :returns: None if data is None, is not valid JSON or does not parse + as a dict + :returns: empty dictionary if data is an empty string + """ + if data is None: + return None + if data == '': + return {} + try: + result = json.loads(data) + return (result if type(result) is dict else None) + except ValueError: + return None + + +def parse_acl(*args, **kwargs): + """ + Compatibility wrapper to help migrate ACL syntax from version 1 to 2. + Delegates to the appropriate version-specific parse_acl method, attempting + to determine the version from the types of args/kwargs. + + :param args: positional args for the selected ACL syntax version + :param kwargs: keyword args for the selected ACL syntax version + (see :func:`parse_acl_v1` or :func:`parse_acl_v2`) + :returns: the return value of :func:`parse_acl_v1` or :func:`parse_acl_v2` + """ + version = kwargs.pop('version', None) + if version in (1, None): + return parse_acl_v1(*args) + elif version == 2: + return parse_acl_v2(*args, **kwargs) + else: + raise ValueError('Unknown ACL version: parse_acl(%r, %r)' % + (args, kwargs)) + + def referrer_allowed(referrer, referrer_acl): """ Returns True if the referrer should be allowed based on the referrer_acl @@ -155,12 +262,39 @@ def referrer_allowed(referrer, referrer_acl): if referrer_acl: rhost = urlparse(referrer or '').hostname or 'unknown' for mhost in referrer_acl: - if mhost[0] == '-': + if mhost.startswith('-'): mhost = mhost[1:] - if mhost == rhost or (mhost[0] == '.' and + if mhost == rhost or (mhost.startswith('.') and rhost.endswith(mhost)): allow = False elif mhost == '*' or mhost == rhost or \ - (mhost[0] == '.' 
and rhost.endswith(mhost)): + (mhost.startswith('.') and rhost.endswith(mhost)): allow = True return allow + + +def acls_from_account_info(info): + """ + Extract the account ACLs from the given account_info, and return the ACLs. + + :param info: a dict of the form returned by get_account_info + :returns: None (no ACL system metadata is set), or a dict of the form:: + {'admin': [...], 'read-write': [...], 'read-only': [...]} + + :raises ValueError: on a syntactically invalid header + """ + acl = parse_acl( + version=2, data=info.get('sysmeta', {}).get('core-access-control')) + if acl is None: + return None + admin_members = acl.get('admin', []) + readwrite_members = acl.get('read-write', []) + readonly_members = acl.get('read-only', []) + if not any((admin_members, readwrite_members, readonly_members)): + return None + + return { + 'admin': admin_members, + 'read-write': readwrite_members, + 'read-only': readonly_members, + } diff --git a/swift/common/middleware/backend_ratelimit.py b/swift/common/middleware/backend_ratelimit.py new file mode 100644 index 0000000000..696d926ae8 --- /dev/null +++ b/swift/common/middleware/backend_ratelimit.py @@ -0,0 +1,230 @@ +# Copyright (c) 2022 NVIDIA +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import time + +from swift.common.request_helpers import split_and_validate_path +from swift.common.swob import Request, HTTPTooManyBackendRequests, \ + HTTPException +from swift.common.utils import get_logger, non_negative_float, \ + EventletRateLimiter, readconf + +RATE_LIMITED_METHODS = ('GET', 'HEAD', 'PUT', 'POST', 'DELETE', 'UPDATE', + 'REPLICATE') +BACKEND_RATELIMIT_CONFIG_SECTION = 'backend_ratelimit' +DEFAULT_BACKEND_RATELIMIT_CONF_FILE = 'backend-ratelimit.conf' +DEFAULT_CONFIG_RELOAD_INTERVAL = 60.0 +DEFAULT_REQUESTS_PER_DEVICE_PER_SECOND = 0.0 +DEFAULT_REQUESTS_PER_DEVICE_RATE_BUFFER = 1.0 + + +class BackendRateLimitMiddleware(object): + """ + Backend rate-limiting middleware. + + Rate-limits requests to backend storage node devices. Each (device, request + method) combination is independently rate-limited. All requests with a + 'GET', 'HEAD', 'PUT', 'POST', 'DELETE', 'UPDATE' or 'REPLICATE' method are + rate limited on a per-device basis by both a method-specific rate and an + overall device rate limit. + + If a request would cause the rate-limit to be exceeded for the method + and/or device then a response with a 529 status code is returned. + """ + def __init__(self, app, filter_conf, logger=None): + self.app = app + self.filter_conf = filter_conf + self.logger = logger or get_logger(self.filter_conf, + log_route='backend_ratelimit') + self.requests_per_device_rate_buffer = \ + DEFAULT_REQUESTS_PER_DEVICE_RATE_BUFFER + # map (device, method) -> rate + self.requests_per_device_per_second = {} + # map (device, method) -> RateLimiter, populated on-demand + self.rate_limiters = {} + + # some config options are *only* read from filter conf at startup... 
+ default_conf_path = os.path.join( + self.filter_conf.get('swift_dir', '/etc/swift'), + DEFAULT_BACKEND_RATELIMIT_CONF_FILE) + try: + self.conf_path = self.filter_conf['backend_ratelimit_conf_path'] + self.is_config_file_expected = True + except KeyError: + self.conf_path = default_conf_path + self.is_config_file_expected = False + self.config_reload_interval = non_negative_float( + filter_conf.get('config_reload_interval', + DEFAULT_CONFIG_RELOAD_INTERVAL)) + + # other conf options are read from filter section at startup but may + # also be overridden by options in a separate config file... + self._last_config_reload_attempt = time.time() + self._apply_config(self.filter_conf) + self._load_config_file() + + def _refresh_ratelimiters(self): + # note: if we ever wanted to prune the ratelimiters (in case devices + # have been removed) we could inspect each ratelimiter's running_time + # and remove those with very old running_time + for (dev, method), rl in self.rate_limiters.items(): + rl.set_max_rate(self.requests_per_device_per_second[method]) + rl.set_rate_buffer(self.requests_per_device_rate_buffer) + + def _apply_config(self, conf): + modified = False + reqs_per_device_rate_buffer = non_negative_float( + conf.get('requests_per_device_rate_buffer', + DEFAULT_REQUESTS_PER_DEVICE_RATE_BUFFER)) + + # note: 'None' key holds the aggregate per-device limit for all methods + reqs_per_device_per_second = {None: non_negative_float( + conf.get('requests_per_device_per_second', 0.0))} + for method in RATE_LIMITED_METHODS: + val = non_negative_float( + conf.get('%s_requests_per_device_per_second' + % method.lower(), 0.0)) + reqs_per_device_per_second[method] = val + + if reqs_per_device_rate_buffer != self.requests_per_device_rate_buffer: + self.requests_per_device_rate_buffer = reqs_per_device_rate_buffer + modified = True + if reqs_per_device_per_second != self.requests_per_device_per_second: + self.requests_per_device_per_second = reqs_per_device_per_second + self.is_any_rate_limit_configured = any( + self.requests_per_device_per_second.values()) + modified = True + if modified: + self._refresh_ratelimiters() + return modified + + def _load_config_file(self): + # If conf file can be read then apply its options to the filter conf + # options, discarding *all* options previously loaded from the conf + # file i.e. options deleted from the conf file will revert to the + # filter conf value or default value. If the conf file cannot be read + # or is invalid, then the current config is left unchanged. 
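# Illustrative sketch of the reload semantics described in the comment above,
# with made-up options: each reload starts from a copy of the original filter
# conf and layers the current conf-file contents on top, so an option deleted
# from the conf file reverts to its filter-conf (or default) value.
filter_conf = {'requests_per_device_per_second': '50'}

def sketch_reload(options_from_file):
    new_conf = dict(filter_conf)      # never mutate the startup filter conf
    new_conf.update(options_from_file)
    return new_conf

print(sketch_reload({'requests_per_device_per_second': '25'}))  # file wins
print(sketch_reload({}))   # option removed from file -> back to '50'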
+ try: + new_conf = dict(self.filter_conf) # filter_conf not current conf + new_conf.update( + readconf(self.conf_path, BACKEND_RATELIMIT_CONFIG_SECTION)) + modified = self._apply_config(new_conf) + if modified: + self.logger.info('Loaded config file %s, config changed', + self.conf_path) + elif not self.is_config_file_expected: + self.logger.info('Loaded new config file %s, config unchanged', + self.conf_path) + else: + self.logger.debug( + 'Loaded existing config file %s, config unchanged', + self.conf_path) + self.is_config_file_expected = True + except IOError as err: + if self.is_config_file_expected: + self.logger.warning( + 'Failed to load config file, config unchanged: %s', err) + self.is_config_file_expected = False + except ValueError as err: + # ...but if it exists it should be valid + self.logger.warning('Invalid config file %s, config unchanged: %s', + self.conf_path, err) + + def _maybe_reload_config(self): + if self.config_reload_interval: + now = time.time() + if (now - self._last_config_reload_attempt + >= self.config_reload_interval): + try: + self._load_config_file() + except Exception: # noqa + self.logger.exception('Error reloading config file') + finally: + # always reset last loaded time to avoid re-try storm + self._last_config_reload_attempt = now + + def _get_ratelimiter(self, device, method=None): + """ + Get a rate limiter for the (device, method) combination. If a rate + limiter does not yet exist for the given (device, method) combination + then it is created and added to the map of rate limiters. + + :param: the device. + :method: the request method; if None then the aggregate rate limiter + for all requests to the device is returned. + :returns: an instance of ``EventletRateLimiter``. + """ + try: + rl = self.rate_limiters[(device, method)] + except KeyError: + rl = EventletRateLimiter( + max_rate=self.requests_per_device_per_second[method], + rate_buffer=self.requests_per_device_rate_buffer, + running_time=time.time(), + burst_after_idle=True) + self.rate_limiters[(device, method)] = rl + return rl + + def _is_allowed(self, device, method): + """ + Evaluate backend rate-limiting policies for the incoming request. + + A request is allowed when neither the per-(device, method) rate-limit + nor the per-device rate-limit has been reached. + + Note: a request will be disallowed if the aggregate per-device + rate-limit has been reached, even if the per-(device, method) + rate-limit has not been reached for the request's method. + + :param: the device. + :method: the request method. + :returns: boolean, is_allowed. + """ + return (self._get_ratelimiter(device, None).is_allowed() + and self._get_ratelimiter(device, method).is_allowed()) + + def __call__(self, env, start_response): + """ + WSGI entry point. + + :param env: WSGI environment dictionary + :param start_response: WSGI callable + """ + self._maybe_reload_config() + req = Request(env) + handler = self.app + if (self.is_any_rate_limit_configured + and req.method in RATE_LIMITED_METHODS): + try: + device, partition, _ = split_and_validate_path(req, 1, 3, True) + int(partition) # check it's a valid partition + except (ValueError, HTTPException): + # request may not have device/partition e.g. 
a healthcheck req + pass + else: + if not self._is_allowed(device, req.method): + self.logger.increment('backend.ratelimit') + handler = HTTPTooManyBackendRequests() + return handler(env, start_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def backend_ratelimit_filter(app): + return BackendRateLimitMiddleware(app, conf) + + return backend_ratelimit_filter diff --git a/swift/common/middleware/bulk.py b/swift/common/middleware/bulk.py new file mode 100644 index 0000000000..cf26c7ef5a --- /dev/null +++ b/swift/common/middleware/bulk.py @@ -0,0 +1,728 @@ +# Copyright (c) 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Middleware that will perform many operations on a single request. + +--------------- +Extract Archive +--------------- + +Expand tar files into a Swift account. Request must be a PUT with the +query parameter ``?extract-archive=format`` specifying the format of archive +file. Accepted formats are tar, tar.gz, and tar.bz2. + +For a PUT to the following url:: + + /v1/AUTH_Account/$UPLOAD_PATH?extract-archive=tar.gz + +UPLOAD_PATH is where the files will be expanded to. UPLOAD_PATH can be a +container, a pseudo-directory within a container, or an empty string. The +destination of a file in the archive will be built as follows:: + + /v1/AUTH_Account/$UPLOAD_PATH/$FILE_PATH + +Where FILE_PATH is the file name from the listing in the tar file. + +If the UPLOAD_PATH is an empty string, containers will be auto created +accordingly and files in the tar that would not map to any container (files +in the base directory) will be ignored. + +Only regular files will be uploaded. Empty directories, symlinks, etc will +not be uploaded. + +------------ +Content Type +------------ + +If the content-type header is set in the extract-archive call, Swift will +assign that content-type to all the underlying files. The bulk middleware +will extract the archive file and send the internal files using PUT +operations using the same headers from the original request +(e.g. auth-tokens, content-Type, etc.). Notice that any middleware call +that follows the bulk middleware does not know if this was a bulk request +or if these were individual requests sent by the user. + +In order to make Swift detect the content-type for the files based on the +file extension, the content-type in the extract-archive call should not be +set. Alternatively, it is possible to explicitly tell Swift to detect the +content type using this header:: + + X-Detect-Content-Type: true + +For example:: + + curl -X PUT http://127.0.0.1/v1/AUTH_acc/cont/$?extract-archive=tar + -T backup.tar + -H "Content-Type: application/x-tar" + -H "X-Auth-Token: xxx" + -H "X-Detect-Content-Type: true" + +------------------ +Assigning Metadata +------------------ + +The tar file format (1) allows for UTF-8 key/value pairs to be associated +with each file in an archive. If a file has extended attributes, then tar +will store those as key/value pairs. 
The bulk middleware can read those +extended attributes and convert them to Swift object metadata. Attributes +starting with "user.meta" are converted to object metadata, and +"user.mime_type" is converted to Content-Type. + +For example:: + + setfattr -n user.mime_type -v "application/python-setup" setup.py + setfattr -n user.meta.lunch -v "burger and fries" setup.py + setfattr -n user.meta.dinner -v "baked ziti" setup.py + setfattr -n user.stuff -v "whee" setup.py + +Will get translated to headers:: + + Content-Type: application/python-setup + X-Object-Meta-Lunch: burger and fries + X-Object-Meta-Dinner: baked ziti + +The bulk middleware will handle xattrs stored by both GNU and BSD tar (2). +Only xattrs ``user.mime_type`` and ``user.meta.*`` are processed. Other +attributes are ignored. + +In addition to the extended attributes, the object metadata and the +x-delete-at/x-delete-after headers set in the request are also assigned to the +extracted objects. + +Notes: + +(1) The POSIX 1003.1-2001 (pax) format. The default format on GNU tar +1.27.1 or later. + +(2) Even with pax-format tarballs, different encoders store xattrs slightly +differently; for example, GNU tar stores the xattr "user.userattribute" as +pax header "SCHILY.xattr.user.userattribute", while BSD tar (which uses +libarchive) stores it as "LIBARCHIVE.xattr.user.userattribute". + +-------- +Response +-------- + +The response from bulk operations functions differently from other Swift +responses. This is because a short request body sent from the client could +result in many operations on the proxy server and precautions need to be +made to prevent the request from timing out due to lack of activity. To +this end, the client will always receive a 200 OK response, regardless of +the actual success of the call. The body of the response must be parsed to +determine the actual success of the operation. In addition to this the +client may receive zero or more whitespace characters prepended to the +actual response body while the proxy server is completing the request. + +The format of the response body defaults to text/plain but can be either +json or xml depending on the ``Accept`` header. Acceptable formats are +``text/plain``, ``application/json``, ``application/xml``, and ``text/xml``. +An example body is as follows:: + + {"Response Status": "201 Created", + "Response Body": "", + "Errors": [], + "Number Files Created": 10} + +If all valid files were uploaded successfully the Response Status will be +201 Created. If any files failed to be created the response code +corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on +server errors), etc. In both cases the response body will specify the +number of files successfully uploaded and a list of the files that failed. + +There are proxy logs created for each file (which becomes a subrequest) in +the tar. The subrequest's proxy log will have a swift.source set to "EA" +the log's content length will reflect the unzipped size of the file. If +double proxy-logging is used the leftmost logger will not have a +swift.source set and the content length will reflect the size of the +payload sent to the proxy (the unexpanded size of the tar.gz). + +----------- +Bulk Delete +----------- + +Will delete multiple objects or containers from their account with a +single request. Responds to POST requests with query parameter +``?bulk-delete`` set. The request url is your storage url. The Content-Type +should be set to ``text/plain``. 
The body of the POST request will be a +newline separated list of url encoded objects to delete. You can delete +10,000 (configurable) objects per request. The objects specified in the +POST request body must be URL encoded and in the form:: + + /container_name/obj_name + +or for a container (which must be empty at time of delete):: + + /container_name + +The response is similar to extract archive as in every response will be a +200 OK and you must parse the response body for actual results. An example +response is:: + + {"Number Not Found": 0, + "Response Status": "200 OK", + "Response Body": "", + "Errors": [], + "Number Deleted": 6} + +If all items were successfully deleted (or did not exist), the Response +Status will be 200 OK. If any failed to delete, the response code +corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on +server errors), etc. In all cases the response body will specify the number +of items successfully deleted, not found, and a list of those that failed. +The return body will be formatted in the way specified in the request's +``Accept`` header. Acceptable formats are ``text/plain``, ``application/json``, +``application/xml``, and ``text/xml``. + +There are proxy logs created for each object or container (which becomes a +subrequest) that is deleted. The subrequest's proxy log will have a +swift.source set to "BD" the log's content length of 0. If double +proxy-logging is used the leftmost logger will not have a +swift.source set and the content length will reflect the size of the +payload sent to the proxy (the list of objects/containers to be deleted). +""" + +from swift.common.request_helpers import get_heartbeat_response_body +import tarfile +from time import time +from eventlet import sleep +import zlib +from swift.common.swob import Request, HTTPBadGateway, \ + HTTPCreated, HTTPBadRequest, HTTPNotFound, HTTPUnauthorized, HTTPOk, \ + HTTPPreconditionFailed, HTTPRequestEntityTooLarge, HTTPNotAcceptable, \ + HTTPLengthRequired, HTTPException, HTTPServerError, wsgify, \ + bytes_to_wsgi, str_to_wsgi, wsgi_unquote, wsgi_quote, wsgi_to_str +from swift.common.utils import get_logger, StreamingPile +from swift.common.registry import register_swift_info +from swift.common import constraints +from swift.common.http import HTTP_UNAUTHORIZED, HTTP_NOT_FOUND, HTTP_CONFLICT +from swift.common.request_helpers import is_user_meta +from swift.common.wsgi import make_subrequest + + +class CreateContainerError(Exception): + def __init__(self, msg, status_int, status): + self.status_int = status_int + self.status = status + super(CreateContainerError, self).__init__(msg) + + +ACCEPTABLE_FORMATS = ['text/plain', 'application/json', 'application/xml', + 'text/xml'] + + +def pax_key_to_swift_header(pax_key): + if (pax_key == u"SCHILY.xattr.user.mime_type" or + pax_key == u"LIBARCHIVE.xattr.user.mime_type"): + return "Content-Type" + elif pax_key.startswith(u"SCHILY.xattr.user.meta."): + useful_part = pax_key[len(u"SCHILY.xattr.user.meta."):] + return str_to_wsgi("X-Object-Meta-" + useful_part) + elif pax_key.startswith(u"LIBARCHIVE.xattr.user.meta."): + useful_part = pax_key[len(u"LIBARCHIVE.xattr.user.meta."):] + return str_to_wsgi("X-Object-Meta-" + useful_part) + else: + # You can get things like atime/mtime/ctime or filesystem ACLs in + # pax headers; those aren't really user metadata. The same goes for + # other, non-user metadata. 
+ return None + + +class Bulk(object): + + def __init__(self, app, conf, max_containers_per_extraction=10000, + max_failed_extractions=1000, max_deletes_per_request=10000, + max_failed_deletes=1000, yield_frequency=10, + delete_concurrency=2, retry_count=0, retry_interval=1.5, + logger=None): + self.app = app + self.logger = logger or get_logger(conf, log_route='bulk') + self.max_containers = max_containers_per_extraction + self.max_failed_extractions = max_failed_extractions + self.max_failed_deletes = max_failed_deletes + self.max_deletes_per_request = max_deletes_per_request + self.yield_frequency = yield_frequency + self.delete_concurrency = min(1000, max(1, delete_concurrency)) + self.retry_count = retry_count + self.retry_interval = retry_interval + self.max_path_length = constraints.MAX_OBJECT_NAME_LENGTH \ + + constraints.MAX_CONTAINER_NAME_LENGTH + 2 + + def create_container(self, req, container_path): + """ + Checks if the container exists and if not try to create it. + :params container_path: an unquoted path to a container to be created + :returns: True if created container, False if container exists + :raises CreateContainerError: when unable to create container + """ + head_cont_req = make_subrequest( + req.environ, method='HEAD', path=wsgi_quote(container_path), + headers={'X-Auth-Token': req.headers.get('X-Auth-Token')}, + swift_source='EA') + resp = head_cont_req.get_response(self.app) + if resp.is_success: + return False + if resp.status_int == HTTP_NOT_FOUND: + create_cont_req = make_subrequest( + req.environ, method='PUT', path=wsgi_quote(container_path), + headers={'X-Auth-Token': req.headers.get('X-Auth-Token')}, + swift_source='EA') + resp = create_cont_req.get_response(self.app) + if resp.is_success: + return True + raise CreateContainerError( + "Create Container Failed: " + container_path, + resp.status_int, resp.status) + + def get_objs_to_delete(self, req): + """ + Will populate objs_to_delete with data from request input. + :params req: a Swob request + :returns: a list of the contents of req.body when separated by newline. + :raises HTTPException: on failures + """ + line = b'' + data_remaining = True + objs_to_delete = [] + if req.content_length is None and \ + req.headers.get('transfer-encoding', '').lower() != 'chunked': + raise HTTPLengthRequired(request=req) + + while data_remaining: + if b'\n' in line: + obj_to_delete, line = line.split(b'\n', 1) + # yeah, all this chaining is pretty terrible... 
+ # but it gets even worse trying to use UTF-8 and + # errors='surrogateescape' when dealing with terrible + # input like b'\xe2%98\x83' + obj_to_delete = wsgi_to_str(wsgi_unquote( + bytes_to_wsgi(obj_to_delete.strip()))) + objs_to_delete.append({'name': obj_to_delete}) + else: + data = req.body_file.read(self.max_path_length) + if data: + line += data + else: + data_remaining = False + obj_to_delete = wsgi_to_str(wsgi_unquote( + bytes_to_wsgi(line.strip()))) + if obj_to_delete: + objs_to_delete.append({'name': obj_to_delete}) + if len(objs_to_delete) > self.max_deletes_per_request: + raise HTTPRequestEntityTooLarge( + 'Maximum Bulk Deletes: %d per request' % + self.max_deletes_per_request) + if len(line) > self.max_path_length * 2: + raise HTTPBadRequest('Invalid File Name') + return objs_to_delete + + def handle_delete_iter(self, req, objs_to_delete=None, + user_agent='BulkDelete', swift_source='BD', + out_content_type='text/plain'): + """ + A generator that can be assigned to a swob Response's app_iter which, + when iterated over, will delete the objects specified in request body. + Will occasionally yield whitespace while request is being processed. + When the request is completed will yield a response body that can be + parsed to determine success. See above documentation for details. + + :params req: a swob Request + :params objs_to_delete: a list of dictionaries that specifies the + (native string) objects to be deleted. If None, uses + self.get_objs_to_delete to query request. + """ + last_yield = time() + if out_content_type and out_content_type.endswith('/xml'): + to_yield = b'\n' + else: + to_yield = b' ' + separator = b'' + failed_files = [] + resp_dict = {'Response Status': HTTPOk().status, + 'Response Body': '', + 'Number Deleted': 0, + 'Number Not Found': 0} + req.environ['eventlet.minimum_write_chunk_size'] = 0 + try: + if not out_content_type: + raise HTTPNotAcceptable(request=req) + + try: + vrs, account, _junk = req.split_path(2, 3, True) + except ValueError: + raise HTTPNotFound(request=req) + vrs = wsgi_to_str(vrs) + account = wsgi_to_str(account) + + incoming_format = req.headers.get('Content-Type') + if incoming_format and \ + not incoming_format.startswith('text/plain'): + # For now only accept newline separated object names + raise HTTPNotAcceptable(request=req) + + if objs_to_delete is None: + objs_to_delete = self.get_objs_to_delete(req) + failed_file_response = {'type': HTTPBadRequest} + + def delete_filter(predicate, objs_to_delete): + for obj_to_delete in objs_to_delete: + obj_name = obj_to_delete['name'] + if not obj_name: + continue + if not predicate(obj_name): + continue + if obj_to_delete.get('error'): + if obj_to_delete['error']['code'] == HTTP_NOT_FOUND: + resp_dict['Number Not Found'] += 1 + else: + failed_files.append([ + wsgi_quote(str_to_wsgi(obj_name)), + obj_to_delete['error']['message']]) + continue + delete_path = '/'.join(['', vrs, account, + obj_name.lstrip('/')]) + if not constraints.check_utf8(delete_path): + failed_files.append([wsgi_quote(str_to_wsgi(obj_name)), + HTTPPreconditionFailed().status]) + continue + yield (obj_name, delete_path, + obj_to_delete.get('version_id')) + + def objs_then_containers(objs_to_delete): + # process all objects first + yield delete_filter(lambda name: '/' in name.strip('/'), + objs_to_delete) + # followed by containers + yield delete_filter(lambda name: '/' not in name.strip('/'), + objs_to_delete) + + def do_delete(obj_name, delete_path, version_id): + delete_obj_req = make_subrequest( + req.environ, 
method='DELETE', + path=wsgi_quote(str_to_wsgi(delete_path)), + headers={'X-Auth-Token': req.headers.get('X-Auth-Token')}, + body='', agent='%(orig)s ' + user_agent, + swift_source=swift_source) + if version_id is None: + delete_obj_req.params = {} + else: + delete_obj_req.params = {'version-id': version_id} + return (delete_obj_req.get_response(self.app), obj_name, 0) + + with StreamingPile(self.delete_concurrency) as pile: + for names_to_delete in objs_then_containers(objs_to_delete): + for resp, obj_name, retry in pile.asyncstarmap( + do_delete, names_to_delete): + if last_yield + self.yield_frequency < time(): + last_yield = time() + yield to_yield + to_yield, separator = b' ', b'\r\n\r\n' + self._process_delete(resp, pile, obj_name, + resp_dict, failed_files, + failed_file_response, retry) + if len(failed_files) >= self.max_failed_deletes: + # Abort, but drain off the in-progress deletes + for resp, obj_name, retry in pile: + if last_yield + self.yield_frequency < time(): + last_yield = time() + yield to_yield + to_yield, separator = b' ', b'\r\n\r\n' + # Don't pass in the pile, as we shouldn't retry + self._process_delete( + resp, None, obj_name, resp_dict, + failed_files, failed_file_response, retry) + msg = 'Max delete failures exceeded' + raise HTTPBadRequest(msg) + + if failed_files: + resp_dict['Response Status'] = \ + failed_file_response['type']().status + elif not (resp_dict['Number Deleted'] or + resp_dict['Number Not Found']): + resp_dict['Response Status'] = HTTPBadRequest().status + resp_dict['Response Body'] = 'Invalid bulk delete.' + + except HTTPException as err: + resp_dict['Response Status'] = err.status + resp_dict['Response Body'] = err.body.decode('utf-8') + except Exception: + self.logger.exception('Error in bulk delete.') + resp_dict['Response Status'] = HTTPServerError().status + + yield separator + get_heartbeat_response_body(out_content_type, + resp_dict, failed_files, + 'delete') + + def handle_extract_iter(self, req, compress_type, + out_content_type='text/plain'): + """ + A generator that can be assigned to a swob Response's app_iter which, + when iterated over, will extract and PUT the objects pulled from the + request body. Will occasionally yield whitespace while request is being + processed. When the request is completed will yield a response body + that can be parsed to determine success. See above documentation for + details. + + :params req: a swob Request + :params compress_type: specifying the compression type of the tar. 
+ Accepts '', 'gz', or 'bz2' + """ + resp_dict = {'Response Status': HTTPCreated().status, + 'Response Body': '', 'Number Files Created': 0} + failed_files = [] + last_yield = time() + if out_content_type and out_content_type.endswith('/xml'): + to_yield = b'\n' + else: + to_yield = b' ' + separator = b'' + containers_accessed = set() + req.environ['eventlet.minimum_write_chunk_size'] = 0 + try: + if not out_content_type: + raise HTTPNotAcceptable(request=req) + + if req.content_length is None and \ + req.headers.get('transfer-encoding', + '').lower() != 'chunked': + raise HTTPLengthRequired(request=req) + try: + vrs, account, extract_base = req.split_path(2, 3, True) + except ValueError: + raise HTTPNotFound(request=req) + extract_base = extract_base or '' + extract_base = extract_base.rstrip('/') + tar = tarfile.open(mode='r|' + compress_type, + fileobj=req.body_file) + failed_response_type = HTTPBadRequest + containers_created = 0 + while True: + if last_yield + self.yield_frequency < time(): + last_yield = time() + yield to_yield + to_yield, separator = b' ', b'\r\n\r\n' + tar_info = tar.next() + if tar_info is None or \ + len(failed_files) >= self.max_failed_extractions: + break + if tar_info.isfile(): + obj_path = tar_info.name.encode('utf-8', 'surrogateescape') + obj_path = bytes_to_wsgi(obj_path) + if obj_path.startswith('./'): + obj_path = obj_path[2:] + obj_path = obj_path.lstrip('/') + if extract_base: + obj_path = extract_base + '/' + obj_path + if '/' not in obj_path: + continue # ignore base level file + + destination = '/'.join( + ['', vrs, account, obj_path]) + container = obj_path.split('/', 1)[0] + if not constraints.check_utf8(wsgi_to_str(destination)): + failed_files.append( + [wsgi_quote(obj_path[:self.max_path_length]), + HTTPPreconditionFailed().status]) + continue + if tar_info.size > constraints.MAX_FILE_SIZE: + failed_files.append([ + wsgi_quote(obj_path[:self.max_path_length]), + HTTPRequestEntityTooLarge().status]) + continue + container_failure = None + if container not in containers_accessed: + cont_path = '/'.join(['', vrs, account, container]) + try: + if self.create_container(req, cont_path): + containers_created += 1 + if containers_created > self.max_containers: + raise HTTPBadRequest( + 'More than %d containers to create ' + 'from tar.' 
% self.max_containers) + except CreateContainerError as err: + # the object PUT to this container still may + # succeed if acls are set + container_failure = [ + wsgi_quote(cont_path[:self.max_path_length]), + err.status] + if err.status_int == HTTP_UNAUTHORIZED: + raise HTTPUnauthorized(request=req) + except ValueError: + failed_files.append([ + wsgi_quote(obj_path[:self.max_path_length]), + HTTPBadRequest().status]) + continue + + tar_file = tar.extractfile(tar_info) + create_headers = { + 'Content-Length': tar_info.size, + 'X-Auth-Token': req.headers.get('X-Auth-Token'), + } + + # Copy some whitelisted headers to the subrequest + for k, v in req.headers.items(): + if ((k.lower() in ('x-delete-at', 'x-delete-after')) + or is_user_meta('object', k)): + create_headers[k] = v + + create_obj_req = make_subrequest( + req.environ, method='PUT', + path=wsgi_quote(destination), + headers=create_headers, + agent='%(orig)s BulkExpand', swift_source='EA') + create_obj_req.environ['wsgi.input'] = tar_file + + for pax_key, pax_value in tar_info.pax_headers.items(): + header_name = pax_key_to_swift_header(pax_key) + if header_name: + # Both pax_key and pax_value are unicode + # strings; the key is already UTF-8 encoded, but + # we still have to encode the value. + create_obj_req.headers[header_name] = \ + pax_value.encode("utf-8") + + resp = create_obj_req.get_response(self.app) + containers_accessed.add(container) + if resp.is_success: + resp_dict['Number Files Created'] += 1 + else: + if container_failure: + failed_files.append(container_failure) + if resp.status_int == HTTP_UNAUTHORIZED: + failed_files.append([ + wsgi_quote(obj_path[:self.max_path_length]), + HTTPUnauthorized().status]) + raise HTTPUnauthorized(request=req) + if resp.status_int // 100 == 5: + failed_response_type = HTTPBadGateway + failed_files.append([ + wsgi_quote(obj_path[:self.max_path_length]), + resp.status]) + + if failed_files: + resp_dict['Response Status'] = failed_response_type().status + elif not resp_dict['Number Files Created']: + resp_dict['Response Status'] = HTTPBadRequest().status + resp_dict['Response Body'] = 'Invalid Tar File: No Valid Files' + + except HTTPException as err: + resp_dict['Response Status'] = err.status + resp_dict['Response Body'] = err.body.decode('utf-8') + except (tarfile.TarError, zlib.error) as tar_error: + resp_dict['Response Status'] = HTTPBadRequest().status + resp_dict['Response Body'] = 'Invalid Tar File: %s' % tar_error + except Exception: + self.logger.exception('Error in extract archive.') + resp_dict['Response Status'] = HTTPServerError().status + + yield separator + get_heartbeat_response_body( + out_content_type, resp_dict, failed_files, 'extract') + + def _process_delete(self, resp, pile, obj_name, resp_dict, + failed_files, failed_file_response, retry=0): + if resp.status_int // 100 == 2: + resp_dict['Number Deleted'] += 1 + elif resp.status_int == HTTP_NOT_FOUND: + resp_dict['Number Not Found'] += 1 + elif resp.status_int == HTTP_UNAUTHORIZED: + failed_files.append([wsgi_quote(str_to_wsgi(obj_name)), + HTTPUnauthorized().status]) + elif resp.status_int == HTTP_CONFLICT and pile and \ + self.retry_count > 0 and self.retry_count > retry: + retry += 1 + sleep(self.retry_interval ** retry) + delete_obj_req = Request.blank(resp.environ['PATH_INFO'], + resp.environ) + + def _retry(req, app, obj_name, retry): + return req.get_response(app), obj_name, retry + pile.spawn(_retry, delete_obj_req, self.app, obj_name, retry) + else: + if resp.status_int // 100 == 5: + 
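+                # a server error on any subrequest downgrades the overall
+                # bulk response status to 502 Bad Gateway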
failed_file_response['type'] = HTTPBadGateway + failed_files.append([wsgi_quote(str_to_wsgi(obj_name)), + resp.status]) + + @wsgify + def __call__(self, req): + extract_type = req.params.get('extract-archive') + resp = None + if extract_type is not None and req.method == 'PUT': + archive_type = { + 'tar': '', 'tar.gz': 'gz', + 'tar.bz2': 'bz2'}.get(extract_type.lower().strip('.')) + if archive_type is not None: + resp = HTTPOk(request=req) + try: + out_content_type = req.accept.best_match( + ACCEPTABLE_FORMATS) + except ValueError: + out_content_type = None # Ignore invalid header + if out_content_type: + resp.content_type = out_content_type + resp.app_iter = self.handle_extract_iter( + req, archive_type, out_content_type=out_content_type) + else: + resp = HTTPBadRequest("Unsupported archive format") + if 'bulk-delete' in req.params and req.method in ['POST', 'DELETE']: + resp = HTTPOk(request=req) + try: + out_content_type = req.accept.best_match(ACCEPTABLE_FORMATS) + except ValueError: + out_content_type = None # Ignore invalid header + if out_content_type: + resp.content_type = out_content_type + resp.app_iter = self.handle_delete_iter( + req, out_content_type=out_content_type) + + return resp or self.app + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + max_containers_per_extraction = \ + int(conf.get('max_containers_per_extraction', 10000)) + max_failed_extractions = int(conf.get('max_failed_extractions', 1000)) + max_deletes_per_request = int(conf.get('max_deletes_per_request', 10000)) + max_failed_deletes = int(conf.get('max_failed_deletes', 1000)) + yield_frequency = int(conf.get('yield_frequency', 10)) + delete_concurrency = min(1000, max(1, int( + conf.get('delete_concurrency', 2)))) + retry_count = int(conf.get('delete_container_retry_count', 0)) + retry_interval = 1.5 + + register_swift_info( + 'bulk_upload', + max_containers_per_extraction=max_containers_per_extraction, + max_failed_extractions=max_failed_extractions) + register_swift_info( + 'bulk_delete', + max_deletes_per_request=max_deletes_per_request, + max_failed_deletes=max_failed_deletes) + + def bulk_filter(app): + return Bulk( + app, conf, + max_containers_per_extraction=max_containers_per_extraction, + max_failed_extractions=max_failed_extractions, + max_deletes_per_request=max_deletes_per_request, + max_failed_deletes=max_failed_deletes, + yield_frequency=yield_frequency, + delete_concurrency=delete_concurrency, + retry_count=retry_count, + retry_interval=retry_interval) + return bulk_filter diff --git a/swift/common/middleware/catch_errors.py b/swift/common/middleware/catch_errors.py index 4053825376..0b3d37626b 100644 --- a/swift/common/middleware/catch_errors.py +++ b/swift/common/middleware/catch_errors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,39 +13,124 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from eventlet import Timeout -import uuid - from swift.common.swob import Request, HTTPServerError -from swift.common.utils import get_logger +from swift.common.utils import get_logger, generate_trans_id, close_if_possible from swift.common.wsgi import WSGIContext +class BadResponseLength(Exception): + pass + + +class ByteEnforcer(object): + """ + Enforces that inner_iter yields exactly bytes before + exhaustion. + + If inner_iter fails to do so, BadResponseLength is raised. + + :param inner_iter: iterable of bytestrings + :param nbytes: number of bytes expected + + N.B. since we require the nbytes param and require the inner_iter to yield + exactly that many bytes we can support the __len__ interface for anyone + happens to expect non chunked resp iterables to support that + (e.g. eventlet's wsgi.server). + """ + + def __init__(self, inner_iter, nbytes): + self.inner_iter = inner_iter + self.nbytes = nbytes + + def __len__(self): + return self.nbytes + + def __iter__(self): + try: + bytes_left = self.nbytes + for chunk in self.inner_iter: + if bytes_left >= len(chunk): + yield chunk + bytes_left -= len(chunk) + else: + yield chunk[:bytes_left] + raise BadResponseLength( + "Too many bytes; truncating after %d bytes " + "with at least %d surplus bytes remaining" % ( + self.nbytes, len(chunk) - bytes_left)) + + if bytes_left: + raise BadResponseLength('Expected another %d bytes' % ( + bytes_left,)) + finally: + close_if_possible(self.inner_iter) + + class CatchErrorsContext(WSGIContext): - def __init__(self, app, logger): + def __init__(self, app, logger, trans_id_suffix=''): super(CatchErrorsContext, self).__init__(app) self.logger = logger + self.trans_id_suffix = trans_id_suffix def handle_request(self, env, start_response): - trans_id = 'tx' + uuid.uuid4().hex + trans_id_suffix = self.trans_id_suffix + trans_id_extra = env.get('HTTP_X_TRANS_ID_EXTRA') + if trans_id_extra: + trans_id_suffix += '-' + trans_id_extra[:32] + + trans_id = generate_trans_id(trans_id_suffix) env['swift.trans_id'] = trans_id self.logger.txn_id = trans_id try: # catch any errors in the pipeline resp = self._app_call(env) - except (Exception, Timeout), err: - self.logger.exception(_('Error: %s'), err) + except: # noqa + self.logger.exception('Error: An error occurred') resp = HTTPServerError(request=Request(env), - body='An error occurred', + body=b'An error occurred', content_type='text/plain') - resp.headers['x-trans-id'] = trans_id + resp.headers['X-Trans-Id'] = trans_id + resp.headers['X-Openstack-Request-Id'] = trans_id return resp(env, start_response) + # If the app specified a Content-Length, enforce that it sends that + # many bytes. + # + # If an app gives too few bytes, then the client will wait for the + # remainder before sending another HTTP request on the same socket; + # since no more bytes are coming, this will result in either an + # infinite wait or a timeout. In this case, we want to raise an + # exception to signal to the WSGI server that it should close the + # TCP connection. + # + # If an app gives too many bytes, then we can deadlock with the + # client; if the client reads its N bytes and then sends a large-ish + # request (enough to fill TCP buffers), it'll block until we read + # some of the request. However, we won't read the request since + # we'll be trying to shove the rest of our oversized response out + # the socket. In that case, we truncate the response body at N bytes + # and raise an exception to stop any more bytes from being + # generated and also to kill the TCP connection. 
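+        # HEAD responses never carry a body, so enforce zero response bytes
+        # regardless of any Content-Length header the app reported.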
+ if env['REQUEST_METHOD'] == 'HEAD': + resp = ByteEnforcer(resp, 0) + + elif self._response_headers: + content_lengths = [val for header, val in self._response_headers + if header.lower() == "content-length"] + if len(content_lengths) == 1: + try: + content_length = int(content_lengths[0]) + except ValueError: + pass + else: + resp = ByteEnforcer(resp, content_length) + # make sure the response has the trans_id if self._response_headers is None: self._response_headers = [] - self._response_headers.append(('x-trans-id', trans_id)) + self._response_headers.append(('X-Trans-Id', trans_id)) + self._response_headers.append(('X-Openstack-Request-Id', trans_id)) start_response(self._response_status, self._response_headers, self._response_exc_info) return resp @@ -60,12 +145,15 @@ class CatchErrorMiddleware(object): def __init__(self, app, conf): self.app = app self.logger = get_logger(conf, log_route='catch-errors') + self.trans_id_suffix = conf.get('trans_id_suffix', '') def __call__(self, env, start_response): """ If used, this should be the first middleware in pipeline. """ - context = CatchErrorsContext(self.app, self.logger) + context = CatchErrorsContext(self.app, + self.logger, + self.trans_id_suffix) return context.handle_request(env, start_response) diff --git a/swift/common/middleware/cname_lookup.py b/swift/common/middleware/cname_lookup.py index 87363b2308..b9ef4f616f 100644 --- a/swift/common/middleware/cname_lookup.py +++ b/swift/common/middleware/cname_lookup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,35 +29,49 @@ try: import dns.resolver - from dns.exception import DNSException - from dns.resolver import NXDOMAIN, NoAnswer + import dns.exception except ImportError: # catch this to allow docs to be built without the dependency MODULE_DEPENDENCY_MET = False else: # executed if the try block finishes with no errors MODULE_DEPENDENCY_MET = True -from swift.common.swob import Request, HTTPBadRequest -from swift.common.utils import cache_from_env, get_logger +from swift.common.middleware import RewriteContext +from swift.common.swob import Request, HTTPBadRequest, \ + str_to_wsgi, wsgi_to_str +from swift.common.utils import cache_from_env, get_logger, is_valid_ip, \ + list_from_csv, parse_socket_string +from swift.common.registry import register_swift_info -def lookup_cname(domain): # pragma: no cover +def lookup_cname(domain, resolver): # pragma: no cover """ Given a domain, returns its DNS CNAME mapping and DNS ttl. :param domain: domain to query on + :param resolver: dns.resolver.Resolver() instance used for executing DNS + queries :returns: (ttl, result) """ try: - answer = dns.resolver.query(domain, 'CNAME').rrset + answer = resolver.query(domain, 'CNAME').rrset ttl = answer.ttl - result = answer.items[0].to_text() + result = list(answer.items)[0].to_text() result = result.rstrip('.') return ttl, result - except (DNSException, NXDOMAIN, NoAnswer): + except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + # As the memcache lib returns None when nothing is found in cache, + # returning false helps to distinguish between "nothing in cache" + # (None) and "nothing to cache" (False). 
+ return 60, False + except (dns.exception.DNSException): return 0, None +class _CnameLookupContext(RewriteContext): + base_re = r'^(https?://)%s(/.*)?$' + + class CNAMELookupMiddleware(object): """ CNAME Lookup Middleware @@ -74,60 +88,94 @@ def __init__(self, app, conf): # reraise the exception if the dependency wasn't met raise ImportError('dnspython is required for this module') self.app = app - self.storage_domain = conf.get('storage_domain', 'example.com') - if self.storage_domain and self.storage_domain[0] != '.': - self.storage_domain = '.' + self.storage_domain + storage_domain = conf.get('storage_domain', 'example.com') + self.storage_domain = ['.' + s for s in + list_from_csv(storage_domain) + if not s.startswith('.')] + self.storage_domain += [s for s in list_from_csv(storage_domain) + if s.startswith('.')] self.lookup_depth = int(conf.get('lookup_depth', '1')) + nameservers = list_from_csv(conf.get('nameservers')) + try: + for i, server in enumerate(nameservers): + ip_or_host, maybe_port = nameservers[i] = \ + parse_socket_string(server, None) + if not is_valid_ip(ip_or_host): + raise ValueError + if maybe_port is not None: + int(maybe_port) + except ValueError: + raise ValueError('Invalid cname_lookup/nameservers configuration ' + 'found. All nameservers must be valid IPv4 or ' + 'IPv6, followed by an optional : port.') + self.resolver = dns.resolver.Resolver() + if nameservers: + self.resolver.nameservers = [ip for (ip, port) in nameservers] + self.resolver.nameserver_ports = { + ip: int(port) for (ip, port) in nameservers + if port is not None} self.memcache = None self.logger = get_logger(conf, log_route='cname-lookup') + def _domain_endswith_in_storage_domain(self, a_domain): + a_domain = '.' + a_domain + for domain in self.storage_domain: + if a_domain.endswith(domain): + return True + return False + def __call__(self, env, start_response): if not self.storage_domain: return self.app(env, start_response) - given_domain = env['HTTP_HOST'] + if 'HTTP_HOST' in env: + requested_host = env['HTTP_HOST'] + else: + requested_host = env['SERVER_NAME'] + given_domain = wsgi_to_str(requested_host) port = '' if ':' in given_domain: given_domain, port = given_domain.rsplit(':', 1) - if given_domain == self.storage_domain[1:]: # strip initial '.' + if is_valid_ip(given_domain): return self.app(env, start_response) a_domain = given_domain - if not a_domain.endswith(self.storage_domain): + if not self._domain_endswith_in_storage_domain(a_domain): if self.memcache is None: self.memcache = cache_from_env(env) error = True - for tries in xrange(self.lookup_depth): + for tries in range(self.lookup_depth): found_domain = None if self.memcache: memcache_key = ''.join(['cname-', a_domain]) found_domain = self.memcache.get(memcache_key) - if not found_domain: - ttl, found_domain = lookup_cname(a_domain) - if self.memcache: + if found_domain is None: + ttl, found_domain = lookup_cname(a_domain, self.resolver) + if self.memcache and ttl > 0: memcache_key = ''.join(['cname-', given_domain]) self.memcache.set(memcache_key, found_domain, - timeout=ttl) - if found_domain is None or found_domain == a_domain: + time=ttl) + if not found_domain or found_domain == a_domain: # no CNAME records or we're at the last lookup error = True found_domain = None break - elif found_domain.endswith(self.storage_domain): + elif self._domain_endswith_in_storage_domain(found_domain): # Found it! 
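+                    # the CNAME chain resolved into a configured storage
+                    # domain; rewrite the Host header and stop following it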
self.logger.info( - _('Mapped %(given_domain)s to %(found_domain)s') % + 'Mapped %(given_domain)s to %(found_domain)s', {'given_domain': given_domain, 'found_domain': found_domain}) if port: - env['HTTP_HOST'] = ':'.join([found_domain, port]) + env['HTTP_HOST'] = ':'.join([ + str_to_wsgi(found_domain), port]) else: - env['HTTP_HOST'] = found_domain + env['HTTP_HOST'] = str_to_wsgi(found_domain) error = False break else: # try one more deep in the chain self.logger.debug( - _('Following CNAME chain for ' - '%(given_domain)s to %(found_domain)s') % + 'Following CNAME chain for ' + '%(given_domain)s to %(found_domain)s', {'given_domain': given_domain, 'found_domain': found_domain}) a_domain = found_domain @@ -140,6 +188,11 @@ def __call__(self, env, start_response): resp = HTTPBadRequest(request=Request(env), body=msg, content_type='text/plain') return resp(env, start_response) + else: + context = _CnameLookupContext(self.app, requested_host, + env['HTTP_HOST']) + return context.handle_request(env, start_response) + return self.app(env, start_response) @@ -147,6 +200,9 @@ def filter_factory(global_conf, **local_conf): # pragma: no cover conf = global_conf.copy() conf.update(local_conf) + register_swift_info('cname_lookup', + lookup_depth=int(conf.get('lookup_depth', '1'))) + def cname_filter(app): return CNAMELookupMiddleware(app, conf) return cname_filter diff --git a/swift/common/middleware/container_quotas.py b/swift/common/middleware/container_quotas.py new file mode 100644 index 0000000000..2a10c8d2ce --- /dev/null +++ b/swift/common/middleware/container_quotas.py @@ -0,0 +1,122 @@ +# Copyright (c) 2010-2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The ``container_quotas`` middleware implements simple quotas that can be +imposed on swift containers by a user with the ability to set container +metadata, most likely the account administrator. This can be useful for +limiting the scope of containers that are delegated to non-admin users, exposed +to ``formpost`` uploads, or just as a self-imposed sanity check. + +Any object PUT operations that exceed these quotas return a 413 response +(request entity too large) with a descriptive body. + +Quotas are subject to several limitations: eventual consistency, the timeliness +of the cached container_info (60 second ttl by default), and it's unable to +reject chunked transfer uploads that exceed the quota (though once the quota +is exceeded, new chunked transfers will be refused). + +Quotas are set by adding meta values to the container, and are validated when +set: + ++---------------------------------------------+-------------------------------+ +|Metadata | Use | ++=============================================+===============================+ +| X-Container-Meta-Quota-Bytes | Maximum size of the | +| | container, in bytes. | ++---------------------------------------------+-------------------------------+ +| X-Container-Meta-Quota-Count | Maximum object count of the | +| | container. 
| ++---------------------------------------------+-------------------------------+ + +The ``container_quotas`` middleware should be added to the pipeline in your +``/etc/swift/proxy-server.conf`` file just after any auth middleware. +For example:: + + [pipeline:main] + pipeline = catch_errors cache tempauth container_quotas proxy-server + + [filter:container_quotas] + use = egg:swift#container_quotas +""" +from swift.common.http import is_success +from swift.common.swob import HTTPRequestEntityTooLarge, HTTPBadRequest, \ + wsgify +from swift.common.registry import register_swift_info +from swift.proxy.controllers.base import get_container_info + + +class ContainerQuotaMiddleware(object): + def __init__(self, app, *args, **kwargs): + self.app = app + + def bad_response(self, req, container_info): + # 401 if the user couldn't have PUT this object in the first place. + # This prevents leaking the container's existence to unauthed users. + if 'swift.authorize' in req.environ: + req.acl = container_info['write_acl'] + aresp = req.environ['swift.authorize'](req) + if aresp: + return aresp + return HTTPRequestEntityTooLarge(body='Upload exceeds quota.') + + @wsgify + def __call__(self, req): + try: + (version, account, container, obj) = req.split_path(3, 4, True) + except ValueError: + return self.app + + # verify new quota headers are properly formatted + if not obj and req.method in ('PUT', 'POST'): + val = req.headers.get('X-Container-Meta-Quota-Bytes') + if val and not val.isdigit(): + return HTTPBadRequest(body='Invalid bytes quota.') + val = req.headers.get('X-Container-Meta-Quota-Count') + if val and not val.isdigit(): + return HTTPBadRequest(body='Invalid count quota.') + + # check user uploads against quotas + elif obj and req.method in ('PUT'): + container_info = get_container_info( + req.environ, self.app, swift_source='CQ') + if not container_info or not is_success(container_info['status']): + # this will hopefully 404 later + return self.app + + if 'quota-bytes' in container_info.get('meta', {}) and \ + 'bytes' in container_info and \ + container_info['meta']['quota-bytes'].isdigit(): + content_length = (req.content_length or 0) + new_size = int(container_info['bytes']) + content_length + if int(container_info['meta']['quota-bytes']) < new_size: + return self.bad_response(req, container_info) + + if 'quota-count' in container_info.get('meta', {}) and \ + 'object_count' in container_info and \ + container_info['meta']['quota-count'].isdigit(): + new_count = int(container_info['object_count']) + 1 + if int(container_info['meta']['quota-count']) < new_count: + return self.bad_response(req, container_info) + + return self.app + + +def filter_factory(global_conf, **local_conf): + register_swift_info('container_quotas') + + def container_quota_filter(app): + return ContainerQuotaMiddleware(app) + return container_quota_filter diff --git a/swift/common/middleware/container_sync.py b/swift/common/middleware/container_sync.py new file mode 100644 index 0000000000..35b1a7d4b6 --- /dev/null +++ b/swift/common/middleware/container_sync.py @@ -0,0 +1,174 @@ +# Copyright (c) 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from swift.common.constraints import valid_api_version +from swift.common.container_sync_realms import ContainerSyncRealms +from swift.common.request_helpers import append_log_info +from swift.common.swob import HTTPBadRequest, HTTPUnauthorized, wsgify +from swift.common.utils import ( + config_true_value, get_logger, streq_const_time) +from swift.proxy.controllers.base import get_container_info +from swift.common.registry import register_swift_info + + +class ContainerSync(object): + """ + WSGI middleware that validates an incoming container sync request + using the container-sync-realms.conf style of container sync. + """ + + def __init__(self, app, conf, logger=None): + self.app = app + self.conf = conf + self.logger = logger or get_logger(conf, log_route='container_sync') + self.realms_conf = ContainerSyncRealms( + os.path.join( + conf.get('swift_dir', '/etc/swift'), + 'container-sync-realms.conf'), + self.logger) + self.allow_full_urls = config_true_value( + conf.get('allow_full_urls', 'true')) + # configure current realm/cluster for /info + self.realm = self.cluster = None + current = conf.get('current', None) + if current: + try: + self.realm, self.cluster = (p.upper() for p in + current.strip('/').split('/')) + except ValueError: + self.logger.error('Invalid current //REALM/CLUSTER (%s)', + current) + self.register_info() + + def register_info(self): + dct = {} + for realm in self.realms_conf.realms(): + clusters = self.realms_conf.clusters(realm) + if clusters: + dct[realm] = {'clusters': dict((c, {}) for c in clusters)} + if self.realm and self.cluster: + try: + dct[self.realm]['clusters'][self.cluster]['current'] = True + except KeyError: + self.logger.error('Unknown current //REALM/CLUSTER (%s)', + '//%s/%s' % (self.realm, self.cluster)) + register_swift_info('container_sync', realms=dct) + + @wsgify + def __call__(self, req): + if req.path == '/info': + # Ensure /info requests get the freshest results + self.register_info() + return self.app + + try: + (version, acc, cont, obj) = req.split_path(3, 4, True) + bad_path = False + except ValueError: + bad_path = True + + # use of bad_path bool is to avoid recursive tracebacks + if bad_path or not valid_api_version(version): + return self.app + + # validate container-sync metdata update + info = get_container_info( + req.environ, self.app, swift_source='CS') + sync_to = req.headers.get('x-container-sync-to') + if req.method in ('PUT', 'POST') and cont and not obj: + versions_cont = info.get( + 'sysmeta', {}).get('versions-container') + if sync_to and versions_cont: + raise HTTPBadRequest( + 'Cannot configure container sync on a container ' + 'with object versioning configured.', + request=req) + + if not self.allow_full_urls: + if sync_to and not sync_to.startswith('//'): + raise HTTPBadRequest( + body='Full URLs are not allowed for X-Container-Sync-To ' + 'values. 
Only realm values of the format ' + '//realm/cluster/account/container are allowed.\n', + request=req) + auth = req.headers.get('x-container-sync-auth') + if auth: + valid = False + auth = auth.split() + if len(auth) != 3: + append_log_info(req.environ, 'cs:not-3-args') + else: + realm, nonce, sig = auth + realm_key = self.realms_conf.key(realm) + realm_key2 = self.realms_conf.key2(realm) + if not realm_key: + append_log_info(req.environ, 'cs:no-local-realm-key') + else: + user_key = info.get('sync_key') + if not user_key: + append_log_info(req.environ, 'cs:no-local-user-key') + else: + # x-timestamp headers get shunted by gatekeeper + if 'x-backend-inbound-x-timestamp' in req.headers: + req.headers['x-timestamp'] = req.headers.pop( + 'x-backend-inbound-x-timestamp') + + expected = self.realms_conf.get_sig( + req.method, req.path, + req.headers.get('x-timestamp', '0'), nonce, + realm_key, user_key) + expected2 = self.realms_conf.get_sig( + req.method, req.path, + req.headers.get('x-timestamp', '0'), nonce, + realm_key2, user_key) if realm_key2 else expected + if not streq_const_time(sig, expected) and \ + not streq_const_time(sig, expected2): + append_log_info(req.environ, 'cs:invalid-sig') + else: + append_log_info(req.environ, 'cs:valid') + valid = True + if not valid: + exc = HTTPUnauthorized( + body='X-Container-Sync-Auth header not valid; ' + 'contact cluster operator for support.', + headers={'content-type': 'text/plain'}, + request=req) + exc.headers['www-authenticate'] = ' '.join([ + 'SwiftContainerSync', + exc.www_authenticate().split(None, 1)[1]]) + raise exc + else: + req.environ['swift.authorize_override'] = True + # An SLO manifest will already be in the internal manifest + # syntax and might be synced before its segments, so stop SLO + # middleware from performing the usual manifest validation. + req.environ['swift.slo_override'] = True + # Similar arguments for static symlinks + req.environ['swift.symlink_override'] = True + + return self.app + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + register_swift_info('container_sync') + + def cache_filter(app): + return ContainerSync(app, conf) + + return cache_filter diff --git a/swift/common/middleware/copy.py b/swift/common/middleware/copy.py new file mode 100644 index 0000000000..fed8f860fe --- /dev/null +++ b/swift/common/middleware/copy.py @@ -0,0 +1,538 @@ +# Copyright (c) 2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Server side copy is a feature that enables users/clients to COPY objects +between accounts and containers without the need to download and then +re-upload objects, thus eliminating additional bandwidth consumption and +also saving time. This may be used when renaming/moving an object which +in Swift is a (COPY + DELETE) operation. + +The server side copy middleware should be inserted in the pipeline after auth +and before the quotas and large object middlewares. 
If it is not present in the
+pipeline in the proxy-server configuration file, it will be inserted
+automatically. There is no configurable option provided to turn off server
+side copy.
+
+--------
+Metadata
+--------
+* All metadata of the source object is preserved during object copy.
+* One can also provide additional metadata during the PUT/COPY request. This
+  will overwrite any existing conflicting keys.
+* Server side copy can also be used to change the content-type of an existing
+  object.
+
+-----------
+Object Copy
+-----------
+* The destination container must exist before requesting copy of the object.
+* When several replicas exist, the system copies from the most recent replica.
+  That is, the copy operation behaves as though the X-Newest header is in the
+  request.
+* The request to copy an object should have no body (i.e. the content-length
+  of the request must be zero).
+
+There are two ways in which an object can be copied:
+
+1. Send a PUT request to the new object (destination/target) with an
+   additional header named ``X-Copy-From`` specifying the source object
+   (in '/container/object' format). Example::
+
+    curl -i -X PUT http://<storage_url>/container1/destination_obj
+     -H 'X-Auth-Token: <token>'
+     -H 'X-Copy-From: /container2/source_obj'
+     -H 'Content-Length: 0'
+
+2. Send a COPY request with the existing object in the URL and an additional
+   header named ``Destination`` specifying the destination/target object
+   (in '/container/object' format). Example::
+
+    curl -i -X COPY http://<storage_url>/container2/source_obj
+     -H 'X-Auth-Token: <token>'
+     -H 'Destination: /container1/destination_obj'
+     -H 'Content-Length: 0'
+
+Note that if the incoming request has some conditional headers (e.g. ``Range``,
+``If-Match``), the *source* object will be evaluated for these headers (i.e. if
+PUT with both ``X-Copy-From`` and ``Range``, Swift will make a partial copy to
+the destination object).
+
+-------------------------
+Cross Account Object Copy
+-------------------------
+Objects can also be copied from one account to another account if the user
+has the necessary permissions (i.e. permission to read from the container
+in the source account and permission to write to the container in the
+destination account).
+
+Similar to the examples mentioned above, there are two ways to copy objects
+across accounts:
+
+1. Like the example above, send a PUT request to copy the object, but with an
+   additional header named ``X-Copy-From-Account`` specifying the source
+   account. Example::
+
+    curl -i -X PUT http://<host>:<port>/v1/AUTH_test1/container/destination_obj
+     -H 'X-Auth-Token: <token>'
+     -H 'X-Copy-From: /container/source_obj'
+     -H 'X-Copy-From-Account: AUTH_test2'
+     -H 'Content-Length: 0'
+
+2. Like the previous example, send a COPY request, but with an additional
+   header named ``Destination-Account`` specifying the name of the destination
+   account. Example::
+
+    curl -i -X COPY http://<host>:<port>/v1/AUTH_test2/container/source_obj
+     -H 'X-Auth-Token: <token>'
+     -H 'Destination: /container/destination_obj'
+     -H 'Destination-Account: AUTH_test1'
+     -H 'Content-Length: 0'
+
+-------------------
+Large Object Copy
+-------------------
+The best option to copy a large object is to copy its segments individually.
+To copy the manifest object of a large object, add the query parameter to
+the copy request::
+
+    ?multipart-manifest=get
+
+If a request is sent without the query parameter, an attempt will be made to
+copy the whole object, but it will fail if the object size is
+greater than 5GB.
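+
+For illustration only, the PUT-based copy can also be driven from Python. The
+following is a minimal client-side sketch (it is not part of this middleware);
+the storage URL and token values are placeholders that must be replaced::
+
+    import requests
+
+    storage_url = 'http://127.0.0.1:8080/v1/AUTH_test'  # placeholder
+    token = 'AUTH_tk_placeholder'                       # placeholder
+
+    # Server-side copy: a zero-byte PUT to the destination that names the
+    # source object in the X-Copy-From header.
+    resp = requests.put(
+        storage_url + '/container1/destination_obj',
+        headers={'X-Auth-Token': token,
+                 'X-Copy-From': '/container2/source_obj',
+                 'Content-Length': '0'})
+    resp.raise_for_status()  # expect 201 Created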
+ +""" + +from swift.common.utils import get_logger, config_true_value, FileLikeIter, \ + close_if_possible +from swift.common.swob import Request, HTTPPreconditionFailed, \ + HTTPRequestEntityTooLarge, HTTPBadRequest, HTTPException, \ + wsgi_quote, wsgi_unquote +from swift.common.http import HTTP_MULTIPLE_CHOICES, is_success, HTTP_OK +from swift.common.constraints import check_account_format, MAX_FILE_SIZE +from swift.common.request_helpers import copy_header_subset, remove_items, \ + is_sys_meta, is_sys_or_user_meta, is_object_transient_sysmeta, \ + check_path_header, OBJECT_SYSMETA_CONTAINER_UPDATE_OVERRIDE_PREFIX +from swift.common.wsgi import WSGIContext, make_subrequest +import eventlet +from swift.common.request_helpers import get_heartbeat_response_body + + +def _check_copy_from_header(req): + """ + Validate that the value from x-copy-from header is + well formatted. We assume the caller ensures that + x-copy-from header is present in req.headers. + + :param req: HTTP request object + :returns: A tuple with container name and object name + :raise HTTPPreconditionFailed: if x-copy-from value + is not well formatted. + """ + return check_path_header(req, 'X-Copy-From', 2, + 'X-Copy-From header must be of the form ' + '/') + + +def _check_destination_header(req): + """ + Validate that the value from destination header is + well formatted. We assume the caller ensures that + destination header is present in req.headers. + + :param req: HTTP request object + :returns: A tuple with container name and object name + :raise HTTPPreconditionFailed: if destination value + is not well formatted. + """ + return check_path_header(req, 'Destination', 2, + 'Destination header must be of the form ' + '/') + + +def _copy_headers(src, dest): + """ + Will copy desired headers from src to dest. 
+ + :params src: an instance of collections.Mapping + :params dest: an instance of collections.Mapping + """ + for k, v in src.items(): + if (is_sys_or_user_meta('object', k) or + is_object_transient_sysmeta(k) or + k.lower() == 'x-delete-at'): + dest[k] = v + + +class ServerSideCopyWebContext(WSGIContext): + + def __init__(self, app, logger, yield_frequency=10): + super(ServerSideCopyWebContext, self).__init__(app) + self.app = app + self.logger = logger + self.yield_frequency = yield_frequency + + def get_source_resp(self, req): + sub_req = make_subrequest( + req.environ, path=wsgi_quote(req.path_info), headers=req.headers, + swift_source='SSC') + return sub_req.get_response(self.app) + + def send_put_req(self, req, additional_resp_headers, start_response): + heartbeat = config_true_value(req.params.get('heartbeat')) + ACCEPTABLE_FORMATS = ['text/plain', 'application/json'] + + try: + out_content_type = req.accept.best_match(ACCEPTABLE_FORMATS) + except ValueError: + out_content_type = 'text/plain' + if not out_content_type: + out_content_type = 'text/plain' + + if heartbeat: + gt = eventlet.spawn(self._app_call, + req.environ) + start_response('202 Accepted', + [('Content-Type', out_content_type)]) + + def resp_iter(): + # Send an initial heartbeat + yield b' ' + app_iter = [b''] + try: + while not gt.dead: + try: + with eventlet.Timeout(self.yield_frequency): + app_iter = gt.wait() + except eventlet.Timeout: + yield b' ' + except Exception as e: + # Send back the status to the client if error + self._response_status = '500 Internal Error' + app_iter = [str(e).encode('utf8')] + finally: + response_body = b''.join(app_iter).decode('utf8') + resp_dict = {'Response Status': self._response_status, + 'Response Body': response_body} + errors = [] + + if not is_success(self._get_status_int()): + src_path = additional_resp_headers['X-Copied-From'] + errors.append(( + wsgi_quote(src_path), + self._response_status, + )) + + for k, v in additional_resp_headers.items(): + if not k.lower().startswith(('x-object-sysmeta-', + 'x-backend')): + resp_dict[k] = v + + for k, v in self._response_headers: + if not k.lower().startswith(('x-object-sysmeta-', + 'x-backend')): + resp_dict[k] = v + yield get_heartbeat_response_body(out_content_type, + resp_dict, + errors, 'copy') + close_if_possible(gt) + + return resp_iter() + + else: + app_resp = self._app_call(req.environ) + self._adjust_put_response(req, additional_resp_headers) + start_response(self._response_status, + self._response_headers, + self._response_exc_info) + return app_resp + + def _adjust_put_response(self, req, additional_resp_headers): + if is_success(self._get_status_int()): + for header, value in additional_resp_headers.items(): + self._response_headers.append((header, value)) + + def handle_OPTIONS_request(self, req, start_response): + app_resp = self._app_call(req.environ) + if is_success(self._get_status_int()): + for i, (header, value) in enumerate(self._response_headers): + if header.lower() == 'allow' and 'COPY' not in value: + self._response_headers[i] = ('Allow', value + ', COPY') + if header.lower() == 'access-control-allow-methods' and \ + 'COPY' not in value: + self._response_headers[i] = \ + ('Access-Control-Allow-Methods', value + ', COPY') + start_response(self._response_status, + self._response_headers, + self._response_exc_info) + return app_resp + + +class ServerSideCopyMiddleware(object): + + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route="copy") + self.yield_frequency = 
int(conf.get('yield_frequency', 10)) + + def __call__(self, env, start_response): + req = Request(env) + try: + (version, account, container, obj) = req.split_path(4, 4, True) + is_obj_req = True + except ValueError: + is_obj_req = False + if not is_obj_req: + # If obj component is not present in req, do not proceed further. + return self.app(env, start_response) + + try: + # In some cases, save off original request method since it gets + # mutated into PUT during handling. This way logging can display + # the method the client actually sent. + if req.method == 'PUT' and req.headers.get('X-Copy-From'): + return self.handle_PUT(req, start_response) + elif req.method == 'COPY': + req.environ['swift.orig_req_method'] = req.method + return self.handle_COPY(req, start_response, + account, container, obj) + elif req.method == 'OPTIONS': + # Does not interfere with OPTIONS response from + # (account,container) servers and /info response. + return self.handle_OPTIONS(req, start_response) + + except HTTPException as e: + return e(req.environ, start_response) + + return self.app(env, start_response) + + def handle_COPY(self, req, start_response, account, container, obj): + if not req.headers.get('Destination'): + return HTTPPreconditionFailed(request=req, + body='Destination header required' + )(req.environ, start_response) + dest_account = account + if 'Destination-Account' in req.headers: + dest_account = wsgi_unquote(req.headers.get('Destination-Account')) + dest_account = check_account_format(req, dest_account) + req.headers['X-Copy-From-Account'] = wsgi_quote(account) + account = dest_account + del req.headers['Destination-Account'] + dest_container, dest_object = _check_destination_header(req) + source = '/%s/%s' % (container, obj) + container = dest_container + obj = dest_object + # re-write the existing request as a PUT instead of creating a new one + req.method = 'PUT' + # As this the path info is updated with destination container, + # the proxy server app will use the right object controller + # implementation corresponding to the container's policy type. + ver, _junk = req.split_path(1, 2, rest_with_last=True) + req.path_info = '/%s/%s/%s/%s' % ( + ver, dest_account, dest_container, dest_object) + req.headers['Content-Length'] = 0 + req.headers['X-Copy-From'] = wsgi_quote(source) + del req.headers['Destination'] + return self.handle_PUT(req, start_response) + + def _get_source_object(self, ssc_ctx, source_path, req): + source_req = req.copy_get() + + # make sure the source request uses it's container_info + source_req.headers.pop('X-Backend-Storage-Policy-Index', None) + source_req.path_info = source_path + source_req.headers['X-Newest'] = 'true' + + # in case we are copying an SLO manifest, set format=raw parameter + params = source_req.params + if params.get('multipart-manifest') == 'get': + params['format'] = 'raw' + source_req.params = params + + source_resp = ssc_ctx.get_source_resp(source_req) + + if source_resp.content_length is None: + # This indicates a transfer-encoding: chunked source object, + # which currently only happens because there are more than + # CONTAINER_LISTING_LIMIT segments in a segmented object. In + # this case, we're going to refuse to do the server-side copy. 
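+            # (the client will receive 413 Request Entity Too Large)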
+ close_if_possible(source_resp.app_iter) + return HTTPRequestEntityTooLarge(request=req) + + if source_resp.content_length > MAX_FILE_SIZE: + close_if_possible(source_resp.app_iter) + return HTTPRequestEntityTooLarge(request=req) + + return source_resp + + def _create_response_headers(self, source_path, source_resp, sink_req): + resp_headers = dict() + acct, path = source_path.split('/', 3)[2:4] + resp_headers['X-Copied-From-Account'] = wsgi_quote(acct) + resp_headers['X-Copied-From'] = wsgi_quote(path) + if 'last-modified' in source_resp.headers: + resp_headers['X-Copied-From-Last-Modified'] = \ + source_resp.headers['last-modified'] + if 'X-Object-Version-Id' in source_resp.headers: + resp_headers['X-Copied-From-Version-Id'] = \ + source_resp.headers['X-Object-Version-Id'] + # Existing sys and user meta of source object is added to response + # headers in addition to the new ones. + _copy_headers(sink_req.headers, resp_headers) + return resp_headers + + def handle_PUT(self, req, start_response): + if req.content_length: + return HTTPBadRequest(body='Copy requests require a zero byte ' + 'body', request=req, + content_type='text/plain')(req.environ, + start_response) + # If heartbeat is enabled, set minimum_write_chunk_size directly + # in the original client request before making subrequests + if config_true_value(req.params.get('heartbeat')): + wsgi_input = req.environ.get('wsgi.input') + if hasattr(wsgi_input, 'environ'): + wsgi_input.environ['eventlet.minimum_write_chunk_size'] = 0 + # Not sure if we also need to set it in + # the current request's environ + req.environ['eventlet.minimum_write_chunk_size'] = 0 + + # Form the path of source object to be fetched + ver, acct, _rest = req.split_path(2, 3, True) + src_account_name = req.headers.get('X-Copy-From-Account') + if src_account_name: + src_account_name = check_account_format( + req, wsgi_unquote(src_account_name)) + else: + src_account_name = acct + src_container_name, src_obj_name = _check_copy_from_header(req) + source_path = '/%s/%s/%s/%s' % (ver, src_account_name, + src_container_name, src_obj_name) + + # GET the source object, bail out on error + ssc_ctx = ServerSideCopyWebContext(self.app, self.logger, + self.yield_frequency) + source_resp = self._get_source_object(ssc_ctx, source_path, req) + if source_resp.status_int >= HTTP_MULTIPLE_CHOICES: + return source_resp(source_resp.environ, start_response) + + # Create a new Request object based on the original request instance. + # This will preserve original request environ including headers. + sink_req = Request.blank(req.path_info, environ=req.environ) + + def is_object_sysmeta(k): + return is_sys_meta('object', k) + + if config_true_value(req.headers.get('x-fresh-metadata', 'false')): + # x-fresh-metadata only applies to copy, not post-as-copy: ignore + # existing user metadata, update existing sysmeta with new + copy_header_subset(source_resp, sink_req, is_object_sysmeta) + copy_header_subset(req, sink_req, is_object_sysmeta) + else: + # First copy existing sysmeta, user meta and other headers from the + # source to the sink, apart from headers that are conditionally + # copied below and timestamps. 
+ exclude_headers = ('x-static-large-object', 'x-object-manifest', + 'etag', 'content-type', 'x-timestamp', + 'x-backend-timestamp') + copy_header_subset(source_resp, sink_req, + lambda k: k.lower() not in exclude_headers) + # now update with original req headers + sink_req.headers.update(req.headers) + + params = sink_req.params + params_updated = False + + if params.get('multipart-manifest') == 'get': + if 'X-Static-Large-Object' in source_resp.headers: + params['multipart-manifest'] = 'put' + if 'X-Object-Manifest' in source_resp.headers: + del params['multipart-manifest'] + sink_req.headers['X-Object-Manifest'] = \ + source_resp.headers['X-Object-Manifest'] + params_updated = True + + if 'version-id' in params: + del params['version-id'] + params_updated = True + + if params_updated: + sink_req.params = params + + # Set swift.source, data source, content length and etag + # for the PUT request + sink_req.environ['swift.source'] = 'SSC' + sink_req.environ['wsgi.input'] = FileLikeIter(source_resp.app_iter) + sink_req.content_length = source_resp.content_length + if (source_resp.status_int == HTTP_OK and + 'X-Static-Large-Object' not in source_resp.headers and + ('X-Object-Manifest' not in source_resp.headers or + req.params.get('multipart-manifest') == 'get')): + # copy source etag so that copied content is verified, unless: + # - not a 200 OK response: source etag may not match the actual + # content, for example with a 206 Partial Content response to a + # ranged request + # - SLO manifest: etag cannot be specified in manifest PUT; SLO + # generates its own etag value which may differ from source + # - SLO: etag in SLO response is not hash of actual content + # - DLO: etag in DLO response is not hash of actual content + sink_req.headers['Etag'] = source_resp.etag + else: + # since we're not copying the source etag, make sure that any + # container update override values are not copied. + remove_items(sink_req.headers, lambda k: k.startswith( + OBJECT_SYSMETA_CONTAINER_UPDATE_OVERRIDE_PREFIX.title())) + + # We no longer need these headers + sink_req.headers.pop('X-Copy-From', None) + sink_req.headers.pop('X-Copy-From-Account', None) + + # If the copy request does not explicitly override content-type, + # use the one present in the source object. 
+ if not req.headers.get('content-type'): + sink_req.headers['Content-Type'] = \ + source_resp.headers['Content-Type'] + + # Create response headers for PUT response + resp_headers = self._create_response_headers(source_path, + source_resp, sink_req) + + put_resp = ssc_ctx.send_put_req(sink_req, resp_headers, start_response) + + # For heartbeat=on, we need to cleanup the resp iter + if config_true_value(req.params.get('heartbeat')): + def clean_iter(app_iter): + try: + for chunk in app_iter: + yield chunk + finally: + close_if_possible(source_resp.app_iter) + return clean_iter(put_resp) + + close_if_possible(source_resp.app_iter) + return put_resp + + def handle_OPTIONS(self, req, start_response): + return ServerSideCopyWebContext(self.app, self.logger).\ + handle_OPTIONS_request(req, start_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def copy_filter(app): + return ServerSideCopyMiddleware(app, conf) + + return copy_filter diff --git a/swift/common/middleware/crossdomain.py b/swift/common/middleware/crossdomain.py new file mode 100644 index 0000000000..c15e524548 --- /dev/null +++ b/swift/common/middleware/crossdomain.py @@ -0,0 +1,105 @@ +# Copyright (c) 2013 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.swob import Request, Response +from swift.common.registry import register_swift_info + + +class CrossDomainMiddleware(object): + + """ + Cross domain middleware used to respond to requests for cross domain + policy information. + + If the path is ``/crossdomain.xml`` it will respond with an xml cross + domain policy document. This allows web pages hosted elsewhere to use + client side technologies such as Flash, Java and Silverlight to interact + with the Swift API. + + To enable this middleware, add it to the pipeline in your proxy-server.conf + file. It should be added before any authentication (e.g., tempauth or + keystone) middleware. In this example ellipsis (...) indicate other + middleware you may have chosen to use: + + .. code:: cfg + + [pipeline:main] + pipeline = ... crossdomain ... authtoken ... proxy-server + + And add a filter section, such as: + + .. code:: cfg + + [filter:crossdomain] + use = egg:swift#crossdomain + cross_domain_policy = + + + For continuation lines, put some whitespace before the continuation + text. Ensure you put a completely blank line to terminate the + ``cross_domain_policy`` value. + + The ``cross_domain_policy`` name/value is optional. If omitted, the policy + defaults as if you had specified: + + .. code:: cfg + + cross_domain_policy = + + .. note:: + + The default policy is very permissive; this is appropriate + for most public cloud deployments, but may not be appropriate + for all deployments. 
See also: + `CWE-942 `__ + + + """ + + def __init__(self, app, conf, *args, **kwargs): + self.app = app + self.conf = conf + default_domain_policy = '' + self.cross_domain_policy = self.conf.get('cross_domain_policy', + default_domain_policy) + + def GET(self, req): + """Returns a 200 response with cross domain policy information """ + body = '\n' \ + '\n' \ + '\n' \ + '%s\n' \ + '' % self.cross_domain_policy + return Response(request=req, body=body.encode('utf-8'), + content_type="application/xml") + + def __call__(self, env, start_response): + req = Request(env) + if req.path == '/crossdomain.xml' and req.method == 'GET': + return self.GET(req)(env, start_response) + else: + return self.app(env, start_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + register_swift_info('crossdomain') + + def crossdomain_filter(app): + return CrossDomainMiddleware(app, conf) + return crossdomain_filter diff --git a/swift/common/middleware/crypto/__init__.py b/swift/common/middleware/crypto/__init__.py new file mode 100644 index 0000000000..a70e7b8609 --- /dev/null +++ b/swift/common/middleware/crypto/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Implements middleware for object encryption which comprises an instance of a +:class:`~swift.common.middleware.crypto.decrypter.Decrypter` combined with an +instance of an :class:`~swift.common.middleware.crypto.encrypter.Encrypter`. +""" +from swift.common.middleware.crypto.decrypter import Decrypter +from swift.common.middleware.crypto.encrypter import Encrypter + +from swift.common.utils import config_true_value +from swift.common.registry import register_swift_info + + +def filter_factory(global_conf, **local_conf): + """Provides a factory function for loading encryption middleware.""" + conf = global_conf.copy() + conf.update(local_conf) + enabled = not config_true_value(conf.get('disable_encryption', 'false')) + register_swift_info('encryption', admin=True, enabled=enabled) + + def encryption_filter(app): + return Decrypter(Encrypter(app, conf), conf) + return encryption_filter diff --git a/swift/common/middleware/crypto/crypto_utils.py b/swift/common/middleware/crypto/crypto_utils.py new file mode 100644 index 0000000000..980bebb912 --- /dev/null +++ b/swift/common/middleware/crypto/crypto_utils.py @@ -0,0 +1,292 @@ +# Copyright (c) 2015-2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
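# Illustrative usage sketch for the ``encryption`` filter factory shown
# above (``proxy_app`` here is assumed to be some pre-existing WSGI
# application, not a name defined by Swift):
#
#     from swift.common.middleware.crypto import filter_factory
#
#     encryption = filter_factory({})
#     wrapped = encryption(proxy_app)
#     # ``wrapped`` is a Decrypter whose ``.app`` is an Encrypter wrapping
#     # ``proxy_app``, so requests pass Decrypter -> Encrypter -> proxy_app
#     # and responses are decrypted on their way back out.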
+import base64 +import binascii +import json +import os + +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +import urllib.parse + +from swift.common.exceptions import EncryptionException, UnknownSecretIdError +from swift.common.swob import HTTPInternalServerError +from swift.common.utils import get_logger, parse_header +from swift.common.wsgi import WSGIContext + +CRYPTO_KEY_CALLBACK = 'swift.callback.fetch_crypto_keys' + + +class Crypto(object): + """ + Used by middleware: Calls cryptography library + """ + cipher = 'AES_CTR_256' + # AES will accept several key sizes - we are using 256 bits i.e. 32 bytes + key_length = 32 + iv_length = algorithms.AES.block_size // 8 + + def __init__(self, conf=None): + self.logger = get_logger(conf, log_route="crypto") + # memoize backend to avoid repeated iteration over entry points + self.backend = default_backend() + + def create_encryption_ctxt(self, key, iv): + """ + Creates a crypto context for encrypting + + :param key: 256-bit key + :param iv: 128-bit iv or nonce used for encryption + :raises ValueError: on invalid key or iv + :returns: an instance of an encryptor + """ + self.check_key(key) + engine = Cipher(algorithms.AES(key), modes.CTR(iv), + backend=self.backend) + return engine.encryptor() + + def create_decryption_ctxt(self, key, iv, offset): + """ + Creates a crypto context for decrypting + + :param key: 256-bit key + :param iv: 128-bit iv or nonce used for decryption + :param offset: offset into the message; used for range reads + :returns: an instance of a decryptor + """ + self.check_key(key) + if offset < 0: + raise ValueError('Offset must not be negative') + if offset: + # Adjust IV so that it is correct for decryption at offset. + # The CTR mode offset is incremented for every AES block and taken + # modulo 2^128. + offset_blocks, offset_in_block = divmod(offset, self.iv_length) + ivl = int(binascii.hexlify(iv), 16) + offset_blocks + ivl %= 1 << algorithms.AES.block_size + iv = bytes(bytearray.fromhex(format( + ivl, '0%dx' % (2 * self.iv_length)))) + else: + offset_in_block = 0 + + engine = Cipher(algorithms.AES(key), modes.CTR(iv), + backend=self.backend) + dec = engine.decryptor() + # Adjust decryption boundary within current AES block + dec.update(b'*' * offset_in_block) + return dec + + def create_iv(self): + return os.urandom(self.iv_length) + + def create_crypto_meta(self): + # create a set of parameters + return {'iv': self.create_iv(), 'cipher': self.cipher} + + def check_crypto_meta(self, meta): + """ + Check that crypto meta dict has valid items. 
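        For example, a minimal well-formed crypto meta dict (with an
        illustrative, freshly generated IV) looks like::

            {'cipher': 'AES_CTR_256', 'iv': <16 bytes from create_iv()>}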
+ + :param meta: a dict + :raises EncryptionException: if an error is found in the crypto meta + """ + try: + if meta['cipher'] != self.cipher: + raise EncryptionException('Bad crypto meta: Cipher must be %s' + % self.cipher) + if len(meta['iv']) != self.iv_length: + raise EncryptionException( + 'Bad crypto meta: IV must be length %s bytes' + % self.iv_length) + except KeyError as err: + raise EncryptionException( + 'Bad crypto meta: Missing %s' % err) + + def create_random_key(self): + # helper method to create random key of correct length + return os.urandom(self.key_length) + + def wrap_key(self, wrapping_key, key_to_wrap): + # we don't use an RFC 3394 key wrap algorithm such as cryptography's + # aes_wrap_key because it's slower and we have iv material readily + # available so don't need a deterministic algorithm + iv = self.create_iv() + encryptor = Cipher(algorithms.AES(wrapping_key), modes.CTR(iv), + backend=self.backend).encryptor() + return {'key': encryptor.update(key_to_wrap), 'iv': iv} + + def unwrap_key(self, wrapping_key, context): + # unwrap a key from dict of form returned by wrap_key + # check the key length early - unwrapping won't change the length + self.check_key(context['key']) + decryptor = Cipher(algorithms.AES(wrapping_key), + modes.CTR(context['iv']), + backend=self.backend).decryptor() + return decryptor.update(context['key']) + + def check_key(self, key): + if len(key) != self.key_length: + raise ValueError("Key must be length %s bytes" % self.key_length) + + +class CryptoWSGIContext(WSGIContext): + """ + Base class for contexts used by crypto middlewares. + """ + def __init__(self, crypto_app, server_type, logger): + super(CryptoWSGIContext, self).__init__(crypto_app.app) + self.crypto = crypto_app.crypto + self.logger = logger + self.server_type = server_type + + def get_keys(self, env, required=None, key_id=None): + # Get the key(s) from the keymaster + required = required if required is not None else [self.server_type] + try: + fetch_crypto_keys = env[CRYPTO_KEY_CALLBACK] + except KeyError: + self.logger.exception('ERROR get_keys() missing callback') + raise HTTPInternalServerError( + "Unable to retrieve encryption keys.") + + err = None + try: + keys = fetch_crypto_keys(key_id=key_id) + except UnknownSecretIdError as err: + self.logger.error('get_keys(): unknown key id: %s', err) + raise + except Exception as err: # noqa + self.logger.exception('get_keys(): from callback: %s', err) + raise HTTPInternalServerError( + "Unable to retrieve encryption keys.") + + for name in required: + try: + key = keys[name] + self.crypto.check_key(key) + continue + except KeyError: + self.logger.exception("Missing key for %r", name) + except TypeError: + self.logger.exception("Did not get a keys dict") + except ValueError as e: + # don't include the key in any messages! + self.logger.exception("Bad key for %(name)r: %(err)s", + {'name': name, 'err': e}) + raise HTTPInternalServerError( + "Unable to retrieve encryption keys.") + + return keys + + def get_multiple_keys(self, env): + # get a list of keys from the keymaster containing one dict of keys for + # each of the keymaster root secret ids + keys = [self.get_keys(env)] + active_key_id = keys[0]['id'] + for other_key_id in keys[0].get('all_ids', []): + if other_key_id == active_key_id: + continue + keys.append(self.get_keys(env, key_id=other_key_id)) + return keys + + +def dump_crypto_meta(crypto_meta): + """ + Serialize crypto meta to a form suitable for including in a header value. 
+ + The crypto-meta is serialized as a json object. The iv and key values are + random bytes and as a result need to be base64 encoded before sending over + the wire. Base64 encoding returns a bytes object in py3, to future proof + the code, decode this data to produce a string, which is what the + json.dumps function expects. + + :param crypto_meta: a dict containing crypto meta items + :returns: a string serialization of a crypto meta dict + """ + def b64_encode_meta(crypto_meta): + return { + name: (base64.b64encode(value).decode() if name in ('iv', 'key') + else b64_encode_meta(value) if isinstance(value, dict) + else value) + for name, value in crypto_meta.items()} + + # use sort_keys=True to make serialized form predictable for testing + return urllib.parse.quote_plus( + json.dumps(b64_encode_meta(crypto_meta), sort_keys=True)) + + +def load_crypto_meta(value, b64decode=True): + """ + Build the crypto_meta from the json object. + + Note that json.loads always produces unicode strings; to ensure the + resultant crypto_meta matches the original object: + * cast all keys to str (effectively a no-op on py3), + * base64 decode 'key' and 'iv' values to bytes, and + * encode remaining string values as UTF-8 on py2 (while leaving them + as native unicode strings on py3). + + :param value: a string serialization of a crypto meta dict + :param b64decode: decode the 'key' and 'iv' values to bytes, default True + :returns: a dict containing crypto meta items + :raises EncryptionException: if an error occurs while parsing the + crypto meta + """ + def b64_decode_meta(crypto_meta): + return { + str(name): ( + base64.b64decode(val) if name in ('iv', 'key') and b64decode + else b64_decode_meta(val) if isinstance(val, dict) + else val) + for name, val in crypto_meta.items()} + + try: + if not isinstance(value, str): + raise ValueError('crypto meta not a string') + val = json.loads(urllib.parse.unquote_plus(value)) + if not isinstance(val, dict): + raise ValueError('crypto meta not a Mapping') + return b64_decode_meta(val) + except (KeyError, ValueError, TypeError) as err: + msg = 'Bad crypto meta %r: %s' % (value, err) + raise EncryptionException(msg) + + +def append_crypto_meta(value, crypto_meta): + """ + Serialize and append crypto metadata to an encrypted value. + + :param value: value to which serialized crypto meta will be appended. + :param crypto_meta: a dict of crypto meta + :return: a string of the form ; swift_meta= + """ + if not isinstance(value, str): + raise ValueError + return '%s; swift_meta=%s' % (value, dump_crypto_meta(crypto_meta)) + + +def extract_crypto_meta(value): + """ + Extract and deserialize any crypto meta from the end of a value. + + :param value: string that may have crypto meta at end + :return: a tuple of the form: + (, or None) + """ + swift_meta = None + value, meta = parse_header(value) + if 'swift_meta' in meta: + swift_meta = load_crypto_meta(meta['swift_meta']) + return value, swift_meta diff --git a/swift/common/middleware/crypto/decrypter.py b/swift/common/middleware/crypto/decrypter.py new file mode 100644 index 0000000000..7caa34aee6 --- /dev/null +++ b/swift/common/middleware/crypto/decrypter.py @@ -0,0 +1,481 @@ +# Copyright (c) 2015-2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import json + +from swift.common.constraints import valid_api_version, check_utf8 +from swift.common.header_key_dict import HeaderKeyDict +from swift.common.http import is_success +from swift.common.middleware.crypto.crypto_utils import CryptoWSGIContext, \ + load_crypto_meta, extract_crypto_meta, Crypto +from swift.common.exceptions import EncryptionException, UnknownSecretIdError +from swift.common.request_helpers import get_object_transient_sysmeta, \ + get_sys_meta_prefix, get_user_meta_prefix, \ + get_container_update_override_key +from swift.common.swob import Request, HTTPException, \ + HTTPInternalServerError, wsgi_to_bytes, bytes_to_wsgi, wsgi_to_str +from swift.common.utils import get_logger, config_true_value, \ + parse_content_range, closing_if_possible, parse_content_type, \ + FileLikeIter, multipart_byteranges_to_document_iters + +DECRYPT_CHUNK_SIZE = 65536 + + +def purge_crypto_sysmeta_headers(headers): + return [h for h in headers if not + h[0].lower().startswith( + (get_object_transient_sysmeta('crypto-'), + get_sys_meta_prefix('object') + 'crypto-'))] + + +class BaseDecrypterContext(CryptoWSGIContext): + def get_crypto_meta(self, header_name, check=True): + """ + Extract a crypto_meta dict from a header. + + :param header_name: name of header that may have crypto_meta + :param check: if True validate the crypto meta + :return: A dict containing crypto_meta items + :raises EncryptionException: if an error occurs while parsing the + crypto meta + """ + crypto_meta_json = self._response_header_value(header_name) + + if crypto_meta_json is None: + return None + crypto_meta = load_crypto_meta(crypto_meta_json) + if check: + self.crypto.check_crypto_meta(crypto_meta) + return crypto_meta + + def get_unwrapped_key(self, crypto_meta, wrapping_key): + """ + Get a wrapped key from crypto-meta and unwrap it using the provided + wrapping key. + + :param crypto_meta: a dict of crypto-meta + :param wrapping_key: key to be used to decrypt the wrapped key + :return: an unwrapped key + :raises HTTPInternalServerError: if the crypto-meta has no wrapped key + or the unwrapped key is invalid + """ + try: + return self.crypto.unwrap_key(wrapping_key, + crypto_meta['body_key']) + except KeyError as err: + self.logger.error( + 'Error decrypting %(resp_type)s: Missing %(key)s', + {'resp_type': self.server_type, 'key': err}) + except ValueError as err: + self.logger.error('Error decrypting %(resp_type)s: %(reason)s', + {'resp_type': self.server_type, 'reason': err}) + raise HTTPInternalServerError( + body='Error decrypting %s' % self.server_type, + content_type='text/plain') + + def decrypt_value_with_meta(self, value, key, required, decoder): + """ + Base64-decode and decrypt a value if crypto meta can be extracted from + the value itself, otherwise return the value unmodified. 
+ + A value should either be a string that does not contain the ';' + character or should be of the form:: + + ;swift_meta= + + :param value: value to decrypt + :param key: crypto key to use + :param required: if True then the value is required to be decrypted + and an EncryptionException will be raised if the + header cannot be decrypted due to missing crypto meta. + :param decoder: function to turn the decrypted bytes into useful data + :returns: decrypted value if crypto meta is found, otherwise the + unmodified value + :raises EncryptionException: if an error occurs while parsing crypto + meta or if the header value was required + to be decrypted but crypto meta was not + found. + """ + extracted_value, crypto_meta = extract_crypto_meta(value) + if crypto_meta: + self.crypto.check_crypto_meta(crypto_meta) + value = self.decrypt_value( + extracted_value, key, crypto_meta, decoder) + elif required: + raise EncryptionException( + "Missing crypto meta in value %s" % value) + + return value + + def decrypt_value(self, value, key, crypto_meta, decoder): + """ + Base64-decode and decrypt a value using the crypto_meta provided. + + :param value: a base64-encoded value to decrypt + :param key: crypto key to use + :param crypto_meta: a crypto-meta dict of form returned by + :py:func:`~swift.common.middleware.crypto.Crypto.get_crypto_meta` + :param decoder: function to turn the decrypted bytes into useful data + :returns: decrypted value + """ + if not value: + return decoder(b'') + crypto_ctxt = self.crypto.create_decryption_ctxt( + key, crypto_meta['iv'], 0) + return decoder(crypto_ctxt.update(base64.b64decode(value))) + + def get_decryption_keys(self, req, crypto_meta=None): + """ + Determine if a response should be decrypted, and if so then fetch keys. + + :param req: a Request object + :param crypto_meta: a dict of crypto metadata + :returns: a dict of decryption keys + """ + if config_true_value(req.environ.get('swift.crypto.override')): + self.logger.debug('No decryption is necessary because of override') + return None + + key_id = crypto_meta.get('key_id') if crypto_meta else None + return self.get_keys(req.environ, key_id=key_id) + + +class DecrypterObjContext(BaseDecrypterContext): + def __init__(self, decrypter, logger): + super(DecrypterObjContext, self).__init__(decrypter, 'object', logger) + + def _decrypt_header(self, header, value, key, required=False): + """ + Attempt to decrypt a header value that may be encrypted. + + :param header: the header name + :param value: the header value + :param key: decryption key + :param required: if True then the header is required to be decrypted + and an HTTPInternalServerError will be raised if the + header cannot be decrypted due to missing crypto meta. + :return: decrypted value or the original value if it was not encrypted. + :raises HTTPInternalServerError: if an error occurred during decryption + or if the header value was required to + be decrypted but crypto meta was not + found. 
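        For example (illustrative), an encrypted header value has the form
        ``<base64 ciphertext>; swift_meta=<serialized crypto meta>``, as
        produced by ``append_crypto_meta``; a value with no appended crypto
        meta is returned unmodified, or rejected if ``required`` is True.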
+ """ + try: + return self.decrypt_value_with_meta( + value, key, required, bytes_to_wsgi) + except EncryptionException as err: + self.logger.error( + "Error decrypting header %(header)s: %(error)s", + {'header': header, 'error': err}) + raise HTTPInternalServerError( + body='Error decrypting header', + content_type='text/plain') + + def decrypt_user_metadata(self, keys): + prefix = get_object_transient_sysmeta('crypto-meta-') + prefix_len = len(prefix) + new_prefix = get_user_meta_prefix(self.server_type).title() + result = [] + for name, val in self._response_headers: + if name.lower().startswith(prefix) and val: + short_name = name[prefix_len:] + decrypted_value = self._decrypt_header( + name, val, keys[self.server_type], required=True) + result.append((new_prefix + short_name, decrypted_value)) + return result + + def decrypt_resp_headers(self, put_keys, post_keys, update_cors_exposed): + """ + Find encrypted headers and replace with the decrypted versions. + + :param put_keys: a dict of decryption keys used for object PUT. + :param post_keys: a dict of decryption keys used for object POST. + :return: A list of headers with any encrypted headers replaced by their + decrypted values. + :raises HTTPInternalServerError: if any error occurs while decrypting + headers + """ + mod_hdr_pairs = [] + + if put_keys: + # Decrypt plaintext etag and place in Etag header for client + # response + etag_header = 'X-Object-Sysmeta-Crypto-Etag' + encrypted_etag = self._response_header_value(etag_header) + if encrypted_etag: + decrypted_etag = self._decrypt_header( + etag_header, encrypted_etag, put_keys['object'], + required=True) + mod_hdr_pairs.append(('Etag', decrypted_etag)) + + etag_header = get_container_update_override_key('etag') + encrypted_etag = self._response_header_value(etag_header) + if encrypted_etag: + decrypted_etag = self._decrypt_header( + etag_header, encrypted_etag, put_keys['container']) + mod_hdr_pairs.append((etag_header, decrypted_etag)) + + # Decrypt all user metadata. Encrypted user metadata values are stored + # in the x-object-transient-sysmeta-crypto-meta- namespace. Those are + # decrypted and moved back to the x-object-meta- namespace. Prior to + # decryption, the response should have no x-object-meta- headers, but + # if it does then they will be overwritten by any decrypted headers + # that map to the same x-object-meta- header names i.e. decrypted + # headers win over unexpected, unencrypted headers. + if post_keys: + decrypted_meta = self.decrypt_user_metadata(post_keys) + mod_hdr_pairs.extend(decrypted_meta) + else: + decrypted_meta = [] + + mod_hdr_names = {h.lower() for h, v in mod_hdr_pairs} + + found_aceh = False + for header, value in self._response_headers: + lheader = header.lower() + if lheader in mod_hdr_names: + continue + if lheader == 'access-control-expose-headers': + found_aceh = True + mod_hdr_pairs.append((header, value + ', ' + ', '.join( + meta.lower() for meta, _data in decrypted_meta))) + else: + mod_hdr_pairs.append((header, value)) + if update_cors_exposed and not found_aceh: + mod_hdr_pairs.append(('Access-Control-Expose-Headers', ', '.join( + meta.lower() for meta, _data in decrypted_meta))) + return mod_hdr_pairs + + def multipart_response_iter(self, resp, boundary, body_key, crypto_meta): + """ + Decrypts a multipart mime doc response body. 
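        Each part of the source document is re-emitted with its original part
        headers and a body decrypted from the appropriate offset, roughly
        (illustrative sketch)::

            --<boundary>
            Content-Type: application/octet-stream
            Content-Range: bytes 10-19/100

            <decrypted bytes for that range>
            --<boundary>--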
+ + :param resp: application response + :param boundary: multipart boundary string + :param body_key: decryption key for the response body + :param crypto_meta: crypto_meta for the response body + :return: generator for decrypted response body + """ + with closing_if_possible(resp): + parts_iter = multipart_byteranges_to_document_iters( + FileLikeIter(resp), boundary) + for first_byte, last_byte, length, headers, body in parts_iter: + yield b"--" + boundary + b"\r\n" + + for header, value in headers: + yield b"%s: %s\r\n" % (wsgi_to_bytes(header), + wsgi_to_bytes(value)) + + yield b"\r\n" + + decrypt_ctxt = self.crypto.create_decryption_ctxt( + body_key, crypto_meta['iv'], first_byte) + for chunk in iter(lambda: body.read(DECRYPT_CHUNK_SIZE), b''): + yield decrypt_ctxt.update(chunk) + + yield b"\r\n" + + yield b"--" + boundary + b"--" + + def response_iter(self, resp, body_key, crypto_meta, offset): + """ + Decrypts a response body. + + :param resp: application response + :param body_key: decryption key for the response body + :param crypto_meta: crypto_meta for the response body + :param offset: offset into object content at which response body starts + :return: generator for decrypted response body + """ + decrypt_ctxt = self.crypto.create_decryption_ctxt( + body_key, crypto_meta['iv'], offset) + with closing_if_possible(resp): + for chunk in resp: + yield decrypt_ctxt.update(chunk) + + def _read_crypto_meta(self, header, check): + crypto_meta = None + if (is_success(self._get_status_int()) or + self._get_status_int() in (304, 412)): + try: + crypto_meta = self.get_crypto_meta(header, check) + except EncryptionException as err: + self.logger.error('Error decrypting object: %s', err) + raise HTTPInternalServerError( + body='Error decrypting object', content_type='text/plain') + return crypto_meta + + def handle(self, req, start_response): + app_resp = self._app_call(req.environ) + + try: + put_crypto_meta = self._read_crypto_meta( + 'X-Object-Sysmeta-Crypto-Body-Meta', True) + put_keys = self.get_decryption_keys(req, put_crypto_meta) + post_crypto_meta = self._read_crypto_meta( + 'X-Object-Transient-Sysmeta-Crypto-Meta', False) + post_keys = self.get_decryption_keys(req, post_crypto_meta) + except EncryptionException as err: + self.logger.error( + "Error decrypting object: %s", + err) + raise HTTPInternalServerError( + body='Error decrypting object', + content_type='text/plain') + + if put_keys is None and post_keys is None: + # skip decryption + start_response(self._response_status, self._response_headers, + self._response_exc_info) + return app_resp + + mod_resp_headers = self.decrypt_resp_headers( + put_keys, post_keys, + update_cors_exposed=bool(req.headers.get('origin'))) + + if put_crypto_meta and req.method == 'GET' and \ + is_success(self._get_status_int()): + # 2xx response and encrypted body + body_key = self.get_unwrapped_key( + put_crypto_meta, put_keys['object']) + content_type, content_type_attrs = parse_content_type( + self._response_header_value('Content-Type')) + + if (self._get_status_int() == 206 and + content_type == 'multipart/byteranges'): + boundary = wsgi_to_bytes(dict(content_type_attrs)["boundary"]) + resp_iter = self.multipart_response_iter( + app_resp, boundary, body_key, put_crypto_meta) + else: + offset = 0 + content_range = self._response_header_value('Content-Range') + if content_range: + # Determine offset within the whole object if ranged GET + offset, end, total = parse_content_range(content_range) + resp_iter = self.response_iter( + app_resp, body_key, 
put_crypto_meta, offset) + else: + # don't decrypt body of unencrypted or non-2xx responses + resp_iter = app_resp + + mod_resp_headers = purge_crypto_sysmeta_headers(mod_resp_headers) + start_response(self._response_status, mod_resp_headers, + self._response_exc_info) + + return resp_iter + + +class DecrypterContContext(BaseDecrypterContext): + def __init__(self, decrypter, logger): + super(DecrypterContContext, self).__init__( + decrypter, 'container', logger) + + def handle(self, req, start_response): + app_resp = self._app_call(req.environ) + + if is_success(self._get_status_int()): + # only decrypt body of 2xx responses + headers = HeaderKeyDict(self._response_headers) + content_type = headers.get('content-type', '').split(';', 1)[0] + if content_type == 'application/json': + app_resp = self.process_json_resp(req, app_resp) + + start_response(self._response_status, + self._response_headers, + self._response_exc_info) + + return app_resp + + def process_json_resp(self, req, resp_iter): + """ + Parses json body listing and decrypt encrypted entries. Updates + Content-Length header with new body length and return a body iter. + """ + with closing_if_possible(resp_iter): + resp_body = b''.join(resp_iter) + body_json = json.loads(resp_body) + new_body = json.dumps([self.decrypt_obj_dict(req, obj_dict) + for obj_dict in body_json]).encode('ascii') + self.update_content_length(len(new_body)) + return [new_body] + + def decrypt_obj_dict(self, req, obj_dict): + if 'hash' in obj_dict: + # each object's etag may have been encrypted with a different key + # so fetch keys based on its crypto meta + ciphertext, crypto_meta = extract_crypto_meta(obj_dict['hash']) + bad_keys = set() + if crypto_meta: + try: + self.crypto.check_crypto_meta(crypto_meta) + keys = self.get_decryption_keys(req, crypto_meta) + # Note that symlinks (for example) may put swift paths in + # the listing ETag, so we can't just use ASCII. + obj_dict['hash'] = self.decrypt_value( + ciphertext, keys['container'], crypto_meta, + decoder=lambda x: x.decode('utf-8')) + except EncryptionException as err: + if not isinstance(err, UnknownSecretIdError) or \ + err.args[0] not in bad_keys: + # Only warn about an unknown key once per listing + self.logger.error( + "Error decrypting container listing: %s", + err) + if isinstance(err, UnknownSecretIdError): + bad_keys.add(err.args[0]) + obj_dict['hash'] = '' + return obj_dict + + +class Decrypter(object): + """Middleware for decrypting data and user metadata.""" + + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route="decrypter") + self.crypto = Crypto(conf) + + def __call__(self, env, start_response): + req = Request(env) + try: + parts = req.split_path(3, 4, True) + is_cont_or_obj_req = True + except ValueError: + is_cont_or_obj_req = False + + if not is_cont_or_obj_req: + return self.app(env, start_response) + if not valid_api_version(parts[0]): + # Not a swift request + return self.app(env, start_response) + if not check_utf8(wsgi_to_str(req.path_info), + internal=req.allow_reserved_names): + # Not a valid swift request + return self.app(env, start_response) + # TODO any other invalid paths we want to ignore?? 
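        # Illustrative note: parts is (version, account, container, obj), with
        # any missing trailing segments set to None. For a hypothetical object
        # GET /v1/AUTH_test/cont/obj, parts == ('v1', 'AUTH_test', 'cont',
        # 'obj'); for a container GET /v1/AUTH_test/cont it is
        # ('v1', 'AUTH_test', 'cont', None).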
+ + if parts[3] and req.method in ('GET', 'HEAD'): + handler = DecrypterObjContext(self, self.logger).handle + elif parts[2] and req.method == 'GET': + handler = DecrypterContContext(self, self.logger).handle + else: + # url and/or request verb is not handled by decrypter + return self.app(env, start_response) + + try: + return handler(req, start_response) + except HTTPException as err_resp: + return err_resp(env, start_response) diff --git a/swift/common/middleware/crypto/encrypter.py b/swift/common/middleware/crypto/encrypter.py new file mode 100644 index 0000000000..b33aaeaf7a --- /dev/null +++ b/swift/common/middleware/crypto/encrypter.py @@ -0,0 +1,383 @@ +# Copyright (c) 2015-2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import hashlib +import hmac +from contextlib import contextmanager + +from swift.common.constraints import check_metadata +from swift.common.http import is_success +from swift.common.middleware.crypto.crypto_utils import CryptoWSGIContext, \ + dump_crypto_meta, append_crypto_meta, Crypto +from swift.common.request_helpers import get_object_transient_sysmeta, \ + strip_user_meta_prefix, is_user_meta, update_etag_is_at_header, \ + get_container_update_override_key +from swift.common.swob import Request, Match, HTTPException, \ + HTTPUnprocessableEntity, wsgi_to_bytes, bytes_to_wsgi, normalize_etag +from swift.common.utils import get_logger, config_true_value, \ + MD5_OF_EMPTY_STRING, md5, InputProxy + + +def encrypt_header_val(crypto, value, key): + """ + Encrypt a header value using the supplied key. + + :param crypto: a Crypto instance + :param value: value to encrypt + :param key: crypto key to use + :returns: a tuple of (encrypted value, crypto_meta) where crypto_meta is a + dict of form returned by + :py:func:`~swift.common.middleware.crypto.Crypto.get_crypto_meta` + :raises ValueError: if value is empty + """ + if not value: + raise ValueError('empty value is not acceptable') + + crypto_meta = crypto.create_crypto_meta() + crypto_ctxt = crypto.create_encryption_ctxt(key, crypto_meta['iv']) + enc_val = bytes_to_wsgi(base64.b64encode( + crypto_ctxt.update(wsgi_to_bytes(value)))) + return enc_val, crypto_meta + + +def _hmac_etag(key, etag): + """ + Compute an HMAC-SHA256 using given key and etag. + + :param key: The starting key for the hash. + :param etag: The etag to hash. 
+ :returns: a Base64-encoded representation of the HMAC + """ + if not isinstance(etag, bytes): + etag = wsgi_to_bytes(etag) + result = hmac.new(key, etag, digestmod=hashlib.sha256).digest() + return base64.b64encode(result).decode() + + +class EncInputWrapper(InputProxy): + """File-like object to be swapped in for wsgi.input.""" + def __init__(self, crypto, keys, req, logger): + super().__init__(req.environ['wsgi.input']) + self.env = req.environ + self.path = req.path + self.crypto = crypto + self.body_crypto_ctxt = None + self.keys = keys + self.plaintext_md5 = None + self.ciphertext_md5 = None + self.logger = logger + self.install_footers_callback(req) + + def _init_encryption_context(self): + # do this once when body is first read + if self.body_crypto_ctxt is None: + self.body_crypto_meta = self.crypto.create_crypto_meta() + body_key = self.crypto.create_random_key() + # wrap the body key with object key + self.body_crypto_meta['body_key'] = self.crypto.wrap_key( + self.keys['object'], body_key) + self.body_crypto_meta['key_id'] = self.keys['id'] + self.body_crypto_ctxt = self.crypto.create_encryption_ctxt( + body_key, self.body_crypto_meta.get('iv')) + self.plaintext_md5 = md5(usedforsecurity=False) + self.ciphertext_md5 = md5(usedforsecurity=False) + + def install_footers_callback(self, req): + # the proxy controller will call back for footer metadata after + # body has been sent + inner_callback = req.environ.get('swift.callback.update_footers') + # remove any Etag from headers, it won't be valid for ciphertext and + # we'll send the ciphertext Etag later in footer metadata + client_etag = req.headers.pop('etag', None) + override_header = get_container_update_override_key('etag') + container_listing_etag_header = req.headers.get(override_header) + + def footers_callback(footers): + if inner_callback: + # pass on footers dict to any other callback that was + # registered before this one. It may override any footers that + # were set. + inner_callback(footers) + + plaintext_etag = None + if self.body_crypto_ctxt: + plaintext_etag = self.plaintext_md5.hexdigest() + # If client (or other middleware) supplied etag, then validate + # against plaintext etag + etag_to_check = footers.get('Etag') or client_etag + if (etag_to_check is not None and + plaintext_etag != etag_to_check): + raise HTTPUnprocessableEntity(request=Request(self.env)) + + # override any previous notion of etag with the ciphertext etag + footers['Etag'] = self.ciphertext_md5.hexdigest() + + # Encrypt the plaintext etag using the object key and persist + # as sysmeta along with the crypto parameters that were used. + encrypted_etag, etag_crypto_meta = encrypt_header_val( + self.crypto, plaintext_etag, self.keys['object']) + footers['X-Object-Sysmeta-Crypto-Etag'] = \ + append_crypto_meta(encrypted_etag, etag_crypto_meta) + footers['X-Object-Sysmeta-Crypto-Body-Meta'] = \ + dump_crypto_meta(self.body_crypto_meta) + + # Also add an HMAC of the etag for use when evaluating + # conditional requests + footers['X-Object-Sysmeta-Crypto-Etag-Mac'] = _hmac_etag( + self.keys['object'], plaintext_etag) + else: + # No data was read from body, nothing was encrypted, so don't + # set any crypto sysmeta for the body, but do re-instate any + # etag provided in inbound request if other middleware has not + # already set a value. 
+ if client_etag is not None: + footers.setdefault('Etag', client_etag) + + # When deciding on the etag that should appear in container + # listings, look for: + # * override in the footer, otherwise + # * override in the header, and finally + # * MD5 of the plaintext received + # This may be None if no override was set and no data was read. An + # override value of '' will be passed on. + container_listing_etag = footers.get( + override_header, container_listing_etag_header) + + if container_listing_etag is None: + container_listing_etag = plaintext_etag + + if (container_listing_etag and + (container_listing_etag != MD5_OF_EMPTY_STRING or + plaintext_etag)): + # Encrypt the container-listing etag using the container key + # and a random IV, and use it to override the container update + # value, with the crypto parameters appended. We use the + # container key here so that only that key is required to + # decrypt all etag values in a container listing when handling + # a container GET request. Don't encrypt an MD5_OF_EMPTY_STRING + # unless there actually was some body content, in which case + # the container-listing etag is possibly conveying some + # non-obvious information. + val, crypto_meta = encrypt_header_val( + self.crypto, container_listing_etag, + self.keys['container']) + crypto_meta['key_id'] = self.keys['id'] + footers[override_header] = \ + append_crypto_meta(val, crypto_meta) + # else: no override was set and no data was read + + req.environ['swift.callback.update_footers'] = footers_callback + + def chunk_update(self, chunk, eof, *args, **kwargs): + if chunk: + self._init_encryption_context() + self.plaintext_md5.update(chunk) + # Encrypt one chunk at a time + ciphertext = self.body_crypto_ctxt.update(chunk) + self.ciphertext_md5.update(ciphertext) + return ciphertext + + return chunk + + +class EncrypterObjContext(CryptoWSGIContext): + def __init__(self, encrypter, logger): + super(EncrypterObjContext, self).__init__( + encrypter, 'object', logger) + + def _check_headers(self, req): + # Check the user-metadata length before encrypting and encoding + error_response = check_metadata(req, self.server_type) + if error_response: + raise error_response + + def encrypt_user_metadata(self, req, keys): + """ + Encrypt user-metadata header values. Replace each x-object-meta- + user metadata header with a corresponding + x-object-transient-sysmeta-crypto-meta- header which has the + crypto metadata required to decrypt appended to the encrypted value. + + :param req: a swob Request + :param keys: a dict of encryption keys + """ + prefix = get_object_transient_sysmeta('crypto-meta-') + user_meta_headers = [h for h in req.headers.items() if + is_user_meta(self.server_type, h[0]) and h[1]] + crypto_meta = None + for name, val in user_meta_headers: + short_name = strip_user_meta_prefix(self.server_type, name) + new_name = prefix + short_name + enc_val, crypto_meta = encrypt_header_val( + self.crypto, val, keys[self.server_type]) + req.headers[new_name] = append_crypto_meta(enc_val, crypto_meta) + req.headers.pop(name) + # store a single copy of the crypto meta items that are common to all + # encrypted user metadata independently of any such meta that is stored + # with the object body because it might change on a POST. This is done + # for future-proofing - the meta stored here is not currently used + # during decryption. 
+ if crypto_meta: + meta = dump_crypto_meta({'cipher': crypto_meta['cipher'], + 'key_id': keys['id']}) + req.headers[get_object_transient_sysmeta('crypto-meta')] = meta + + def handle_put(self, req, start_response): + self._check_headers(req) + keys = self.get_keys(req.environ, required=['object', 'container']) + self.encrypt_user_metadata(req, keys) + + enc_input_proxy = EncInputWrapper(self.crypto, keys, req, self.logger) + req.environ['wsgi.input'] = enc_input_proxy + + resp = self._app_call(req.environ) + + # If an etag is in the response headers and a plaintext etag was + # calculated, then overwrite the response value with the plaintext etag + # provided it matches the ciphertext etag. If it does not match then do + # not overwrite and allow the response value to return to client. + mod_resp_headers = self._response_headers + if (is_success(self._get_status_int()) and + enc_input_proxy.plaintext_md5): + plaintext_etag = enc_input_proxy.plaintext_md5.hexdigest() + ciphertext_etag = enc_input_proxy.ciphertext_md5.hexdigest() + mod_resp_headers = [ + (h, v if (h.lower() != 'etag' or + normalize_etag(v) != ciphertext_etag) + else plaintext_etag) + for h, v in mod_resp_headers] + + start_response(self._response_status, mod_resp_headers, + self._response_exc_info) + return resp + + def handle_post(self, req, start_response): + """ + Encrypt the new object headers with a new iv and the current crypto. + Note that an object may have encrypted headers while the body may + remain unencrypted. + """ + self._check_headers(req) + keys = self.get_keys(req.environ) + self.encrypt_user_metadata(req, keys) + + resp = self._app_call(req.environ) + start_response(self._response_status, self._response_headers, + self._response_exc_info) + return resp + + @contextmanager + def _mask_conditional_etags(self, req, header_name): + """ + Calculate HMACs of etags in header value and append to existing list. + The HMACs are calculated in the same way as was done for the object + plaintext etag to generate the value of + X-Object-Sysmeta-Crypto-Etag-Mac when the object was PUT. The object + server can therefore use these HMACs to evaluate conditional requests. + HMACs of the etags are appended for the current root secrets and + historic root secrets because it is not known which of them may have + been used to generate the on-disk etag HMAC. + + The existing etag values are left in the list of values to match in + case the object was not encrypted when it was PUT. It is unlikely that + a masked etag value would collide with an unmasked value. 
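        For example (illustrative), an inbound header ``If-Match: "<etag>"``
        is sent on to the object server as
        ``If-Match: "<etag>", "<base64 HMAC-SHA256 of the etag>"`` (one HMAC
        per root secret), and the original header value is restored once the
        request has been handled.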
+ + :param req: an instance of swob.Request + :param header_name: name of header that has etags to mask + :return: True if any etags were masked, False otherwise + """ + masked = False + old_etags = req.headers.get(header_name) + if old_etags: + all_keys = self.get_multiple_keys(req.environ) + new_etags = [] + for etag in Match(old_etags).tags: + if etag == '*': + new_etags.append(etag) + continue + new_etags.append('"%s"' % etag) + for keys in all_keys: + masked_etag = _hmac_etag(keys['object'], etag) + new_etags.append('"%s"' % masked_etag) + masked = True + + req.headers[header_name] = ', '.join(new_etags) + + try: + yield masked + finally: + if old_etags: + req.headers[header_name] = old_etags + + def handle_get_or_head(self, req, start_response): + with self._mask_conditional_etags(req, 'If-Match') as masked1: + with self._mask_conditional_etags(req, 'If-None-Match') as masked2: + if masked1 or masked2: + update_etag_is_at_header( + req, 'X-Object-Sysmeta-Crypto-Etag-Mac') + resp = self._app_call(req.environ) + start_response(self._response_status, self._response_headers, + self._response_exc_info) + return resp + + +class Encrypter(object): + """Middleware for encrypting data and user metadata. + + By default all PUT or POST'ed object data and/or metadata will be + encrypted. Encryption of new data and/or metadata may be disabled by + setting the ``disable_encryption`` option to True. However, this middleware + should remain in the pipeline in order for existing encrypted data to be + read. + """ + + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route="encrypter") + self.crypto = Crypto(conf) + self.disable_encryption = config_true_value( + conf.get('disable_encryption', 'false')) + + def __call__(self, env, start_response): + # If override is set in env, then just pass along + if config_true_value(env.get('swift.crypto.override')): + return self.app(env, start_response) + + req = Request(env) + + if self.disable_encryption and req.method in ('PUT', 'POST'): + return self.app(env, start_response) + try: + req.split_path(4, 4, True) + is_object_request = True + except ValueError: + is_object_request = False + if not is_object_request: + return self.app(env, start_response) + + if req.method in ('GET', 'HEAD'): + handler = EncrypterObjContext(self, self.logger).handle_get_or_head + elif req.method == 'PUT': + handler = EncrypterObjContext(self, self.logger).handle_put + elif req.method == 'POST': + handler = EncrypterObjContext(self, self.logger).handle_post + else: + # anything else + return self.app(env, start_response) + + try: + return handler(req, start_response) + except HTTPException as err_resp: + return err_resp(env, start_response) diff --git a/swift/common/middleware/crypto/keymaster.py b/swift/common/middleware/crypto/keymaster.py new file mode 100644 index 0000000000..cea9fc62ff --- /dev/null +++ b/swift/common/middleware/crypto/keymaster.py @@ -0,0 +1,385 @@ +# Copyright (c) 2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import hashlib +import hmac + +from swift.common.exceptions import UnknownSecretIdError +from swift.common.middleware.crypto.crypto_utils import CRYPTO_KEY_CALLBACK +from swift.common.swob import Request, HTTPException, wsgi_to_str, str_to_wsgi +from swift.common.utils import readconf, strict_b64decode, get_logger, \ + split_path, load_multikey_opts +from swift.common.wsgi import WSGIContext + + +class KeyMasterContext(WSGIContext): + """ + The simple scheme for key derivation is as follows: every path is + associated with a key, where the key is derived from the path itself in a + deterministic fashion such that the key does not need to be stored. + Specifically, the key for any path is an HMAC of a root key and the path + itself, calculated using an SHA256 hash function:: + + = HMAC_SHA256(, ) + """ + def __init__(self, keymaster, account, container, obj, + meta_version_to_write='2'): + """ + :param keymaster: a Keymaster instance + :param account: account name + :param container: container name + :param obj: object name + """ + super(KeyMasterContext, self).__init__(keymaster.app) + self.keymaster = keymaster + self.account = account + self.container = container + self.obj = obj + self._keys = {} + self.alternate_fetch_keys = None + self.meta_version_to_write = meta_version_to_write + + def _make_key_id(self, path, secret_id, version): + if version in ('1', '2'): + path = str_to_wsgi(path) + key_id = {'v': version, 'path': path} + if secret_id: + # stash secret_id so that decrypter can pass it back to get the + # same keys + key_id['secret_id'] = secret_id + return key_id + + def fetch_crypto_keys(self, key_id=None, *args, **kwargs): + """ + Setup container and object keys based on the request path. + + Keys are derived from request path. The 'id' entry in the results dict + includes the part of the path used to derive keys. Other keymaster + implementations may use a different strategy to generate keys and may + include a different type of 'id', so callers should treat the 'id' as + opaque keymaster-specific data. + + :param key_id: if given this should be a dict with the items included + under the ``id`` key of a dict returned by this method. + :returns: A dict containing encryption keys for 'object' and + 'container', and entries 'id' and 'all_ids'. The 'all_ids' entry is a + list of key id dicts for all root secret ids including the one used + to generate the returned keys. + """ + if key_id: + secret_id = key_id.get('secret_id') + version = key_id['v'] + if version not in ('1', '2', '3'): + raise ValueError('Unknown key_id version: %s' % version) + + if version == '1' and not key_id['path'].startswith( + '/' + self.account + '/'): + # Well shoot. This was the bug that made us notice we needed + # a v2! Hope the current account/container was the original! 
+ key_acct, key_cont, key_obj = ( + self.account, self.container, key_id['path']) + else: + key_acct, key_cont, key_obj = split_path( + key_id['path'], 1, 3, True) + + check_path = ( + self.account, self.container or key_cont, self.obj or key_obj) + if version in ('1', '2') and ( + key_acct, key_cont, key_obj) != check_path: + # Older py3 proxies may have written down crypto meta as WSGI + # strings; we still need to be able to read that + try: + alt_path = tuple( + part.encode('latin1').decode('utf-8') + for part in (key_acct, key_cont, key_obj)) + except UnicodeError: + # Well, it was worth a shot + pass + else: + if check_path == alt_path or ( + check_path[:2] == alt_path[:2] and not self.obj): + # This object is affected by bug #1888037 + key_acct, key_cont, key_obj = alt_path + + if (key_acct, key_cont, key_obj) != check_path: + # Pipeline may have been misconfigured, with copy right of + # encryption. In that case, path in meta may not be the + # request path. + self.keymaster.logger.info( + "Path stored in meta (%r) does not match path from " + "request (%r)! Using path from meta.", + key_id['path'], + '/' + '/'.join(x for x in [ + self.account, self.container, self.obj] if x)) + else: + secret_id = self.keymaster.active_secret_id + # v1 had a bug where we would claim the path was just the object + # name if the object started with a slash. + # v1 and v2 had a bug on py3 where we'd write the path in meta as + # a WSGI string (ie, as Latin-1 chars decoded from UTF-8 bytes). + # Bump versions to establish that we can trust the path. + version = self.meta_version_to_write + key_acct, key_cont, key_obj = ( + self.account, self.container, self.obj) + + if (secret_id, version) in self._keys: + return self._keys[(secret_id, version)] + + keys = {} + account_path = '/' + key_acct + + try: + # self.account/container/obj reflect the level of the *request*, + # which may be different from the level of the key_id-path. Only + # fetch the keys that the request needs. + if self.container: + path = account_path + '/' + key_cont + keys['container'] = self.keymaster.create_key( + path, secret_id=secret_id) + + if self.obj: + if key_obj.startswith('/') and version == '1': + path = key_obj + else: + path = path + '/' + key_obj + keys['object'] = self.keymaster.create_key( + path, secret_id=secret_id) + + # For future-proofing include a keymaster version number and + # the path used to derive keys in the 'id' entry of the + # results. The encrypter will persist this as part of the + # crypto-meta for encrypted data and metadata. If we ever + # change the way keys are generated then the decrypter could + # pass the persisted 'id' value when it calls fetch_crypto_keys + # to inform the keymaster as to how that particular data or + # metadata had its keys generated. Currently we have no need to + # do that, so we are simply persisting this information for + # future use. + keys['id'] = self._make_key_id(path, secret_id, version) + # pass back a list of key id dicts for all other secret ids in + # case the caller is interested, in which case the caller can + # call this method again for different secret ids; this avoided + # changing the return type of the callback or adding another + # callback. Note that the caller should assume no knowledge of + # the content of these key id dicts. 
+ keys['all_ids'] = [self._make_key_id(path, id_, version) + for id_ in self.keymaster.root_secret_ids] + if self.alternate_fetch_keys: + alternate_keys = self.alternate_fetch_keys( + key_id=None, *args, **kwargs) + keys['all_ids'].extend(alternate_keys.get('all_ids', [])) + + self._keys[(secret_id, version)] = keys + + return keys + except UnknownSecretIdError: + if self.alternate_fetch_keys: + return self.alternate_fetch_keys(key_id, *args, **kwargs) + raise + + def handle_request(self, req, start_response): + self.alternate_fetch_keys = req.environ.get(CRYPTO_KEY_CALLBACK) + req.environ[CRYPTO_KEY_CALLBACK] = self.fetch_crypto_keys + resp = self._app_call(req.environ) + start_response(self._response_status, self._response_headers, + self._response_exc_info) + return resp + + +class BaseKeyMaster(object): + """Base middleware for providing encryption keys. + + This provides some basic helpers for: + + - loading from a separate config path, + - deriving keys based on path, and + - installing a ``swift.callback.fetch_crypto_keys`` hook + in the request environment. + + Subclasses should define ``log_route``, ``keymaster_opts``, and + ``keymaster_conf_section`` attributes, and implement the + ``_get_root_secret`` function. + """ + @property + def log_route(self): + raise NotImplementedError + + @property + def keymaster_opts(self): + raise NotImplementedError + + @property + def keymaster_conf_section(self): + raise NotImplementedError + + def _get_root_secret(self, conf): + raise NotImplementedError + + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route=self.log_route) + self.keymaster_config_path = conf.get('keymaster_config_path') + conf = self._load_keymaster_config_file(conf) + + # The _get_root_secret() function is overridden by other keymasters + # which may historically only return a single value + self._root_secrets = self._get_root_secret(conf) + if not isinstance(self._root_secrets, dict): + self._root_secrets = {None: self._root_secrets} + self.active_secret_id = conf.get('active_root_secret_id') or None + if self.active_secret_id not in self._root_secrets: + raise ValueError('No secret loaded for active_root_secret_id %s' % + self.active_secret_id) + for secret_id, secret in self._root_secrets.items(): + if not isinstance(secret, bytes): + raise ValueError('Secret with id %s is %s, not bytes' % ( + secret_id, type(secret))) + + self.meta_version_to_write = conf.get('meta_version_to_write') or '2' + if self.meta_version_to_write not in ('1', '2', '3'): + raise ValueError('Unknown/unsupported metadata version: %r' % + self.meta_version_to_write) + + @property + def root_secret(self): + # Returns the default root secret; this is here for historical reasons + # to support tests and any third party code that might have used it + return self._root_secrets.get(self.active_secret_id) + + @property + def root_secret_ids(self): + # Only sorted to simplify testing + return sorted(self._root_secrets.keys(), key=lambda x: x or '') + + def _load_keymaster_config_file(self, conf): + if not self.keymaster_config_path: + return conf + + # Keymaster options specified in the filter section would be ignored if + # a separate keymaster config file is specified. To avoid confusion, + # prohibit them existing in the filter section. 
+ bad_opts = [] + for opt in conf: + for km_opt in self.keymaster_opts: + if ((km_opt.endswith('*') and opt.startswith(km_opt[:-1])) or + opt == km_opt): + bad_opts.append(opt) + if bad_opts: + raise ValueError('keymaster_config_path is set, but there ' + 'are other config options specified: %s' % + ", ".join(bad_opts)) + return readconf(self.keymaster_config_path, + self.keymaster_conf_section) + + def __call__(self, env, start_response): + req = Request(env) + + try: + parts = [wsgi_to_str(part) for part in req.split_path(2, 4, True)] + except ValueError: + return self.app(env, start_response) + + if req.method in ('PUT', 'POST', 'GET', 'HEAD'): + # handle only those request methods that may require keys + km_context = KeyMasterContext( + self, *parts[1:], + meta_version_to_write=self.meta_version_to_write) + try: + return km_context.handle_request(req, start_response) + except HTTPException as err_resp: + return err_resp(env, start_response) + + # anything else + return self.app(env, start_response) + + def create_key(self, path, secret_id=None): + """ + Creates an encryption key that is unique for the given path. + + :param path: the (WSGI string) path of the resource being encrypted. + :param secret_id: the id of the root secret from which the key should + be derived. + :return: an encryption key. + :raises UnknownSecretIdError: if the secret_id is not recognised. + """ + try: + key = self._root_secrets[secret_id] + except KeyError: + self.logger.warning('Unrecognised secret id: %s' % secret_id) + raise UnknownSecretIdError(secret_id) + else: + path = path.encode('utf-8') + return hmac.new(key, path, digestmod=hashlib.sha256).digest() + + +class KeyMaster(BaseKeyMaster): + """Middleware for providing encryption keys. + + The middleware requires its encryption root secret to be set. This is the + root secret from which encryption keys are derived. This must be set before + first use to a value that is at least 256 bits. The security of all + encrypted data critically depends on this key, therefore it should be set + to a high-entropy value. For example, a suitable value may be obtained by + generating a 32 byte (or longer) value using a cryptographically secure + random number generator. Changing the root secret is likely to result in + data loss. + """ + log_route = 'keymaster' + keymaster_opts = ('encryption_root_secret*', 'active_root_secret_id') + keymaster_conf_section = 'keymaster' + + def _get_root_secret(self, conf): + """ + This keymaster requires ``encryption_root_secret[_id]`` options to be + set. At least one must be set before first use to a value that is a + base64 encoding of at least 32 bytes. The encryption root secrets are + specified in either proxy-server.conf, or in an external file + referenced from proxy-server.conf using ``keymaster_config_path``. 
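Two practical points follow from the code above: ``create_key`` derives each per-path key as HMAC-SHA256 over the resource path keyed with the selected root secret, and a root secret must be a base64 encoding of at least 32 bytes. A hedged sketch of both, using a throwaway secret generated on the spot::

    import base64
    import hashlib
    import hmac
    import os

    # Candidate encryption_root_secret value: 32 random bytes, base64-encoded.
    root_secret_b64 = base64.b64encode(os.urandom(32))
    root_secret = base64.b64decode(root_secret_b64)
    assert len(root_secret) >= 32

    # Equivalent of create_key() for an object-level path:
    path = '/AUTH_test/container/object'.encode('utf-8')
    object_key = hmac.new(root_secret, path,
                          digestmod=hashlib.sha256).digest()
    assert len(object_key) == 32  # SHA-256 digest length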
+ + :param conf: the keymaster config section from proxy-server.conf + :type conf: dict + + :return: a dict mapping secret ids to encryption root secret binary + bytes + :rtype: dict + """ + root_secrets = {} + for opt, secret_id, value in load_multikey_opts( + conf, 'encryption_root_secret', allow_none_key=True): + try: + secret = self._decode_root_secret(value) + except ValueError: + raise ValueError( + '%s option in %s must be a base64 encoding of at ' + 'least 32 raw bytes' % + (opt, self.keymaster_config_path or 'proxy-server.conf')) + root_secrets[secret_id] = secret + return root_secrets + + def _decode_root_secret(self, b64_root_secret): + binary_root_secret = strict_b64decode(b64_root_secret, + allow_line_breaks=True) + if len(binary_root_secret) < 32: + raise ValueError + return binary_root_secret + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def keymaster_filter(app): + return KeyMaster(app, conf) + + return keymaster_filter diff --git a/swift/common/middleware/crypto/kmip_keymaster.py b/swift/common/middleware/crypto/kmip_keymaster.py new file mode 100644 index 0000000000..9bc9b79964 --- /dev/null +++ b/swift/common/middleware/crypto/kmip_keymaster.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2018 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import os + +from swift.common.middleware.crypto import keymaster +from swift.common.utils import LogLevelFilter, load_multikey_opts + +from kmip.pie.client import ProxyKmipClient + +""" +This middleware enables Swift to fetch a root secret from a KMIP service. +The root secret is expected to have been previously created in the KMIP service +and is referenced by its unique identifier. The secret should be an AES-256 +symmetric key. + +To use this middleware, edit the swift proxy-server.conf to insert the +middleware in the wsgi pipeline, replacing any other keymaster middleware:: + + [pipeline:main] + pipeline = catch_errors gatekeeper healthcheck proxy-logging \ + kmip_keymaster encryption proxy-logging proxy-server + +and add a new filter section:: + + [filter:kmip_keymaster] + use = egg:swift#kmip_keymaster + key_id = + key_id_ = + active_root_secret_id = + host = + port = + certfile = /path/to/client/cert.pem + keyfile = /path/to/client/key.pem + ca_certs = /path/to/server/cert.pem + username = + password = + +Apart from ``use``, ``key_id*``, ``active_root_secret_id`` the options are +as defined for a PyKMIP client. The authoritative definition of these options +can be found at `https://pykmip.readthedocs.io/en/latest/client.html`_ + +The value of each ``key_id*`` option should be a unique identifier for a secret +to be retrieved from the KMIP service. Any of these secrets may be used for +*decryption*. + +The value of the ``active_root_secret_id`` option should be the ``secret_id`` +for the secret that should be used for all new *encryption*. If not specified, +the ``key_id`` secret will be used. + +.. 
note:: + + To ensure there is no loss of data availability, deploying a new key to + your cluster requires a two-stage config change. First, add the new key + to the ``key_id_`` option and restart the proxy-server. Do this + for all proxies. Next, set the ``active_root_secret_id`` option to the + new secret id and restart the proxy. Again, do this for all proxies. This + process ensures that all proxies will have the new key available for + *decryption* before any proxy uses it for *encryption*. + +The keymaster configuration can alternatively be defined in a separate config +file by using the ``keymaster_config_path`` option:: + + [filter:kmip_keymaster] + use = egg:swift#kmip_keymaster + keymaster_config_path=/etc/swift/kmip_keymaster.conf + +In this case, the ``filter:kmip_keymaster`` section should contain no other +options than ``use`` and ``keymaster_config_path``. All other options should be +defined in the separate config file in a section named ``kmip_keymaster``. For +example:: + + [kmip_keymaster] + key_id = 1234567890 + key_id_foo = 2468024680 + key_id_bar = 1357913579 + active_root_secret_id = foo + host = 127.0.0.1 + port = 5696 + certfile = /etc/swift/kmip_client.crt + keyfile = /etc/swift/kmip_client.key + ca_certs = /etc/swift/kmip_server.crt + username = swift + password = swift_password +""" + + +class KmipKeyMaster(keymaster.BaseKeyMaster): + log_route = 'kmip_keymaster' + keymaster_opts = ('host', 'port', 'certfile', 'keyfile', + 'ca_certs', 'username', 'password', + 'active_root_secret_id', 'key_id*') + keymaster_conf_section = 'kmip_keymaster' + + def _load_keymaster_config_file(self, conf): + conf = super(KmipKeyMaster, self)._load_keymaster_config_file(conf) + if self.keymaster_config_path: + section = self.keymaster_conf_section + else: + # __name__ is just the filter name, not the whole section name. + # Luckily, PasteDeploy only uses the one prefix for filters. + section = 'filter:' + conf['__name__'] + + if os.path.isdir(conf['__file__']): + raise ValueError( + 'KmipKeyMaster config cannot be read from conf dir %s. Use ' + 'keymaster_config_path option in the proxy server config to ' + 'specify a config file.') + + # Make sure we've got the kmip log handler set up before + # we instantiate a client + kmip_logger = logging.getLogger('kmip') + for handler in self.logger.logger.handlers: + kmip_logger.addHandler(handler) + + debug_filter = LogLevelFilter(logging.DEBUG) + for name in ( + # The kmip_protocol logger includes hex-encoded data off the + # wire, which may include key material!! We *NEVER* want that + # enabled. + 'kmip.services.server.kmip_protocol', + # The config_helper logger includes any password that may be + # provided, which doesn't seem great either. 
+ 'kmip.core.config_helper', + ): + logging.getLogger(name).addFilter(debug_filter) + + self.proxy_kmip_client = ProxyKmipClient( + config=section, + config_file=conf['__file__'] + ) + return conf + + def _get_root_secret(self, conf): + multikey_opts = load_multikey_opts(conf, 'key_id', allow_none_key=True) + kmip_to_secret = {} + root_secrets = {} + with self.proxy_kmip_client as client: + for opt, secret_id, kmip_id in multikey_opts: + if kmip_id in kmip_to_secret: + # Save some round trips if there are multiple + # secret_ids for a single kmip_id + root_secrets[secret_id] = root_secrets[ + kmip_to_secret[kmip_id]] + continue + secret = client.get(kmip_id) + algo = secret.cryptographic_algorithm.name + length = secret.cryptographic_length + if (algo, length) != ('AES', 256): + raise ValueError( + 'Expected key %s to be an AES-256 key, not %s-%d' % ( + kmip_id, algo, length)) + root_secrets[secret_id] = secret.value + kmip_to_secret.setdefault(kmip_id, secret_id) + return root_secrets + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def keymaster_filter(app): + return KmipKeyMaster(app, conf) + + return keymaster_filter diff --git a/swift/common/middleware/crypto/kms_keymaster.py b/swift/common/middleware/crypto/kms_keymaster.py new file mode 100644 index 0000000000..a44a219d7f --- /dev/null +++ b/swift/common/middleware/crypto/kms_keymaster.py @@ -0,0 +1,122 @@ +# Copyright (c) 2016 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from castellan import key_manager, options +from castellan.common.credentials import keystone_password +from oslo_config import cfg +from swift.common.middleware.crypto.keymaster import BaseKeyMaster +from swift.common.utils import load_multikey_opts + + +class KmsKeyMaster(BaseKeyMaster): + """Middleware for retrieving a encryption root secret from an external KMS. + + The middleware accesses the encryption root secret from an external key + management system (KMS), e.g., a Barbican service, using Castellan. To be + able to do so, the appropriate configuration options shall be set in the + proxy-server.conf file, or in the configuration pointed to using the + keymaster_config_path configuration value in the proxy-server.conf file. + """ + log_route = 'kms_keymaster' + keymaster_opts = ('username', 'password', 'project_name', + 'user_domain_name', 'project_domain_name', + 'user_id', 'user_domain_id', 'trust_id', + 'domain_id', 'domain_name', 'project_id', + 'project_domain_id', 'reauthenticate', + 'auth_endpoint', 'api_class', 'key_id*', + 'barbican_endpoint', 'barbican_region_name', + 'active_root_secret_id') + keymaster_conf_section = 'kms_keymaster' + + def _get_root_secret(self, conf): + """ + Retrieve the root encryption secret from an external key management + system using Castellan. 
+ + :param conf: the keymaster config section from proxy-server.conf + :type conf: dict + + :return: the encryption root secret binary bytes + :rtype: bytearray + """ + ctxt = keystone_password.KeystonePassword( + auth_url=conf.get('auth_endpoint'), + username=conf.get('username'), + password=conf.get('password'), + project_name=conf.get('project_name'), + user_domain_name=conf.get('user_domain_name'), + project_domain_name=conf.get( + 'project_domain_name'), + user_id=conf.get('user_id'), + user_domain_id=conf.get('user_domain_id'), + trust_id=conf.get('trust_id'), + domain_id=conf.get('domain_id'), + domain_name=conf.get('domain_name'), + project_id=conf.get('project_id'), + project_domain_id=conf.get('project_domain_id'), + reauthenticate=conf.get('reauthenticate')) + oslo_conf = cfg.ConfigOpts() + options.set_defaults( + oslo_conf, auth_endpoint=conf.get('auth_endpoint'), + barbican_endpoint=conf.get('barbican_endpoint'), + api_class=conf.get('api_class') + ) + # Set barbican_region_name if provided in config + # This is used by Castellan's BarbicanKeyManager for endpoint discovery + if conf.get('barbican_region_name'): + oslo_conf.set_default('barbican_region_name', + conf.get('barbican_region_name'), + group='barbican') + options.enable_logging() + manager = key_manager.API(oslo_conf) + + root_secrets = {} + for opt, secret_id, key_id in load_multikey_opts( + conf, 'key_id', allow_none_key=True): + key = manager.get(ctxt, key_id) + if key is None: + raise ValueError("Retrieval of encryption root secret with " + "key_id '%s' returned None." + % (key_id, )) + try: + if (key.bit_length < 256) or (key.algorithm.lower() != "aes"): + raise ValueError('encryption root secret stored in the ' + 'external KMS must be an AES key of at ' + 'least 256 bits (provided key ' + 'length: %d, provided key algorithm: %s)' + % (key.bit_length, key.algorithm)) + if (key.format != 'RAW'): + raise ValueError('encryption root secret stored in the ' + 'external KMS must be in RAW format and ' + 'not e.g., as a base64 encoded string ' + '(format of key with uuid %s: %s)' % + (key_id, key.format)) + except Exception: + raise ValueError("Secret with key_id '%s' is not a symmetric " + "key (type: %s)" % (key_id, str(type(key)))) + secret = key.get_encoded() + if not isinstance(secret, bytes): + secret = secret.encode('utf-8') + root_secrets[secret_id] = secret + return root_secrets + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def kms_keymaster_filter(app): + return KmsKeyMaster(app, conf) + + return kms_keymaster_filter diff --git a/swift/common/middleware/dlo.py b/swift/common/middleware/dlo.py new file mode 100644 index 0000000000..1b41ac9091 --- /dev/null +++ b/swift/common/middleware/dlo.py @@ -0,0 +1,470 @@ +# Copyright (c) 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Middleware that will provide Dynamic Large Object (DLO) support. 
+ +--------------- +Using ``swift`` +--------------- + +The quickest way to try out this feature is use the ``swift`` Swift Tool +included with the `python-swiftclient`_ library. You can use the ``-S`` +option to specify the segment size to use when splitting a large file. For +example:: + + swift upload test_container -S 1073741824 large_file + +This would split the large_file into 1G segments and begin uploading those +segments in parallel. Once all the segments have been uploaded, ``swift`` will +then create the manifest file so the segments can be downloaded as one. + +So now, the following ``swift`` command would download the entire large +object:: + + swift download test_container large_file + +``swift`` command uses a strict convention for its segmented object +support. In the above example it will upload all the segments into a +second container named test_container_segments. These segments will +have names like large_file/1290206778.25/21474836480/00000000, +large_file/1290206778.25/21474836480/00000001, etc. + +The main benefit for using a separate container is that the main container +listings will not be polluted with all the segment names. The reason for using +the segment name format of /// is so that an +upload of a new file with the same name won't overwrite the contents of the +first until the last moment when the manifest file is updated. + +``swift`` will manage these segment files for you, deleting old segments on +deletes and overwrites, etc. You can override this behavior with the +``--leave-segments`` option if desired; this is useful if you want to have +multiple versions of the same large object available. + +.. _`python-swiftclient`: http://github.com/openstack/python-swiftclient + +---------- +Direct API +---------- + +You can also work with the segments and manifests directly with HTTP +requests instead of having ``swift`` do that for you. You can just +upload the segments like you would any other object and the manifest +is just a zero-byte (not enforced) file with an extra +``X-Object-Manifest`` header. + +All the object segments need to be in the same container, have a common object +name prefix, and sort in the order in which they should be concatenated. +Object names are sorted lexicographically as UTF-8 byte strings. +They don't have to be in the same container as the manifest file will be, which +is useful to keep container listings clean as explained above with ``swift``. + +The manifest file is simply a zero-byte (not enforced) file with the extra +``X-Object-Manifest: /`` header, where ```` is +the container the object segments are in and ```` is the common prefix +for all the segments. + +It is best to upload all the segments first and then create or update the +manifest. In this way, the full object won't be available for downloading +until the upload is complete. Also, you can upload a new set of segments to +a second location and then update the manifest to point to this new location. +During the upload of the new segments, the original manifest will still be +available to download the first set of segments. + +.. note:: + + When updating a manifest object using a POST request, a + ``X-Object-Manifest`` header must be included for the object to + continue to behave as a manifest object. + + The manifest file should have no content. However, this is not enforced. 
+ If the manifest path itself conforms to container/prefix specified in + ``X-Object-Manifest``, and if manifest has some content/data in it, it + would also be considered as segment and manifest's content will be part of + the concatenated GET response. The order of concatenation follows the usual + DLO logic which is - the order of concatenation adheres to order returned + when segment names are sorted. + + +Here's an example using ``curl`` with tiny 1-byte segments:: + + # First, upload the segments + curl -X PUT -H 'X-Auth-Token: ' \ + http:///container/myobject/00000001 --data-binary '1' + curl -X PUT -H 'X-Auth-Token: ' \ + http:///container/myobject/00000002 --data-binary '2' + curl -X PUT -H 'X-Auth-Token: ' \ + http:///container/myobject/00000003 --data-binary '3' + + # Next, create the manifest file + curl -X PUT -H 'X-Auth-Token: ' \ + -H 'X-Object-Manifest: container/myobject/' \ + http:///container/myobject --data-binary '' + + # And now we can download the segments as a single object + curl -H 'X-Auth-Token: ' \ + http:///container/myobject +""" + +import json + +from swift.common import constraints +from swift.common.exceptions import ListingIterError, SegmentError +from swift.common.http import is_success +from swift.common.swob import Request, Response, HTTPException, \ + HTTPRequestedRangeNotSatisfiable, HTTPBadRequest, HTTPConflict, \ + str_to_wsgi, wsgi_to_str, wsgi_quote, wsgi_unquote, normalize_etag +from swift.common.utils import get_logger, \ + RateLimitedIterator, quote, close_if_possible, closing_if_possible, \ + drain_and_close, md5 +from swift.common.request_helpers import SegmentedIterable, \ + update_ignore_range_header +from swift.common.wsgi import WSGIContext, make_subrequest, load_app_config + + +class GetContext(WSGIContext): + def __init__(self, dlo, logger): + super(GetContext, self).__init__(dlo.app) + self.dlo = dlo + self.logger = logger + + def _get_container_listing(self, req, version, account, container, + prefix, marker=''): + ''' + :param version: whatever + :param account: native + :param container: native + :param prefix: native + :param marker: native + ''' + con_req = make_subrequest( + req.environ, + path=wsgi_quote('/'.join([ + '', str_to_wsgi(version), + str_to_wsgi(account), str_to_wsgi(container)])), + method='GET', + headers={'x-auth-token': req.headers.get('x-auth-token')}, + agent=('%(orig)s ' + 'DLO MultipartGET'), swift_source='DLO') + con_req.query_string = 'prefix=%s' % quote(prefix) + if marker: + con_req.query_string += '&marker=%s' % quote(marker) + + con_resp = con_req.get_response(self.dlo.app) + if not is_success(con_resp.status_int): + if req.method == 'HEAD': + con_resp.body = b'' + return con_resp, None + with closing_if_possible(con_resp.app_iter): + return None, json.loads(b''.join(con_resp.app_iter)) + + def _segment_listing_iterator(self, req, version, account, container, + prefix, segments, first_byte=None, + last_byte=None): + ''' + :param req: upstream request + :param version: native + :param account: native + :param container: native + :param prefix: native + :param segments: array of dicts, with native strings + :param first_byte: number + :param last_byte: number + ''' + # It's sort of hokey that this thing takes in the first page of + # segments as an argument, but we need to compute the etag and content + # length from the first page, and it's better to have a hokey + # interface than to make redundant requests. 
+ if first_byte is None: + first_byte = 0 + if last_byte is None: + last_byte = float("inf") + + while True: + for segment in segments: + seg_length = int(segment['bytes']) + + if first_byte >= seg_length: + # don't need any bytes from this segment + first_byte = max(first_byte - seg_length, -1) + last_byte = max(last_byte - seg_length, -1) + continue + elif last_byte < 0: + # no bytes are needed from this or any future segment + break + + seg_name = segment['name'] + + # We deliberately omit the etag and size here; + # SegmentedIterable will check size and etag if + # specified, but we don't want it to. DLOs only care + # that the objects' names match the specified prefix. + # SegmentedIterable will instead check that the data read + # from each segment matches the response headers. + _path = "/".join(["", version, account, container, seg_name]) + _first = None if first_byte <= 0 else first_byte + _last = None if last_byte >= seg_length - 1 else last_byte + yield { + 'path': _path, + 'first_byte': _first, + 'last_byte': _last + } + + first_byte = max(first_byte - seg_length, -1) + last_byte = max(last_byte - seg_length, -1) + + if len(segments) < constraints.CONTAINER_LISTING_LIMIT: + # a short page means that we're done with the listing + break + elif last_byte < 0: + break + + marker = segments[-1]['name'] + error_response, segments = self._get_container_listing( + req, version, account, container, prefix, marker) + if error_response: + # we've already started sending the response body to the + # client, so all we can do is raise an exception to make the + # WSGI server close the connection early + close_if_possible(error_response.app_iter) + raise ListingIterError( + "Got status %d listing container /%s/%s" % + (error_response.status_int, account, container)) + + def get_or_head_response(self, req, x_object_manifest): + ''' + :param req: user's request + :param x_object_manifest: as unquoted, native string + ''' + response_headers = self._response_headers + + container, obj_prefix = x_object_manifest.split('/', 1) + + version, account, _junk = req.split_path(2, 3, True) + version = wsgi_to_str(version) + account = wsgi_to_str(account) + error_response, segments = self._get_container_listing( + req, version, account, container, obj_prefix) + if error_response: + return error_response + have_complete_listing = len(segments) < \ + constraints.CONTAINER_LISTING_LIMIT + + first_byte = last_byte = None + actual_content_length = None + content_length_for_swob_range = None + if req.range and len(req.range.ranges) == 1: + content_length_for_swob_range = sum(o['bytes'] for o in segments) + + # This is a hack to handle suffix byte ranges (e.g. "bytes=-5"), + # which we can't honor unless we have a complete listing. + _junk, range_end = req.range.ranges_for_length(float("inf"))[0] + + # If this is all the segments, we know whether or not this + # range request is satisfiable. + # + # Alternately, we may not have all the segments, but this range + # falls entirely within the first page's segments, so we know + # that it is satisfiable. 
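The per-segment range arithmetic above is easier to follow with concrete numbers. Below is a standalone sketch of the same walk over an in-memory listing (no paging), assuming three 5-byte segments and a request for bytes 7-11::

    def walk(segments, first_byte, last_byte):
        # Mirrors the offset bookkeeping in _segment_listing_iterator.
        for seg in segments:
            seg_length = seg['bytes']
            if first_byte >= seg_length:
                # no bytes needed from this segment
                first_byte = max(first_byte - seg_length, -1)
                last_byte = max(last_byte - seg_length, -1)
                continue
            if last_byte < 0:
                break  # nothing needed from this or later segments
            yield (seg['name'],
                   None if first_byte <= 0 else first_byte,
                   None if last_byte >= seg_length - 1 else last_byte)
            first_byte = max(first_byte - seg_length, -1)
            last_byte = max(last_byte - seg_length, -1)

    segs = [{'name': 's1', 'bytes': 5},
            {'name': 's2', 'bytes': 5},
            {'name': 's3', 'bytes': 5}]
    print(list(walk(segs, 7, 11)))
    # -> [('s2', 2, None), ('s3', None, 1)]

That is, byte 7 of the concatenated object is offset 2 of the second segment, and byte 11 is offset 1 of the third.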
+ if (have_complete_listing + or range_end < content_length_for_swob_range): + byteranges = req.range.ranges_for_length( + content_length_for_swob_range) + if not byteranges: + headers = {'Accept-Ranges': 'bytes'} + if have_complete_listing: + headers['Content-Range'] = 'bytes */%d' % ( + content_length_for_swob_range, ) + return HTTPRequestedRangeNotSatisfiable( + request=req, headers=headers) + first_byte, last_byte = byteranges[0] + # For some reason, swob.Range.ranges_for_length adds 1 to the + # last byte's position. + last_byte -= 1 + actual_content_length = last_byte - first_byte + 1 + else: + # The range may or may not be satisfiable, but we can't tell + # based on just one page of listing, and we're not going to go + # get more pages because that would use up too many resources, + # so we ignore the Range header and return the whole object. + actual_content_length = None + content_length_for_swob_range = None + req.range = None + else: + req.range = None + + response_headers = [ + (h, v) for h, v in response_headers + if h.lower() not in ("content-length", "content-range")] + + if content_length_for_swob_range is not None: + # Here, we have to give swob a big-enough content length so that + # it can compute the actual content length based on the Range + # header. This value will not be visible to the client; swob will + # substitute its own Content-Length. + # + # Note: if the manifest points to at least CONTAINER_LISTING_LIMIT + # segments, this may be less than the sum of all the segments' + # sizes. However, it'll still be greater than the last byte in the + # Range header, so it's good enough for swob. + response_headers.append(('Content-Length', + str(content_length_for_swob_range))) + elif have_complete_listing: + actual_content_length = sum(o['bytes'] for o in segments) + response_headers.append(('Content-Length', + str(actual_content_length))) + + if have_complete_listing: + response_headers = [(h, v) for h, v in response_headers + if h.lower() != "etag"] + etag = md5(usedforsecurity=False) + for seg_dict in segments: + etag.update(normalize_etag(seg_dict['hash']).encode('utf8')) + response_headers.append(('Etag', '"%s"' % etag.hexdigest())) + + app_iter = None + if req.method == 'GET': + listing_iter = RateLimitedIterator( + self._segment_listing_iterator( + req, version, account, container, obj_prefix, segments, + first_byte=first_byte, last_byte=last_byte), + self.dlo.rate_limit_segments_per_sec, + limit_after=self.dlo.rate_limit_after_segment) + + app_iter = SegmentedIterable( + req, self.dlo.app, listing_iter, ua_suffix="DLO MultipartGET", + swift_source="DLO", name=req.path, logger=self.logger, + max_get_time=self.dlo.max_get_time, + response_body_length=actual_content_length) + + try: + app_iter.validate_first_segment() + except HTTPException as err_resp: + return err_resp + except (SegmentError, ListingIterError): + return HTTPConflict(request=req) + + resp = Response(request=req, headers=response_headers, + conditional_response=True, + app_iter=app_iter) + + return resp + + def handle_request(self, req, start_response): + """ + Take a GET or HEAD request, and if it is for a dynamic large object + manifest, return an appropriate response. + + Otherwise, simply pass it through. 
+ """ + update_ignore_range_header(req, 'X-Object-Manifest') + resp_iter = self._app_call(req.environ) + + # make sure this response is for a dynamic large object manifest + for header, value in self._response_headers: + if (header.lower() == 'x-object-manifest'): + content_length = self._response_header_value('content-length') + if content_length is not None and int(content_length) < 1024: + # Go ahead and consume small bodies + drain_and_close(resp_iter) + close_if_possible(resp_iter) + response = self.get_or_head_response( + req, wsgi_to_str(wsgi_unquote(value))) + return response(req.environ, start_response) + # Not a dynamic large object manifest; just pass it through. + start_response(self._response_status, + self._response_headers, + self._response_exc_info) + return resp_iter + + +class DynamicLargeObject(object): + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route='dlo') + + # DLO functionality used to live in the proxy server, not middleware, + # so let's try to go find config values in the proxy's config section + # to ease cluster upgrades. + self._populate_config_from_old_location(conf) + + self.max_get_time = int(conf.get('max_get_time', '86400')) + self.rate_limit_after_segment = int(conf.get( + 'rate_limit_after_segment', '10')) + self.rate_limit_segments_per_sec = int(conf.get( + 'rate_limit_segments_per_sec', '1')) + + def _populate_config_from_old_location(self, conf): + if ('rate_limit_after_segment' in conf or + 'rate_limit_segments_per_sec' in conf or + 'max_get_time' in conf or + '__file__' not in conf): + return + + proxy_conf = load_app_config(conf['__file__']) + for setting in ('rate_limit_after_segment', + 'rate_limit_segments_per_sec', + 'max_get_time'): + if setting in proxy_conf: + conf[setting] = proxy_conf[setting] + + def __call__(self, env, start_response): + """ + WSGI entry point + """ + req = Request(env) + try: + vrs, account, container, obj = req.split_path(4, 4, True) + is_obj_req = True + except ValueError: + is_obj_req = False + if not is_obj_req: + return self.app(env, start_response) + + if ((req.method == 'GET' or req.method == 'HEAD') and + req.params.get('multipart-manifest') != 'get'): + return GetContext(self, self.logger).\ + handle_request(req, start_response) + elif req.method == 'PUT': + error_response = self._validate_x_object_manifest_header(req) + if error_response: + return error_response(env, start_response) + return self.app(env, start_response) + + def _validate_x_object_manifest_header(self, req): + """ + Make sure that X-Object-Manifest is valid if present. + """ + if 'X-Object-Manifest' in req.headers: + value = req.headers['X-Object-Manifest'] + container = prefix = None + try: + container, prefix = value.split('/', 1) + except ValueError: + pass + if not container or not prefix or '?' in value or '&' in value or \ + prefix.startswith('/'): + return HTTPBadRequest( + request=req, + body=('X-Object-Manifest must be in the ' + 'format container/prefix')) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def dlo_filter(app): + return DynamicLargeObject(app, conf) + return dlo_filter diff --git a/swift/common/middleware/domain_remap.py b/swift/common/middleware/domain_remap.py index 025ae4b050..aec8e90e93 100644 --- a/swift/common/middleware/domain_remap.py +++ b/swift/common/middleware/domain_remap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. 
+# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,39 +17,95 @@ """ Domain Remap Middleware -Middleware that translates container and account parts of a domain to -path parameters that the proxy server understands. +Middleware that translates container and account parts of a domain to path +parameters that the proxy server understands. -container.account.storageurl/object gets translated to -container.account.storageurl/path_root/account/container/object +Translation is only performed when the request URL's host domain matches one of +a list of domains. This list may be configured by the option +``storage_domain``, and defaults to the single domain ``example.com``. -account.storageurl/path_root/container/object gets translated to -account.storageurl/path_root/account/container/object +If not already present, a configurable ``path_root``, which defaults to ``v1``, +will be added to the start of the translated path. -Browsers can convert a host header to lowercase, so check that reseller -prefix on the account is the correct case. This is done by comparing the -items in the reseller_prefixes config option to the found prefix. If they -match except for case, the item from reseller_prefixes will be used -instead of the found reseller prefix. The reseller_prefixes list is -exclusive. If defined, any request with an account prefix not in that list -will be ignored by this middleware. reseller_prefixes defaults to 'AUTH'. +For example, with the default configuration:: + + container.AUTH-account.example.com/object + container.AUTH-account.example.com/v1/object + +would both be translated to:: + + container.AUTH-account.example.com/v1/AUTH_account/container/object + +and:: + + AUTH-account.example.com/container/object + AUTH-account.example.com/v1/container/object + +would both be translated to:: + + AUTH-account.example.com/v1/AUTH_account/container/object + +Additionally, translation is only performed when the account name in the +translated path starts with a reseller prefix matching one of a list configured +by the option ``reseller_prefixes``, or when no match is found but a +``default_reseller_prefix`` has been configured. + +The ``reseller_prefixes`` list defaults to the single prefix ``AUTH``. The +``default_reseller_prefix`` is not configured by default. + +Browsers can convert a host header to lowercase, so the middleware checks that +the reseller prefix on the account name is the correct case. This is done by +comparing the items in the ``reseller_prefixes`` config option to the found +prefix. If they match except for case, the item from ``reseller_prefixes`` will +be used instead of the found reseller prefix. The middleware will also replace +any hyphen ('-') in the account name with an underscore ('_'). + +For example, with the default configuration:: + + auth-account.example.com/container/object + AUTH-account.example.com/container/object + auth_account.example.com/container/object + AUTH_account.example.com/container/object + +would all be translated to:: + + .example.com/v1/AUTH_account/container/object + +When no match is found in ``reseller_prefixes``, the +``default_reseller_prefix`` config option is used. When no +``default_reseller_prefix`` is configured, any request with an account prefix +not in the ``reseller_prefixes`` list will be ignored by this middleware. 
+ +For example, with ``default_reseller_prefix = AUTH``:: + + account.example.com/container/object + +would be translated to:: + + account.example.com/v1/AUTH_account/container/object Note that this middleware requires that container names and account names -(except as described above) must be DNS-compatible. This means that the -account name created in the system and the containers created by users -cannot exceed 63 characters or have UTF-8 characters. These are -restrictions over and above what swift requires and are not explicitly -checked. Simply put, the this middleware will do a best-effort attempt to -derive account and container names from elements in the domain name and -put those derived values into the URL path (leaving the Host header -unchanged). - -Also note that using container sync with remapped domain names is not -advised. With container sync, you should use the true storage end points as -sync destinations. +(except as described above) must be DNS-compatible. This means that the account +name created in the system and the containers created by users cannot exceed 63 +characters or have UTF-8 characters. These are restrictions over and above what +Swift requires and are not explicitly checked. Simply put, this middleware +will do a best-effort attempt to derive account and container names from +elements in the domain name and put those derived values into the URL path +(leaving the ``Host`` header unchanged). + +Also note that using :doc:`overview_container_sync` with remapped domain names +is not advised. With :doc:`overview_container_sync`, you should use the true +storage end points as sync destinations. """ -from swift.common.swob import Request, HTTPBadRequest +from swift.common.middleware import RewriteContext +from swift.common.swob import Request, HTTPBadRequest, wsgi_quote +from swift.common.utils import config_true_value, list_from_csv +from swift.common.registry import register_swift_info + + +class _DomainRemapContext(RewriteContext): + base_re = r'^(https?://[^/]+)%s(.*)$' class DomainRemapMiddleware(object): @@ -65,25 +121,35 @@ class DomainRemapMiddleware(object): def __init__(self, app, conf): self.app = app - self.storage_domain = conf.get('storage_domain', 'example.com') - if self.storage_domain and self.storage_domain[0] != '.': - self.storage_domain = '.' + self.storage_domain - self.path_root = conf.get('path_root', 'v1').strip('/') + storage_domain = conf.get('storage_domain', 'example.com') + self.storage_domain = ['.' 
+ s for s in + list_from_csv(storage_domain) + if not s.startswith('.')] + self.storage_domain += [s for s in list_from_csv(storage_domain) + if s.startswith('.')] + self.path_root = conf.get('path_root', 'v1').strip('/') + '/' prefixes = conf.get('reseller_prefixes', 'AUTH') - self.reseller_prefixes = [x.strip() for x in prefixes.split(',') - if x.strip()] + self.reseller_prefixes = list_from_csv(prefixes) self.reseller_prefixes_lower = [x.lower() for x in self.reseller_prefixes] + self.default_reseller_prefix = conf.get('default_reseller_prefix') + self.mangle_client_paths = config_true_value( + conf.get('mangle_client_paths')) def __call__(self, env, start_response): if not self.storage_domain: return self.app(env, start_response) - given_domain = env['HTTP_HOST'] + if 'HTTP_HOST' in env: + given_domain = env['HTTP_HOST'] + else: + given_domain = env['SERVER_NAME'] port = '' if ':' in given_domain: given_domain, port = given_domain.rsplit(':', 1) - if given_domain.endswith(self.storage_domain): - parts_to_parse = given_domain[:-len(self.storage_domain)] + storage_domain = next((domain for domain in self.storage_domain + if given_domain.endswith(domain)), None) + if storage_domain: + parts_to_parse = given_domain[:-len(storage_domain)] parts_to_parse = parts_to_parse.strip('.').split('.') len_parts_to_parse = len(parts_to_parse) if len_parts_to_parse == 2: @@ -92,31 +158,44 @@ def __call__(self, env, start_response): container, account = None, parts_to_parse[0] else: resp = HTTPBadRequest(request=Request(env), - body='Bad domain in host header', + body=b'Bad domain in host header', content_type='text/plain') return resp(env, start_response) - if '_' not in account and '-' in account: - account = account.replace('-', '_', 1) - account_reseller_prefix = account.split('_', 1)[0].lower() - if account_reseller_prefix not in self.reseller_prefixes_lower: - # account prefix is not in config list. bail. - return self.app(env, start_response) - prefix_index = self.reseller_prefixes_lower.index( - account_reseller_prefix) - real_prefix = self.reseller_prefixes[prefix_index] - if not account.startswith(real_prefix): - account_suffix = account[len(real_prefix):] - account = real_prefix + account_suffix - path = env['PATH_INFO'].strip('/') - new_path_parts = ['', self.path_root, account] + if len(self.reseller_prefixes) > 0: + if '_' not in account and '-' in account: + account = account.replace('-', '_', 1) + account_reseller_prefix = account.split('_', 1)[0].lower() + + if account_reseller_prefix in self.reseller_prefixes_lower: + prefix_index = self.reseller_prefixes_lower.index( + account_reseller_prefix) + real_prefix = self.reseller_prefixes[prefix_index] + if not account.startswith(real_prefix): + account_suffix = account[len(real_prefix):] + account = real_prefix + account_suffix + elif self.default_reseller_prefix: + # account prefix is not in config list. Add default one. + account = "%s_%s" % (self.default_reseller_prefix, account) + else: + # account prefix is not in config list. bail. 
+ return self.app(env, start_response) + + requested_path = env['PATH_INFO'] + path = requested_path[1:] + new_path_parts = ['', self.path_root[:-1], account] if container: new_path_parts.append(container) - if path.startswith(self.path_root): - path = path[len(self.path_root):].lstrip('/') - if path: - new_path_parts.append(path) + if self.mangle_client_paths and (path + '/').startswith( + self.path_root): + path = path[len(self.path_root):] + new_path_parts.append(path) new_path = '/'.join(new_path_parts) env['PATH_INFO'] = new_path + + context = _DomainRemapContext( + self.app, wsgi_quote(requested_path), wsgi_quote(new_path)) + return context.handle_request(env, start_response) + return self.app(env, start_response) @@ -124,6 +203,10 @@ def filter_factory(global_conf, **local_conf): conf = global_conf.copy() conf.update(local_conf) + register_swift_info( + 'domain_remap', + default_reseller_prefix=conf.get('default_reseller_prefix')) + def domain_filter(app): return DomainRemapMiddleware(app, conf) return domain_filter diff --git a/swift/common/middleware/etag_quoter.py b/swift/common/middleware/etag_quoter.py new file mode 100644 index 0000000000..d67c1b48b7 --- /dev/null +++ b/swift/common/middleware/etag_quoter.py @@ -0,0 +1,128 @@ +# Copyright (c) 2010-2020 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This middleware fix the Etag header of responses so that it is RFC compliant. +`RFC 7232 `__ specifies that +the value of the Etag header must be double quoted. + +It must be placed at the beggining of the pipeline, right after cache:: + + [pipeline:main] + pipeline = ... cache etag-quoter ... + + [filter:etag-quoter] + use = egg:swift#etag_quoter + +Set ``X-Account-Rfc-Compliant-Etags: true`` at the account +level to have any Etags in object responses be double quoted, as in +``"d41d8cd98f00b204e9800998ecf8427e"``. Alternatively, you may +only fix Etags in a single container by setting +``X-Container-Rfc-Compliant-Etags: true`` on the container. +This may be necessary for Swift to work properly with some CDNs. + +Either option may also be explicitly *disabled*, so you may enable quoted +Etags account-wide as above but turn them off for individual containers +with ``X-Container-Rfc-Compliant-Etags: false``. This may be +useful if some subset of applications expect Etags to be bare MD5s. 
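The quoting rule the filter applies is small enough to show on its own: an Etag that already looks like a (possibly weak) quoted entity-tag passes through, anything else gets wrapped in double quotes. A sketch of that transform, mirroring the header rewrite further down in this file::

    def quote_etag(value):
        # Already RFC 7232 style: "..." or W/"..."
        if value.startswith(('"', 'W/"')) and value.endswith('"'):
            return value
        return '"%s"' % value

    assert quote_etag('d41d8cd98f00b204e9800998ecf8427e') == \
        '"d41d8cd98f00b204e9800998ecf8427e"'
    assert quote_etag('"already-quoted"') == '"already-quoted"'
    assert quote_etag('W/"weak"') == 'W/"weak"'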
+""" + +from swift.common.constraints import valid_api_version +from swift.common.http import is_success +from swift.common.swob import Request +from swift.common.utils import config_true_value +from swift.common.registry import register_swift_info +from swift.proxy.controllers.base import get_account_info, get_container_info + + +class EtagQuoterMiddleware(object): + def __init__(self, app, conf): + self.app = app + self.conf = conf + + def __call__(self, env, start_response): + req = Request(env) + try: + version, account, container, obj = req.split_path( + 2, 4, rest_with_last=True) + is_swifty_request = valid_api_version(version) + except ValueError: + is_swifty_request = False + + if not is_swifty_request: + return self.app(env, start_response) + + if not obj: + typ = 'Container' if container else 'Account' + client_header = 'X-%s-Rfc-Compliant-Etags' % typ + sysmeta_header = 'X-%s-Sysmeta-Rfc-Compliant-Etags' % typ + if client_header in req.headers: + if req.headers[client_header]: + req.headers[sysmeta_header] = config_true_value( + req.headers[client_header]) + else: + req.headers[sysmeta_header] = '' + if req.headers.get(client_header.replace('X-', 'X-Remove-', 1)): + req.headers[sysmeta_header] = '' + + def translating_start_response(status, headers, exc_info=None): + return start_response(status, [ + (client_header if h.title() == sysmeta_header else h, + v) for h, v in headers + ], exc_info) + + return self.app(env, translating_start_response) + + container_info = get_container_info(env, self.app, 'EQ') + if not container_info or not is_success(container_info['status']): + return self.app(env, start_response) + + flag = container_info.get('sysmeta', {}).get('rfc-compliant-etags') + if flag is None: + account_info = get_account_info(env, self.app, 'EQ') + if not account_info or not is_success(account_info['status']): + return self.app(env, start_response) + + flag = account_info.get('sysmeta', {}).get( + 'rfc-compliant-etags') + + if flag is None: + flag = self.conf.get('enable_by_default', 'false') + + if not config_true_value(flag): + return self.app(env, start_response) + + status, headers, resp_iter = req.call_application(self.app) + + headers = [ + (header, value) if header.lower() != 'etag' or ( + value.startswith(('"', 'W/"')) and value.endswith('"')) + else (header, '"%s"' % value) + for header, value in headers] + + start_response(status, headers) + return resp_iter + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + register_swift_info( + 'etag_quoter', enable_by_default=config_true_value( + conf.get('enable_by_default', 'false'))) + + def etag_quoter_filter(app): + return EtagQuoterMiddleware(app, conf) + return etag_quoter_filter diff --git a/swift/common/middleware/formpost.py b/swift/common/middleware/formpost.py index 9dee49c902..9c75bc79b2 100644 --- a/swift/common/middleware/formpost.py +++ b/swift/common/middleware/formpost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2011 OpenStack, LLC. +# Copyright (c) 2011 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" FormPost Middleware Translates a browser form post into a regular Swift object PUT. 
@@ -31,7 +31,24 @@ -The is the URL to the Swift desination, such as:: +Optionally, if you want the uploaded files to be temporary you can set +x-delete-at or x-delete-after attributes by adding one of these as a +form input:: + + + + +If you want to specify the content type or content encoding of the files you +can set content-encoding or content-type by adding them to the form input:: + + + + +The above example applies these parameters to all uploaded files. You can also +set the content-type and content-encoding on a per-file basis by adding the +parameters to each part of the upload. + +The is the URL of the Swift destination, such as:: https://swift-cluster.example.com/v1/AUTH_account/container/object_prefix @@ -49,11 +66,12 @@ Note the form method must be POST and the enctype must be set as "multipart/form-data". -The redirect attribute is the URL to redirect the browser to after -the upload completes. The URL will have status and message query -parameters added to it, indicating the HTTP status code for the -upload (2xx is success) and a possible message for further -information if there was an error (such as "max_file_size exceeded"). +The redirect attribute is the URL to redirect the browser to after the upload +completes. This is an optional parameter. If you are uploading the form via an +XMLHttpRequest the redirect should not be included. The URL will have status +and message query parameters added to it, indicating the HTTP status code for +the upload (2xx is success) and a possible message for further information if +there was an error (such as "max_file_size exceeded"). The max_file_size attribute must be included and indicates the largest single file upload that can be done, in bytes. @@ -66,26 +84,29 @@ The expires attribute is the Unix timestamp before which the form must be submitted before it is invalidated. -The signature attribute is the HMAC-SHA1 signature of the form. Here is +The signature attribute is the HMAC signature of the form. Here is sample code for computing the signature:: import hmac - from hashlib import sha1 + from hashlib import sha512 from time import time path = '/v1/account/container/object_prefix' - redirect = 'https://myserver.com/some-page' + redirect = 'https://srv.com/some-page' # set to '' if redirect not in form max_file_size = 104857600 max_file_count = 10 expires = int(time() + 600) key = 'mykey' - hmac_body = '%s\\n%s\\n%s\\n%s\\n%s' % (path, redirect, + hmac_body = '%s\n%s\n%s\n%s\n%s' % (path, redirect, max_file_size, max_file_count, expires) - signature = hmac.new(key, hmac_body, sha1).hexdigest() + signature = hmac.new(key, hmac_body, sha512).hexdigest() -The key is the value of the X-Account-Meta-Temp-URL-Key header on the -account. +The key is the value of either the account (X-Account-Meta-Temp-URL-Key, +X-Account-Meta-Temp-Url-Key-2) or the container +(X-Container-Meta-Temp-URL-Key, X-Container-Meta-Temp-Url-Key-2) TempURL keys. Be certain to use the full path, from the /v1/ onward. +Note that x_delete_at and x_delete_after are not used in signature generation +as they are both optional attributes. The command line tool ``swift-form-signature`` may be used (mostly just when testing) to compute expires and signature. 
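One caveat about the sample signature code in the updated docstring: on Python 3, ``hmac.new`` requires bytes for both the key and the message, so both need encoding before use. A hedged, runnable variant of the same computation::

    import hmac
    from hashlib import sha512
    from time import time

    path = '/v1/AUTH_account/container/object_prefix'
    redirect = ''            # leave empty if the form has no redirect field
    max_file_size = 104857600
    max_file_count = 10
    expires = int(time() + 600)
    key = 'mykey'            # an account or container Temp-URL key

    hmac_body = '%s\n%s\n%s\n%s\n%s' % (
        path, redirect, max_file_size, max_file_count, expires)
    signature = hmac.new(key.encode('utf-8'), hmac_body.encode('utf-8'),
                         sha512).hexdigest()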
@@ -102,16 +123,23 @@ __all__ = ['FormPost', 'filter_factory', 'READ_CHUNK_SIZE', 'MAX_VALUE_LENGTH'] import hmac -import re -import rfc822 -from hashlib import sha1 -from StringIO import StringIO -from time import gmtime, strftime, time -from urllib import quote, unquote +from time import time -from swift.common.utils import get_logger, streq_const_time -from swift.common.wsgi import make_pre_authed_env -from swift.common.http import HTTP_BAD_REQUEST +from urllib.parse import quote + +from swift.common.constraints import valid_api_version +from swift.common.exceptions import MimeInvalid +from swift.common.middleware.tempurl import get_tempurl_keys_from_metadata +from swift.common.digest import get_allowed_digests, \ + extract_digest_and_algorithm, DEFAULT_ALLOWED_DIGESTS +from swift.common.utils import streq_const_time, parse_content_disposition, \ + parse_mime_headers, iter_multipart_mime_documents, reiterate, \ + closing_if_possible, get_logger, InputProxy +from swift.common.registry import register_swift_info +from swift.common.wsgi import WSGIContext, make_pre_authed_env +from swift.common.swob import HTTPUnauthorized, wsgi_to_str, str_to_wsgi +from swift.common.http import is_success +from swift.proxy.controllers.base import get_account_info, get_container_info #: The size of data to read from the form at any given time. @@ -121,134 +149,16 @@ #: truncated. MAX_VALUE_LENGTH = 4096 -#: Regular expression to match form attributes. -ATTRIBUTES_RE = re.compile(r'(\w+)=(".*?"|[^";]+)(; ?|$)') - class FormInvalid(Exception): pass -def _parse_attrs(header): - """ - Given the value of a header like: - Content-Disposition: form-data; name="somefile"; filename="test.html" - - Return data like - ("form-data", {"name": "somefile", "filename": "test.html"}) - - :param header: Value of a header (the part after the ': '). - :returns: (value name, dict) of the attribute data parsed (see above). 
- """ - attributes = {} - attrs = '' - if '; ' in header: - header, attrs = header.split('; ', 1) - m = True - while m: - m = ATTRIBUTES_RE.match(attrs) - if m: - attrs = attrs[len(m.group(0)):] - attributes[m.group(1)] = m.group(2).strip('"') - return header, attributes - - -class _IterRequestsFileLikeObject(object): - - def __init__(self, wsgi_input, boundary, input_buffer): - self.no_more_data_for_this_file = False - self.no_more_files = False - self.wsgi_input = wsgi_input - self.boundary = boundary - self.input_buffer = input_buffer - - def read(self, length=None): - if not length: - length = READ_CHUNK_SIZE - if self.no_more_data_for_this_file: - return '' - - # read enough data to know whether we're going to run - # into a boundary in next [length] bytes - if len(self.input_buffer) < length + len(self.boundary) + 2: - to_read = length + len(self.boundary) + 2 - while to_read > 0: - chunk = self.wsgi_input.read(to_read) - to_read -= len(chunk) - self.input_buffer += chunk - if not chunk: - self.no_more_files = True - break - - boundary_pos = self.input_buffer.find(self.boundary) - - # boundary does not exist in the next (length) bytes - if boundary_pos == -1 or boundary_pos > length: - ret = self.input_buffer[:length] - self.input_buffer = self.input_buffer[length:] - # if it does, just return data up to the boundary - else: - ret, self.input_buffer = self.input_buffer.split(self.boundary, 1) - self.no_more_files = self.input_buffer.startswith('--') - self.no_more_data_for_this_file = True - self.input_buffer = self.input_buffer[2:] - return ret - - def readline(self): - if self.no_more_data_for_this_file: - return '' - boundary_pos = newline_pos = -1 - while newline_pos < 0 and boundary_pos < 0: - chunk = self.wsgi_input.read(READ_CHUNK_SIZE) - self.input_buffer += chunk - newline_pos = self.input_buffer.find('\r\n') - boundary_pos = self.input_buffer.find(self.boundary) - if not chunk: - self.no_more_files = True - break - # found a newline - if newline_pos >= 0 and \ - (boundary_pos < 0 or newline_pos < boundary_pos): - # Use self.read to ensure any logic there happens... - ret = '' - to_read = newline_pos + 2 - while to_read > 0: - chunk = self.read(to_read) - # Should never happen since we're reading from input_buffer, - # but just for completeness... - if not chunk: - break - to_read -= len(chunk) - ret += chunk - return ret - else: # no newlines, just return up to next boundary - return self.read(len(self.input_buffer)) - +class FormUnauthorized(Exception): + pass -def _iter_requests(wsgi_input, boundary): - """ - Given a multi-part mime encoded input file object and boundary, - yield file-like objects for each part. - :param wsgi_input: The file-like object to read from. - :param boundary: The mime boundary to separate new file-like - objects on. - :returns: A generator of file-like objects for each part. 
- """ - boundary = '--' + boundary - if wsgi_input.readline().strip() != boundary: - raise FormInvalid('invalid starting boundary') - boundary = '\r\n' + boundary - input_buffer = '' - done = False - while not done: - it = _IterRequestsFileLikeObject(wsgi_input, boundary, input_buffer) - yield it - done = it.no_more_files - input_buffer = it.input_buffer - - -class _CappedFileLikeObject(object): +class _CappedFileLikeObject(InputProxy): """ A file-like object wrapping another file-like object that raises an EOFError if the amount of data read exceeds a given @@ -260,23 +170,15 @@ class _CappedFileLikeObject(object): """ def __init__(self, fp, max_file_size): - self.fp = fp + super().__init__(fp) self.max_file_size = max_file_size - self.amount_read = 0 - - def read(self, size=None): - ret = self.fp.read(size) - self.amount_read += len(ret) - if self.amount_read > self.max_file_size: - raise EOFError('max_file_size exceeded') - return ret + self.file_size_exceeded = False - def readline(self): - ret = self.fp.readline() - self.amount_read += len(ret) - if self.amount_read > self.max_file_size: + def chunk_update(self, chunk, eof, *args, **kwargs): + if self.bytes_received > self.max_file_size: + self.file_size_exceeded = True raise EOFError('max_file_size exceeded') - return ret + return chunk class FormPost(object): @@ -285,20 +187,25 @@ class FormPost(object): See above for a full description. + The proxy logs created for any subrequests made will have swift.source set + to "FP". + :param app: The next WSGI filter or app in the paste.deploy chain. :param conf: The configuration dict for the middleware. """ - def __init__(self, app, conf): + def __init__(self, app, conf, logger=None): #: The next WSGI application/filter in the paste.deploy pipeline. self.app = app #: The filter configuration dict. self.conf = conf - #: The logger to use with this middleware. - self.logger = get_logger(conf, log_route='formpost') - #: The HTTP user agent to use with subrequests. - self.agent = '%(orig)s FormPost' + self.logger = logger or get_logger(conf, log_route='formpost') + # Defaulting to SUPPORTED_DIGESTS just so we don't completely + # deprecate sha1 yet. We'll change this to DEFAULT_ALLOWED_DIGESTS + # later. 
+ self.allowed_digests = conf.get( + 'allowed_digests', DEFAULT_ALLOWED_DIGESTS.split()) def __call__(self, env, start_response): """ @@ -311,22 +218,34 @@ def __call__(self, env, start_response): if env['REQUEST_METHOD'] == 'POST': try: content_type, attrs = \ - _parse_attrs(env.get('CONTENT_TYPE') or '') + parse_content_disposition(env.get('CONTENT_TYPE') or '') if content_type == 'multipart/form-data' and \ 'boundary' in attrs: + http_user_agent = "%s FormPost" % ( + env.get('HTTP_USER_AGENT', '')) + env['HTTP_USER_AGENT'] = http_user_agent.strip() status, headers, body = self._translate_form( env, attrs['boundary']) - self._log_request(env, int(status.split(' ', 1)[0])) start_response(status, headers) - return body - except (FormInvalid, EOFError), err: - self._log_request(env, HTTP_BAD_REQUEST) - body = 'FormPost: %s' % err + return [body] + except MimeInvalid: + body = b'FormPost: invalid starting boundary' + start_response( + '400 Bad Request', + (('Content-Type', 'text/plain'), + ('Content-Length', str(len(body))))) + return [body] + except (FormInvalid, EOFError) as err: + body = ('FormPost: %s' % err).encode('utf-8') start_response( '400 Bad Request', (('Content-Type', 'text/plain'), ('Content-Length', str(len(body))))) return [body] + except FormUnauthorized as err: + message = 'FormPost: %s' % str(err).title() + return HTTPUnauthorized(body=message)( + env, start_response) return self.app(env, start_response) def _translate_form(self, env, boundary): @@ -338,14 +257,19 @@ def _translate_form(self, env, boundary): :param boundary: The MIME type boundary to look for. :returns: status_line, headers_list, body """ - key = self._get_key(env) + keys = self._get_keys(env) + boundary = boundary.encode('utf-8') status = message = '' attributes = {} + file_attributes = {} + subheaders = [] + resp_body = None file_count = 0 - for fp in _iter_requests(env['wsgi.input'], boundary): - hdrs = rfc822.Message(fp, 0) - disp, attrs = \ - _parse_attrs(hdrs.getheader('Content-Disposition', '')) + for fp in iter_multipart_mime_documents( + env['wsgi.input'], boundary, read_chunk_size=READ_CHUNK_SIZE): + hdrs = parse_mime_headers(fp) + disp, attrs = parse_content_disposition( + hdrs.get('Content-Disposition', '')) if disp == 'form-data' and attrs.get('filename'): file_count += 1 try: @@ -355,16 +279,22 @@ def _translate_form(self, env, boundary): break except ValueError: raise FormInvalid('max_file_count not an integer') - attributes['filename'] = attrs['filename'] or 'filename' + file_attributes = attributes.copy() + file_attributes['filename'] = attrs['filename'] or 'filename' if 'content-type' not in attributes and 'content-type' in hdrs: - attributes['content-type'] = \ + file_attributes['content-type'] = \ hdrs['Content-Type'] or 'application/octet-stream' - status, message = self._perform_subrequest(env, attributes, fp, - key) - if status[:1] != '2': + if 'content-encoding' not in attributes and \ + 'content-encoding' in hdrs: + file_attributes['content-encoding'] = \ + hdrs['Content-Encoding'] + status, subheaders, resp_body = \ + self._perform_subrequest(env, file_attributes, fp, keys) + status_code = int(status.split(' ', 1)[0]) + if not is_success(status_code): break else: - data = '' + data = b'' mxln = MAX_VALUE_LENGTH while mxln: chunk = fp.read(mxln) @@ -374,29 +304,42 @@ def _translate_form(self, env, boundary): data += chunk while fp.read(READ_CHUNK_SIZE): pass + data = data.decode('utf-8') if 'name' in attrs: attributes[attrs['name'].lower()] = data.rstrip('\r\n--') if not status: 
status = '400 Bad Request' message = 'no files to process' - if not attributes.get('redirect'): + + status_code = int(status.split(' ', 1)[0]) + headers = [(k, v) for k, v in subheaders + if k.lower().startswith('access-control')] + + redirect = attributes.get('redirect') + if not redirect: body = status if message: body = status + '\r\nFormPost: ' + message.title() - headers = [('Content-Type', 'text/plain'), - ('Content-Length', len(body))] + body = body.encode('utf-8') + if not is_success(status_code) and resp_body: + body = resp_body + headers.extend([('Content-Type', 'text/plain'), + ('Content-Length', len(body))]) return status, headers, body - status = status.split(' ', 1)[0] - body = '
<html><body><p><a href="%s?status=%s&message=%s">Click to ' \ - 'continue...</a></p></body></html>
' % \ - (attributes['redirect'], quote(status), quote(message)) - headers = [ - ('Location', '%s?status=%s&message=%s' % ( - attributes['redirect'], quote(status), quote(message))), - ('Content-Length', str(len(body)))] + if '?' in redirect: + redirect += '&' + else: + redirect += '?' + redirect += 'status=%s&message=%s' % (quote(str(status_code)), + quote(message)) + body = '
<html><body><p><a href="%s">' \ + 'Click to continue...</a></p></body></html>
' % redirect + body = body.encode('utf-8') + headers.extend( + [('Location', redirect), ('Content-Length', str(len(body)))]) return '303 See Other', headers, body - def _perform_subrequest(self, orig_env, attributes, fp, key): + def _perform_subrequest(self, orig_env, attributes, fp, keys): """ Performs the subrequest and returns the response. @@ -404,135 +347,125 @@ def _perform_subrequest(self, orig_env, attributes, fp, key): to form a new env for the subrequest. :param attributes: dict of the attributes of the form so far. :param fp: The file-like object containing the request body. - :param key: The account key to validate the signature with. - :returns: (status_line, message) + :param keys: The account keys to validate the signature with. + :returns: (status_line, headers_list) """ - if not key: - return '401 Unauthorized', 'invalid signature' + if not keys: + raise FormUnauthorized('invalid signature') try: max_file_size = int(attributes.get('max_file_size') or 0) except ValueError: raise FormInvalid('max_file_size not an integer') - subenv = make_pre_authed_env(orig_env, 'PUT', agent=self.agent) + subenv = make_pre_authed_env(orig_env, 'PUT', agent=None, + swift_source='FP') + if 'QUERY_STRING' in subenv: + del subenv['QUERY_STRING'] subenv['HTTP_TRANSFER_ENCODING'] = 'chunked' subenv['wsgi.input'] = _CappedFileLikeObject(fp, max_file_size) - if subenv['PATH_INFO'][-1] != '/' and \ + if not subenv['PATH_INFO'].endswith('/') and \ subenv['PATH_INFO'].count('/') < 4: subenv['PATH_INFO'] += '/' - subenv['PATH_INFO'] += attributes['filename'] or 'filename' + subenv['PATH_INFO'] += str_to_wsgi( + attributes['filename'] or 'filename') + if 'x_delete_at' in attributes: + try: + subenv['HTTP_X_DELETE_AT'] = int(attributes['x_delete_at']) + except ValueError: + raise FormInvalid('x_delete_at not an integer: ' + 'Unix timestamp required.') + if 'x_delete_after' in attributes: + try: + subenv['HTTP_X_DELETE_AFTER'] = int( + attributes['x_delete_after']) + except ValueError: + raise FormInvalid('x_delete_after not an integer: ' + 'Number of seconds required.') if 'content-type' in attributes: subenv['CONTENT_TYPE'] = \ attributes['content-type'] or 'application/octet-stream' - elif 'CONTENT_TYPE' in subenv: - del subenv['CONTENT_TYPE'] + if 'content-encoding' in attributes: + subenv['HTTP_CONTENT_ENCODING'] = attributes['content-encoding'] try: if int(attributes.get('expires') or 0) < time(): - return '401 Unauthorized', 'form expired' + raise FormUnauthorized('form expired') except ValueError: raise FormInvalid('expired not an integer') hmac_body = '%s\n%s\n%s\n%s\n%s' % ( - orig_env['PATH_INFO'], + wsgi_to_str(orig_env['PATH_INFO']), attributes.get('redirect') or '', attributes.get('max_file_size') or '0', attributes.get('max_file_count') or '0', attributes.get('expires') or '0') - sig = hmac.new(key, hmac_body, sha1).hexdigest() - if not streq_const_time(sig, (attributes.get('signature') or - 'invalid')): - return '401 Unauthorized', 'invalid signature' - substatus = [None] - - def _start_response(status, headers, exc_info=None): - substatus[0] = status + hmac_body = hmac_body.encode('utf-8') - i = iter(self.app(subenv, _start_response)) + has_valid_sig = False + signature = attributes.get('signature', '') try: - i.next() - except StopIteration: - pass - return substatus[0], '' - - def _get_key(self, env): + hash_name, signature = extract_digest_and_algorithm(signature) + except ValueError: + raise FormUnauthorized('invalid signature') + if hash_name not in self.allowed_digests: + raise 
FormUnauthorized('invalid signature') + + for key in keys: + # Encode key like in swift.common.utls.get_hmac. + if not isinstance(key, bytes): + key = key.encode('utf8') + sig = hmac.new(key, hmac_body, hash_name).hexdigest() + if streq_const_time(sig, signature): + has_valid_sig = True + if not has_valid_sig: + raise FormUnauthorized('invalid signature') + self.logger.increment('formpost.digests.%s' % hash_name) + wsgi_ctx = WSGIContext(self.app) + wsgi_input = subenv['wsgi.input'] + resp = wsgi_ctx._app_call(subenv) + if wsgi_input.file_size_exceeded: + raise EOFError("max_file_size exceeded") + with closing_if_possible(reiterate(resp)): + body = b''.join(resp) + return wsgi_ctx._response_status, wsgi_ctx._response_headers, body + + def _get_keys(self, env): """ - Returns the X-Account-Meta-Temp-URL-Key header value for the - account, or None if none is set. + Returns the X-[Account|Container]-Meta-Temp-URL-Key[-2] header values + for the account or container, or an empty list if none are set. + + Returns 0-4 elements depending on how many keys are set in the + account's or container's metadata. + + Also validate that the request + path indicates a valid container; if not, no keys will be returned. :param env: The WSGI environment for the request. - :returns: X-Account-Meta-Temp-URL-Key str value, or None. + :returns: list of tempurl keys """ parts = env['PATH_INFO'].split('/', 4) - if len(parts) < 4 or parts[0] or parts[1] != 'v1' or not parts[2] or \ - not parts[3]: - return None - account = parts[2] - key = None - memcache = env.get('swift.cache') - if memcache: - key = memcache.get('temp-url-key/%s' % account) - if not key: - newenv = make_pre_authed_env(env, 'HEAD', '/v1/' + account, - self.agent) - newenv['CONTENT_LENGTH'] = '0' - newenv['wsgi.input'] = StringIO('') - key = [None] - - def _start_response(status, response_headers, exc_info=None): - for h, v in response_headers: - if h.lower() == 'x-account-meta-temp-url-key': - key[0] = v - - i = iter(self.app(newenv, _start_response)) - try: - i.next() - except StopIteration: - pass - key = key[0] - if key and memcache: - memcache.set('temp-url-key/%s' % account, key, timeout=60) - return key - - def _log_request(self, env, response_status_int): - """ - Used when a request might not be logged by the underlying - WSGI application, but we'd still like to record what - happened. An early 401 Unauthorized is a good example of - this. + if len(parts) < 4 or parts[0] or not valid_api_version(parts[1]) \ + or not parts[2] or not parts[3]: + return [] - :param env: The WSGI environment for the request. - :param response_status_int: The HTTP status we'll be replying - to the request with. - """ - the_request = quote(unquote(env.get('PATH_INFO') or '/')) - if env.get('QUERY_STRING'): - the_request = the_request + '?' 
+ env['QUERY_STRING'] - client = env.get('HTTP_X_CLUSTER_CLIENT_IP') - if not client and 'HTTP_X_FORWARDED_FOR' in env: - # remote host for other lbs - client = env['HTTP_X_FORWARDED_FOR'].split(',')[0].strip() - if not client: - client = env.get('REMOTE_ADDR') - self.logger.info(' '.join(quote(str(x)) for x in ( - client or '-', - env.get('REMOTE_ADDR') or '-', - strftime('%d/%b/%Y/%H/%M/%S', gmtime()), - env.get('REQUEST_METHOD') or 'GET', - the_request, - env.get('SERVER_PROTOCOL') or '1.0', - response_status_int, - env.get('HTTP_REFERER') or '-', - (env.get('HTTP_USER_AGENT') or '-') + ' FormPOST', - env.get('HTTP_X_AUTH_TOKEN') or '-', - '-', - '-', - '-', - env.get('swift.trans_id') or '-', - '-', - '-', - ))) + account_info = get_account_info(env, self.app, swift_source='FP') + account_keys = get_tempurl_keys_from_metadata(account_info['meta']) + + container_info = get_container_info(env, self.app, swift_source='FP') + container_keys = get_tempurl_keys_from_metadata( + container_info.get('meta', [])) + + return account_keys + container_keys def filter_factory(global_conf, **local_conf): - """ Returns the WSGI filter for use with paste.deploy. """ + """Returns the WSGI filter for use with paste.deploy.""" conf = global_conf.copy() conf.update(local_conf) + + logger = get_logger(conf, log_route='formpost') + allowed_digests, deprecated_digests = get_allowed_digests( + conf.get('allowed_digests', '').split(), logger) + info = {'allowed_digests': sorted(allowed_digests)} + if deprecated_digests: + info['deprecated_digests'] = sorted(deprecated_digests) + register_swift_info('formpost', **info) + conf.update(info) return lambda app: FormPost(app, conf) diff --git a/swift/common/middleware/gatekeeper.py b/swift/common/middleware/gatekeeper.py new file mode 100644 index 0000000000..dfb5ef04b0 --- /dev/null +++ b/swift/common/middleware/gatekeeper.py @@ -0,0 +1,138 @@ +# Copyright (c) 2010-2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The ``gatekeeper`` middleware imposes restrictions on the headers that +may be included with requests and responses. Request headers are filtered +to remove headers that should never be generated by a client. Similarly, +response headers are filtered to remove private headers that should +never be passed to a client. + +The ``gatekeeper`` middleware must always be present in the proxy server +wsgi pipeline. It should be configured close to the start of the pipeline +specified in ``/etc/swift/proxy-server.conf``, immediately after catch_errors +and before any other middleware. It is essential that it is configured ahead +of all middlewares using system metadata in order that they function +correctly. + +If ``gatekeeper`` middleware is not configured in the pipeline then it will be +automatically inserted close to the start of the pipeline by the proxy server. 
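For illustration only (not part of this patch), a proxy pipeline that honours the placement described above might start like this; everything after gatekeeper is just a typical selection of other filters:

    [pipeline:main]
    pipeline = catch_errors gatekeeper healthcheck proxy-logging cache tempauth proxy-server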
+""" + + +from swift.common.swob import Request +from swift.common.utils import get_logger, config_true_value +from swift.common.request_helpers import ( + remove_items, get_sys_meta_prefix, OBJECT_TRANSIENT_SYSMETA_PREFIX +) +from urllib.parse import urlsplit +import re + +#: A list of python regular expressions that will be used to +#: match against inbound request headers. Matching headers will +#: be removed from the request. +# Exclude headers starting with a sysmeta prefix. +# Exclude headers starting with object transient system metadata prefix. +# Exclude headers starting with an internal backend header prefix. +# If adding to this list, note that these are regex patterns, +# so use a trailing $ to constrain to an exact header match +# rather than prefix match. +inbound_exclusions = [get_sys_meta_prefix('account'), + get_sys_meta_prefix('container'), + get_sys_meta_prefix('object'), + OBJECT_TRANSIENT_SYSMETA_PREFIX, + 'x-backend'] + + +#: A list of python regular expressions that will be used to +#: match against outbound response headers. Matching headers will +#: be removed from the response. +outbound_exclusions = inbound_exclusions + + +def make_exclusion_test(exclusions): + expr = '|'.join(exclusions) + test = re.compile(expr, re.IGNORECASE) + return test.match + + +class GatekeeperMiddleware(object): + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route='gatekeeper') + self.inbound_condition = make_exclusion_test(inbound_exclusions) + self.outbound_condition = make_exclusion_test(outbound_exclusions) + self.shunt_x_timestamp = config_true_value( + conf.get('shunt_inbound_x_timestamp', 'true')) + self.allow_reserved_names_header = config_true_value( + conf.get('allow_reserved_names_header', 'false')) + + def __call__(self, env, start_response): + req = Request(env) + removed = remove_items(req.headers, self.inbound_condition) + if removed: + self.logger.debug('removed request headers: %s' % removed) + + if 'X-Timestamp' in req.headers and self.shunt_x_timestamp: + ts = req.headers.pop('X-Timestamp') + req.headers['X-Backend-Inbound-X-Timestamp'] = ts + # log in a similar format as the removed headers + self.logger.debug('shunted request headers: %s' % + [('X-Timestamp', ts)]) + + if 'X-Allow-Reserved-Names' in req.headers \ + and self.allow_reserved_names_header: + req.headers['X-Backend-Allow-Reserved-Names'] = \ + req.headers.pop('X-Allow-Reserved-Names') + + def gatekeeper_response(status, response_headers, exc_info=None): + def fixed_response_headers(): + def relative_path(value): + parsed = urlsplit(value) + new_path = parsed.path + if parsed.query: + new_path += ('?%s' % parsed.query) + if parsed.fragment: + new_path += ('#%s' % parsed.fragment) + return new_path + + if not env.get('swift.leave_relative_location'): + return response_headers + else: + return [ + (k, v) if k.lower() != 'location' else + (k, relative_path(v)) for (k, v) in response_headers + ] + + response_headers = fixed_response_headers() + removed = [(header, value) for header, value in response_headers + if self.outbound_condition(header)] + + if removed: + self.logger.debug('removed response headers: %s' % removed) + new_headers = [ + (header, value) for header, value in response_headers + if not self.outbound_condition(header)] + return start_response(status, new_headers, exc_info) + return start_response(status, response_headers, exc_info) + return self.app(env, gatekeeper_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() 
+ conf.update(local_conf) + + def gatekeeper_filter(app): + return GatekeeperMiddleware(app, conf) + return gatekeeper_filter diff --git a/swift/common/middleware/healthcheck.py b/swift/common/middleware/healthcheck.py index 75deda250a..f9f6b24ea9 100644 --- a/swift/common/middleware/healthcheck.py +++ b/swift/common/middleware/healthcheck.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,29 +31,24 @@ class HealthCheckMiddleware(object): def __init__(self, app, conf): self.app = app - self.conf = conf self.disable_path = conf.get('disable_path', '') def GET(self, req): """Returns a 200 response with "OK" in the body.""" - return Response(request=req, body="OK", content_type="text/plain") + return Response(request=req, body=b"OK", content_type="text/plain") def DISABLED(self, req): """Returns a 503 response with "DISABLED BY FILE" in the body.""" - return Response(request=req, status=503, body="DISABLED BY FILE", + return Response(request=req, status=503, body=b"DISABLED BY FILE", content_type="text/plain") def __call__(self, env, start_response): req = Request(env) - try: - if req.path == '/healthcheck': - handler = self.GET - if self.disable_path and os.path.exists(self.disable_path): - handler = self.DISABLED - return handler(req)(env, start_response) - except UnicodeError: - # definitely, this is not /healthcheck - pass + if req.path == '/healthcheck': + handler = self.GET + if self.disable_path and os.path.exists(self.disable_path): + handler = self.DISABLED + return handler(req)(env, start_response) return self.app(env, start_response) diff --git a/swift/common/middleware/keystoneauth.py b/swift/common/middleware/keystoneauth.py index b7cdd347cb..ff25b9f347 100644 --- a/swift/common/middleware/keystoneauth.py +++ b/swift/common/middleware/keystoneauth.py @@ -1,6 +1,4 @@ -# vim: tabstop=4 shiftwidth=4 softtabstop=4 - -# Copyright 2012 OpenStack LLC +# Copyright 2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -15,39 +13,51 @@ # under the License. from swift.common import utils as swift_utils +from swift.common.http import is_success from swift.common.middleware import acl as swift_acl +from swift.common.request_helpers import get_sys_meta_prefix from swift.common.swob import HTTPNotFound, HTTPForbidden, HTTPUnauthorized +from swift.common.utils import config_read_reseller_options, list_from_csv +from swift.proxy.controllers.base import get_account_info +import functools + +PROJECT_DOMAIN_ID_HEADER = 'x-account-project-domain-id' +PROJECT_DOMAIN_ID_SYSMETA_HEADER = \ + get_sys_meta_prefix('account') + 'project-domain-id' +# a string that is unique w.r.t valid ids +UNKNOWN_ID = '_unknown' class KeystoneAuth(object): """Swift middleware to Keystone authorization system. - In Swift's proxy-server.conf add this middleware to your pipeline:: - - [pipeline:main] - pipeline = catch_errors cache authtoken keystoneauth proxy-server - - Make sure you have the authtoken middleware before the - keystoneauth middleware. + In Swift's proxy-server.conf add this keystoneauth middleware and the + authtoken middleware to your pipeline. Make sure you have the authtoken + middleware before the keystoneauth middleware. 
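For example, a minimal pipeline satisfying that ordering (essentially the snippet this patch drops from the docstring in favour of pointing at the sample config) is:

    [pipeline:main]
    pipeline = catch_errors cache authtoken keystoneauth proxy-server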
The authtoken middleware will take care of validating the user and keystoneauth will authorize access. - The authtoken middleware is shipped directly with keystone it - does not have any other dependences than itself so you can either + The sample proxy-server.conf shows a sample pipeline that uses keystone. + + :download:`proxy-server.conf-sample ` + + The authtoken middleware is shipped with keystonemiddleware - it + does not have any other dependencies than itself so you can either install it by copying the file directly in your python path or by - installing keystone. + installing keystonemiddleware. If support is required for unvalidated users (as with anonymous - access) or for tempurl/formpost middleware, authtoken will need - to be configured with delay_auth_decision set to 1. See the - Keystone documentation for more detail on how to configure the + access) or for formpost/staticweb/tempurl middleware, authtoken will + need to be configured with ``delay_auth_decision`` set to true. See + the Keystone documentation for more detail on how to configure the authtoken middleware. In proxy-server.conf you will need to have the setting account auto creation to true:: - [app:proxy-server] account_autocreate = true + [app:proxy-server] + account_autocreate = true And add a swift authorization filter section, such as:: @@ -55,28 +65,107 @@ class KeystoneAuth(object): use = egg:swift#keystoneauth operator_roles = admin, swiftoperator - This maps tenants to account in Swift. - - The user whose able to give ACL / create Containers permissions - will be the one that are inside the operator_roles + The user who is able to give ACL / create Containers permissions + will be the user with a role listed in the ``operator_roles`` setting which by default includes the admin and the swiftoperator roles. - The option is_admin if set to true will allow the - username that has the same name as the account name to be the owner. - - Example: If we have the account called hellocorp with a user - hellocorp that user will be admin on that account and can give ACL - to all other users for hellocorp. + The keystoneauth middleware maps a Keystone project/tenant to an account + in Swift by adding a prefix (``AUTH_`` by default) to the tenant/project + id.. For example, if the project id is ``1234``, the path is + ``/v1/AUTH_1234``. If you need to have a different reseller_prefix to be able to mix different auth servers you can configure the option - reseller_prefix in your keystoneauth entry like this : + ``reseller_prefix`` in your keystoneauth entry like this:: + + reseller_prefix = NEWAUTH + + Don't forget to also update the Keystone service endpoint configuration to + use NEWAUTH in the path. + + It is possible to have several accounts associated with the same project. + This is done by listing several prefixes as shown in the following + example:: - reseller_prefix = NEWAUTH_ + reseller_prefix = AUTH, SERVICE - Make sure you have a underscore at the end of your new - reseller_prefix option. + This means that for project id '1234', the paths '/v1/AUTH_1234' and + '/v1/SERVICE_1234' are associated with the project and are authorized + using roles that a user has with that project. The core use of this feature + is that it is possible to provide different rules for each account + prefix. 
The following parameters may be prefixed with the appropriate + prefix:: + + operator_roles + service_roles + + For backward compatibility, if either of these parameters is specified + without a prefix then it applies to all reseller_prefixes. Here is an + example, using two prefixes:: + + reseller_prefix = AUTH, SERVICE + # The next three lines have identical effects (since the first applies + # to both prefixes). + operator_roles = admin, swiftoperator + AUTH_operator_roles = admin, swiftoperator + SERVICE_operator_roles = admin, swiftoperator + # The next line only applies to accounts with the SERVICE prefix + SERVICE_operator_roles = admin, some_other_role + + X-Service-Token tokens are supported by the inclusion of the service_roles + configuration option. When present, this option requires that the + X-Service-Token header supply a token from a user who has a role listed + in service_roles. Here is an example configuration:: + + reseller_prefix = AUTH, SERVICE + AUTH_operator_roles = admin, swiftoperator + SERVICE_operator_roles = admin, swiftoperator + SERVICE_service_roles = service + + The keystoneauth middleware supports cross-tenant access control using the + syntax ``:`` to specify a grantee in container Access Control + Lists (ACLs). For a request to be granted by an ACL, the grantee + ```` must match the UUID of the tenant to which the request + X-Auth-Token is scoped and the grantee ```` must match the UUID of + the user authenticated by that token. + + Note that names must no longer be used in cross-tenant ACLs because with + the introduction of domains in keystone names are no longer globally + unique. + + For backwards compatibility, ACLs using names will be granted by + keystoneauth when it can be established that the grantee tenant, + the grantee user and the tenant being accessed are either not yet in a + domain (e.g. the X-Auth-Token has been obtained via the keystone v2 + API) or are all in the default domain to which legacy accounts would + have been migrated. The default domain is identified by its UUID, + which by default has the value ``default``. This can be changed by + setting the ``default_domain_id`` option in the keystoneauth + configuration:: + + default_domain_id = default + + The backwards compatible behavior can be disabled by setting the config + option ``allow_names_in_acls`` to false:: + + allow_names_in_acls = false + + To enable this backwards compatibility, keystoneauth will attempt to + determine the domain id of a tenant when any new account is created, + and persist this as account metadata. If an account is created for a tenant + using a token with reselleradmin role that is not scoped on that tenant, + keystoneauth is unable to determine the domain id of the tenant; + keystoneauth will assume that the tenant may not be in the default domain + and therefore not match names in ACLs for that account. + + By default, middleware higher in the WSGI pipeline may override auth + processing, useful for middleware such as tempurl and formpost. 
If you know + you're not going to use such middleware and you want a bit of extra + security you can disable this behaviour by setting the ``allow_overrides`` + option to ``false``:: + + allow_overrides = false :param app: The next WSGI app in the pipeline :param conf: The dict of configuration values @@ -85,149 +174,360 @@ def __init__(self, app, conf): self.app = app self.conf = conf self.logger = swift_utils.get_logger(conf, log_route='keystoneauth') - self.reseller_prefix = conf.get('reseller_prefix', 'AUTH_').strip() - self.operator_roles = conf.get('operator_roles', - 'admin, swiftoperator') + self.reseller_prefixes, self.account_rules = \ + config_read_reseller_options(conf, + dict(operator_roles=['admin', + 'swiftoperator'], + service_roles=[], + project_reader_roles=[])) self.reseller_admin_role = conf.get('reseller_admin_role', - 'ResellerAdmin') + 'ResellerAdmin').lower() + self.system_reader_roles = {role.lower() for role in list_from_csv( + conf.get('system_reader_roles', ''))} + config_is_admin = conf.get('is_admin', "false").lower() - self.is_admin = swift_utils.config_true_value(config_is_admin) + if swift_utils.config_true_value(config_is_admin): + self.logger.warning("The 'is_admin' option for keystoneauth is no " + "longer supported. Remove the 'is_admin' " + "option from your keystoneauth config") + config_overrides = conf.get('allow_overrides', 't').lower() self.allow_overrides = swift_utils.config_true_value(config_overrides) + self.default_domain_id = conf.get('default_domain_id', 'default') + self.allow_names_in_acls = swift_utils.config_true_value( + conf.get('allow_names_in_acls', 'true')) def __call__(self, environ, start_response): - identity = self._keystone_identity(environ) + env_identity = self._keystone_identity(environ) # Check if one of the middleware like tempurl or formpost have # set the swift.authorize_override environ and want to control the # authentication if (self.allow_overrides and environ.get('swift.authorize_override', False)): - msg = 'Authorizing from an overriding middleware (i.e: tempurl)' + msg = 'Authorizing from an overriding middleware' self.logger.debug(msg) return self.app(environ, start_response) - if identity: - self.logger.debug('Using identity: %r' % (identity)) - environ['keystone.identity'] = identity - environ['REMOTE_USER'] = identity.get('tenant') - environ['swift.authorize'] = self.authorize + if env_identity: + self.logger.debug('Using identity: %r', env_identity) + environ['REMOTE_USER'] = env_identity.get('tenant') + environ['keystone.identity'] = env_identity + environ['swift.authorize'] = functools.partial( + self.authorize, env_identity) + user_roles = (r.lower() for r in env_identity.get('roles', [])) + if self.reseller_admin_role in user_roles: + environ['reseller_request'] = True + # Set access_user_id for consistent logging across auth middlewares + access_logging = environ.setdefault('swift.access_logging', {}) + user_id, user_name = env_identity.get('user', (None, None)) + access_logging['user_id'] = user_id or user_name else: self.logger.debug('Authorizing as anonymous') environ['swift.authorize'] = self.authorize_anonymous environ['swift.clean_acl'] = swift_acl.clean_acl - return self.app(environ, start_response) + def keystone_start_response(status, response_headers, exc_info=None): + project_domain_id = None + for key, val in response_headers: + if key.lower() == PROJECT_DOMAIN_ID_SYSMETA_HEADER: + project_domain_id = val + break + if project_domain_id: + response_headers.append((PROJECT_DOMAIN_ID_HEADER, + 
project_domain_id)) + return start_response(status, response_headers, exc_info) + + return self.app(environ, keystone_start_response) def _keystone_identity(self, environ): """Extract the identity from the Keystone auth component.""" - if environ.get('HTTP_X_IDENTITY_STATUS') != 'Confirmed': + if (environ.get('HTTP_X_IDENTITY_STATUS') != 'Confirmed' + or environ.get( + 'HTTP_X_SERVICE_IDENTITY_STATUS') not in (None, 'Confirmed')): return - roles = [] - if 'HTTP_X_ROLES' in environ: - roles = environ['HTTP_X_ROLES'].split(',') - identity = {'user': environ.get('HTTP_X_USER_NAME'), - 'tenant': (environ.get('HTTP_X_TENANT_ID'), - environ.get('HTTP_X_TENANT_NAME')), - 'roles': roles} + roles = list_from_csv(environ.get('HTTP_X_ROLES', '')) + service_roles = list_from_csv(environ.get('HTTP_X_SERVICE_ROLES', '')) + identity = {'user': (environ.get('HTTP_X_USER_ID'), + environ.get('HTTP_X_USER_NAME')), + 'tenant': (environ.get('HTTP_X_PROJECT_ID', + environ.get('HTTP_X_TENANT_ID')), + environ.get('HTTP_X_PROJECT_NAME', + environ.get('HTTP_X_TENANT_NAME'))), + 'roles': roles, + 'service_roles': service_roles} + token_info = environ.get('keystone.token_info', {}) + auth_version = 0 + user_domain = project_domain = (None, None) + if 'access' in token_info: + # ignore any domain id headers that authtoken may have set + auth_version = 2 + elif 'token' in token_info: + auth_version = 3 + user_domain = (environ.get('HTTP_X_USER_DOMAIN_ID'), + environ.get('HTTP_X_USER_DOMAIN_NAME')) + project_domain = (environ.get('HTTP_X_PROJECT_DOMAIN_ID'), + environ.get('HTTP_X_PROJECT_DOMAIN_NAME')) + identity['user_domain'] = user_domain + identity['project_domain'] = project_domain + identity['auth_version'] = auth_version return identity - def _get_account_for_tenant(self, tenant_id): - return '%s%s' % (self.reseller_prefix, tenant_id) + def _get_account_name(self, prefix, tenant_id): + return '%s%s' % (prefix, tenant_id) - def _reseller_check(self, account, tenant_id): - """Check reseller prefix.""" - return account == self._get_account_for_tenant(tenant_id) + def _account_matches_tenant(self, account, tenant_id): + """Check if account belongs to a project/tenant""" + for prefix in self.reseller_prefixes: + if self._get_account_name(prefix, tenant_id) == account: + return True + return False + + def _get_account_prefix(self, account): + """Get the prefix of an account""" + # Empty prefix matches everything, so try to match others first + for prefix in [pre for pre in self.reseller_prefixes if pre != '']: + if account.startswith(prefix): + return prefix + if '' in self.reseller_prefixes: + return '' + return None + + def _get_project_domain_id(self, environ): + info = get_account_info(environ, self.app, 'KS') + domain_id = info.get('sysmeta', {}).get('project-domain-id') + exists = (is_success(info.get('status', 0)) + and info.get('account_really_exists', True)) + return exists, domain_id + + def _set_project_domain_id(self, req, path_parts, env_identity): + ''' + Try to determine the project domain id and save it as + account metadata. Do this for a PUT or POST to the + account, and also for a container PUT in case that + causes the account to be auto-created. 
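Concretely (value invented), once the domain id has been determined for a new account it is persisted as account sysmeta of the form below; the header name follows from get_sys_meta_prefix('account') plus 'project-domain-id':

    X-Account-Sysmeta-Project-Domain-Id: default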
+ ''' + if PROJECT_DOMAIN_ID_SYSMETA_HEADER in req.headers: + return + + version, account, container, obj = path_parts + method = req.method + if (obj or (container and method != 'PUT') + or method not in ['PUT', 'POST']): + return - def _authorize_cross_tenant(self, user, tenant_id, tenant_name, roles): - """ Check cross-tenant ACLs + tenant_id, tenant_name = env_identity['tenant'] + exists, sysmeta_id = self._get_project_domain_id(req.environ) + req_has_id, req_id, new_id = False, None, None + if self._account_matches_tenant(account, tenant_id): + # domain id can be inferred from request (may be None) + req_has_id = True + req_id = env_identity['project_domain'][0] + if not exists: + # new account so set a domain id + new_id = req_id if req_has_id else UNKNOWN_ID + elif sysmeta_id is None and req_id == self.default_domain_id: + # legacy account, update if default domain id in req + new_id = req_id + elif sysmeta_id == UNKNOWN_ID and req_has_id: + # unknown domain, update if req confirms domain + new_id = req_id or '' + elif req_has_id and sysmeta_id != req_id: + self.logger.warning("Inconsistent project domain id: " + + "%s in token vs %s in account metadata." + % (req_id, sysmeta_id)) + + if new_id is not None: + req.headers[PROJECT_DOMAIN_ID_SYSMETA_HEADER] = new_id + + def _is_name_allowed_in_acl(self, req, path_parts, identity): + if not self.allow_names_in_acls: + return False + user_domain_id = identity['user_domain'][0] + if user_domain_id and user_domain_id != self.default_domain_id: + return False - Match tenant_id:user, tenant_name:user, and *:user. + proj_domain_id = identity['project_domain'][0] + if proj_domain_id and proj_domain_id != self.default_domain_id: + return False - :param user: The user name from the identity token. + # request user and scoped project are both in default domain + tenant_id, tenant_name = identity['tenant'] + version, account, container, obj = path_parts + if self._account_matches_tenant(account, tenant_id): + # account == scoped project, so account is also in default domain + allow = True + else: + # retrieve account project domain id from account sysmeta + exists, acc_domain_id = self._get_project_domain_id(req.environ) + allow = exists and acc_domain_id in [self.default_domain_id, None] + if allow: + self.logger.debug("Names allowed in acls.") + return allow + + def _authorize_cross_tenant(self, user_id, user_name, + tenant_id, tenant_name, roles, + allow_names=True): + """Check cross-tenant ACLs. + + Match tenant:user, tenant and user could be its id, name or '*' + + :param user_id: The user id from the identity token. + :param user_name: The user name from the identity token. :param tenant_id: The tenant ID from the identity token. :param tenant_name: The tenant name from the identity token. :param roles: The given container ACL. + :param allow_names: If True then attempt to match tenant and user names + as well as id's. - :returns: True if tenant_id:user, tenant_name:user, or *:user matches - the given ACL. False otherwise. + :returns: matched string if tenant(name/id/*):user(name/id/*) matches + the given ACL. + None otherwise. 
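To make the matching rule concrete, here is a standalone sketch that mirrors the logic (the ids, names and ACL entries are invented; the real check is the _authorize_cross_tenant method itself):

    def cross_tenant_match(user_id, user_name, tenant_id, tenant_name, acl,
                           allow_names=True):
        # Build the candidate grantee strings and return the first one
        # present in the ACL, or None.
        tenants = [tenant_id, '*'] + ([tenant_name] if allow_names else [])
        users = [user_id, '*'] + ([user_name] if allow_names else [])
        for tenant in tenants:
            for user in users:
                candidate = '%s:%s' % (tenant, user)
                if candidate in acl:
                    return candidate
        return None

    acl = ['a1b2c3:u4v5w6', '*:roving-auditor-id']
    assert cross_tenant_match('u4v5w6', 'bob', 'a1b2c3', 'acme', acl) == 'a1b2c3:u4v5w6'
    assert cross_tenant_match('roving-auditor-id', 'anna', 'zzz9', 'other', acl) == '*:roving-auditor-id'
    assert cross_tenant_match('nobody', 'carl', 'zzz9', 'other', acl) is None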
""" - wildcard_tenant_match = '*:%s' % (user) - tenant_id_user_match = '%s:%s' % (tenant_id, user) - tenant_name_user_match = '%s:%s' % (tenant_name, user) - - return (wildcard_tenant_match in roles - or tenant_id_user_match in roles - or tenant_name_user_match in roles) - - def authorize(self, req): - env = req.environ - env_identity = env.get('keystone.identity', {}) - tenant_id, tenant_name = env_identity.get('tenant') - user = env_identity.get('user', '') + tenant_match = [tenant_id, '*'] + user_match = [user_id, '*'] + if allow_names: + tenant_match = tenant_match + [tenant_name] + user_match = user_match + [user_name] + for tenant in tenant_match: + for user in user_match: + s = '%s:%s' % (tenant, user) + if s in roles: + return s + return None + + def authorize(self, env_identity, req): + # Cleanup - make sure that a previously set swift_owner setting is + # cleared now. This might happen for example with COPY requests. + req.environ.pop('swift_owner', None) + + tenant_id, tenant_name = env_identity['tenant'] + user_id, user_name = env_identity['user'] referrers, roles = swift_acl.parse_acl(getattr(req, 'acl', None)) + # allow OPTIONS requests to proceed as normal + if req.method == 'OPTIONS': + return + try: - part = swift_utils.split_path(req.path, 1, 4, True) + part = req.split_path(1, 4, True) version, account, container, obj = part except ValueError: return HTTPNotFound(request=req) - user_roles = env_identity.get('roles', []) + self._set_project_domain_id(req, part, env_identity) - # Give unconditional access to a user with the reseller_admin - # role. + user_roles = [r.lower() for r in env_identity.get('roles', [])] + user_service_roles = [r.lower() for r in env_identity.get( + 'service_roles', [])] + + # Give unconditional access to a user with the reseller_admin role. if self.reseller_admin_role in user_roles: msg = 'User %s has reseller admin authorizing' - self.logger.debug(msg % tenant_id) + self.logger.debug(msg, tenant_id) req.environ['swift_owner'] = True return + # Being in system_reader_roles is almost as good as reseller_admin. + if self.system_reader_roles.intersection(user_roles): + # Note that if a system reader is trying to write, we're letting + # the request fall on other access checks below. This way, + # a compliance auditor can write a log file as a normal member. + if req.method in ('GET', 'HEAD'): + msg = 'User %s has system reader authorizing' + self.logger.debug(msg, tenant_id) + # We aren't setting 'swift_owner' nor 'reseller_request' + # because they are only ever used for something that modifies + # the contents of the cluster (setting ACL, deleting accounts). + return + + # If we are not reseller admin and user is trying to delete its own + # account then deny it. 
+ if not container and not obj and req.method == 'DELETE': + # User is not allowed to issue a DELETE on its own account + msg = 'User %s:%s is not allowed to delete its own account' + self.logger.debug(msg, tenant_name, user_name) + return self.denied_response(req) + # cross-tenant authorization - if self._authorize_cross_tenant(user, tenant_id, tenant_name, roles): - log_msg = 'user %s:%s, %s:%s, or *:%s allowed in ACL authorizing' - self.logger.debug(log_msg % (tenant_name, user, - tenant_id, user, user)) + matched_acl = None + if roles: + allow_names = self._is_name_allowed_in_acl(req, part, env_identity) + matched_acl = self._authorize_cross_tenant(user_id, user_name, + tenant_id, tenant_name, + roles, allow_names) + if matched_acl is not None: + log_msg = 'user %s allowed in ACL authorizing.' + self.logger.debug(log_msg, matched_acl) + return + + acl_authorized = self._authorize_unconfirmed_identity(req, obj, + referrers, + roles) + if acl_authorized: return # Check if a user tries to access an account that does not match their # token - if not self._reseller_check(account, tenant_id): - log_msg = 'tenant mismatch: %s != %s' % (account, tenant_id) - self.logger.debug(log_msg) + if not self._account_matches_tenant(account, tenant_id): + log_msg = 'tenant mismatch: %s != %s' + self.logger.debug(log_msg, account, tenant_id) return self.denied_response(req) - # Check the roles the user is belonging to. If the user is - # part of the role defined in the config variable - # operator_roles (like admin) then it will be - # promoted as an admin of the account/tenant. - for role in self.operator_roles.split(','): - role = role.strip() - if role in user_roles: - log_msg = 'allow user with role %s as account admin' % (role) - self.logger.debug(log_msg) - req.environ['swift_owner'] = True - return - - # If user is of the same name of the tenant then make owner of it. - if self.is_admin and user == tenant_name: + # Compare roles from tokens against the configuration options: + # + # X-Auth-Token role Has specified X-Service-Token role Grant + # in operator_roles? service_roles? in service_roles? swift_owner? + # ------------------ -------------- -------------------- ------------ + # yes yes yes yes + # yes yes no no + # yes no don't care yes + # no don't care don't care no + # ------------------ -------------- -------------------- ------------ + account_prefix = self._get_account_prefix(account) + operator_roles = self.account_rules[account_prefix]['operator_roles'] + have_operator_role = set(operator_roles).intersection( + set(user_roles)) + service_roles = self.account_rules[account_prefix]['service_roles'] + have_service_role = set(service_roles).intersection( + set(user_service_roles)) + allowed = False + if have_operator_role and (service_roles and have_service_role): + allowed = True + elif have_operator_role and not service_roles: + allowed = True + if allowed: + log_msg = 'allow user with role(s) %s as account admin' + self.logger.debug(log_msg, ','.join(have_operator_role.union( + have_service_role))) req.environ['swift_owner'] = True return - authorized = self._authorize_unconfirmed_identity(req, obj, referrers, - roles) - if authorized: - return - elif authorized is not None: + # The project_reader_roles is almost as good as operator_roles. But + # it does not work with service tokens and does not get 'swift_owner'. + # And, it only serves GET requests, obviously. 
+ project_reader_roles = self.account_rules[account_prefix][ + 'project_reader_roles'] + have_reader_role = set(project_reader_roles).intersection( + set(user_roles)) + if have_reader_role: + if req.method in ('GET', 'HEAD'): + msg = 'User %s with role(s) %s has project reader authorizing' + self.logger.debug(msg, tenant_id, + ','.join(project_reader_roles)) + return + + if acl_authorized is not None: return self.denied_response(req) # Check if we have the role in the userroles and allow it for user_role in user_roles: - if user_role in roles: + if user_role in (r.lower() for r in roles): log_msg = 'user %s:%s allowed in ACL: %s authorizing' - self.logger.debug(log_msg % (tenant_name, user, user_role)) + self.logger.debug(log_msg, tenant_name, user_name, + user_role) return return self.denied_response(req) @@ -239,13 +539,18 @@ def authorize_anonymous(self, req): :returns: None if authorization is granted, an error page otherwise. """ try: - part = swift_utils.split_path(req.path, 1, 4, True) + part = req.split_path(1, 4, True) version, account, container, obj = part except ValueError: return HTTPNotFound(request=req) + # allow OPTIONS requests to proceed as normal + if req.method == 'OPTIONS': + return + is_authoritative_authz = (account and - account.startswith(self.reseller_prefix)) + (self._get_account_prefix(account) in + self.reseller_prefixes)) if not is_authoritative_authz: return self.denied_response(req) @@ -268,15 +573,15 @@ def _authorize_unconfirmed_identity(self, req, obj, referrers, roles): and (req.environ['swift_sync_key'] == req.headers.get('x-container-sync-key', None)) and 'x-timestamp' in req.headers): - log_msg = 'allowing proxy %s for container-sync' % req.remote_addr - self.logger.debug(log_msg) + log_msg = 'allowing proxy %s for container-sync' + self.logger.debug(log_msg, req.remote_addr) return True # Check if referrer is allowed. if swift_acl.referrer_allowed(req.referer, referrers): if obj or '.rlistings' in roles: - log_msg = 'authorizing %s via referer ACL' % req.referrer - self.logger.debug(log_msg) + log_msg = 'authorizing %s via referer ACL' + self.logger.debug(log_msg, req.referrer) return True return False diff --git a/swift/common/middleware/list_endpoints.py b/swift/common/middleware/list_endpoints.py new file mode 100644 index 0000000000..edceec421e --- /dev/null +++ b/swift/common/middleware/list_endpoints.py @@ -0,0 +1,264 @@ +# Copyright (c) 2012 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +List endpoints for an object, account or container. + +This middleware makes it possible to integrate swift with software +that relies on data locality information to avoid network overhead, +such as Hadoop. 
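(Illustrative, not part of the patch.) Deploying the middleware amounts to a filter section plus an entry in the proxy pipeline, along these lines; the section name is arbitrary and the commented option shows its default:

    [filter:list-endpoints]
    use = egg:swift#list_endpoints
    # list_endpoints_path = /endpoints/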
+ +Using the original API, answers requests of the form:: + + /endpoints/{account}/{container}/{object} + /endpoints/{account}/{container} + /endpoints/{account} + /endpoints/v1/{account}/{container}/{object} + /endpoints/v1/{account}/{container} + /endpoints/v1/{account} + +with a JSON-encoded list of endpoints of the form:: + + http://{server}:{port}/{dev}/{part}/{acc}/{cont}/{obj} + http://{server}:{port}/{dev}/{part}/{acc}/{cont} + http://{server}:{port}/{dev}/{part}/{acc} + +correspondingly, e.g.:: + + http://10.1.1.1:6200/sda1/2/a/c2/o1 + http://10.1.1.1:6200/sda1/2/a/c2 + http://10.1.1.1:6200/sda1/2/a + +Using the v2 API, answers requests of the form:: + + /endpoints/v2/{account}/{container}/{object} + /endpoints/v2/{account}/{container} + /endpoints/v2/{account} + +with a JSON-encoded dictionary containing a key 'endpoints' that maps to a list +of endpoints having the same form as described above, and a key 'headers' that +maps to a dictionary of headers that should be sent with a request made to +the endpoints, e.g.:: + + { "endpoints": {"http://10.1.1.1:6210/sda1/2/a/c3/o1", + "http://10.1.1.1:6230/sda3/2/a/c3/o1", + "http://10.1.1.1:6240/sda4/2/a/c3/o1"}, + "headers": {"X-Backend-Storage-Policy-Index": "1"}} + +In this example, the 'headers' dictionary indicates that requests to the +endpoint URLs should include the header 'X-Backend-Storage-Policy-Index: 1' +because the object's container is using storage policy index 1. + +The '/endpoints/' path is customizable ('list_endpoints_path' +configuration parameter). + +Intended for consumption by third-party services living inside the +cluster (as the endpoints make sense only inside the cluster behind +the firewall); potentially written in a different language. + +This is why it's provided as a REST API and not just a Python API: +to avoid requiring clients to write their own ring parsers in their +languages, and to avoid the necessity to distribute the ring file +to clients and keep it up-to-date. + +Note that the call is not authenticated, which means that a proxy +with this middleware enabled should not be open to an untrusted +environment (everyone can query the locality data using this middleware). +""" + +import json + +from urllib.parse import quote, unquote + +from swift.common.ring import Ring +from swift.common.utils import get_logger, split_path +from swift.common.swob import Request, Response +from swift.common.swob import HTTPBadRequest, HTTPMethodNotAllowed +from swift.common.storage_policy import POLICIES +from swift.proxy.controllers.base import get_container_info + +RESPONSE_VERSIONS = (1.0, 2.0) + + +class ListEndpointsMiddleware(object): + """ + List endpoints for an object, account or container. + + See above for a full description. + + Uses configuration parameter `swift_dir` (default `/etc/swift`). + + :param app: The next WSGI filter or app in the paste.deploy + chain. + :param conf: The configuration dict for the middleware. 
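As a hypothetical consumer of the v2 listing described above (the proxy address and the account/container/object names are invented):

    import json
    from urllib.request import urlopen

    # Ask the proxy where the replicas of AUTH_test/jobs/part-0001 live.
    url = 'http://127.0.0.1:8080/endpoints/v2/AUTH_test/jobs/part-0001'
    with urlopen(url) as resp:
        info = json.load(resp)
    for endpoint in info['endpoints']:
        # A locality-aware scheduler can prefer the nodes named here, passing
        # info['headers'] along with any direct request to them.
        print(endpoint, info['headers'])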
+ """ + + def __init__(self, app, conf): + self.app = app + self.logger = get_logger(conf, log_route='endpoints') + self.swift_dir = conf.get('swift_dir', '/etc/swift') + self.account_ring = Ring(self.swift_dir, ring_name='account') + self.container_ring = Ring(self.swift_dir, ring_name='container') + self.endpoints_path = conf.get('list_endpoints_path', '/endpoints/') + if not self.endpoints_path.endswith('/'): + self.endpoints_path += '/' + self.default_response_version = 1.0 + self.response_map = { + 1.0: self.v1_format_response, + 2.0: self.v2_format_response, + } + + def get_object_ring(self, policy_idx): + """ + Get the ring object to use to handle a request based on its policy. + + :policy_idx: policy index as defined in swift.conf + :returns: appropriate ring object + """ + return POLICIES.get_object_ring(policy_idx, self.swift_dir) + + def _parse_version(self, raw_version): + err_msg = 'Unsupported version %r' % raw_version + try: + version = float(raw_version.lstrip('v')) + except ValueError: + raise ValueError(err_msg) + if not any(version == v for v in RESPONSE_VERSIONS): + raise ValueError(err_msg) + return version + + def _parse_path(self, request): + """ + Parse path parts of request into a tuple of version, account, + container, obj. Unspecified container or obj is filled in as + None; account is required; version is always returned as a + float using the configured default response version if not + specified in the request. + + :param request: the swob request + + :returns: parsed path parts as a tuple with version filled in as + configured default response version if not specified. + :raises ValueError: if path is invalid, message will say why. + """ + clean_path = request.path[len(self.endpoints_path) - 1:] + # try to peel off version + try: + raw_version, rest = split_path(clean_path, 1, 2, True) + except ValueError: + raise ValueError('No account specified') + try: + version = self._parse_version(raw_version) + except ValueError: + if raw_version.startswith('v') and '_' not in raw_version: + # looks more like an invalid version than an account + raise + # probably no version specified, but if the client really + # said /endpoints/v_3/account they'll probably be sorta + # confused by the useless response and lack of error. 
+ version = self.default_response_version + rest = clean_path + else: + rest = '/' + rest if rest else '/' + try: + account, container, obj = split_path(rest, 1, 3, True) + except ValueError: + raise ValueError('No account specified') + return version, account, container, obj + + def v1_format_response(self, req, endpoints, **kwargs): + return Response(json.dumps(endpoints), + content_type='application/json') + + def v2_format_response(self, req, endpoints, storage_policy_index, + **kwargs): + resp = { + 'endpoints': endpoints, + 'headers': {}, + } + if storage_policy_index is not None: + resp['headers'][ + 'X-Backend-Storage-Policy-Index'] = str(storage_policy_index) + return Response(json.dumps(resp), + content_type='application/json') + + def __call__(self, env, start_response): + request = Request(env) + if not request.path.startswith(self.endpoints_path): + return self.app(env, start_response) + + if request.method != 'GET': + return HTTPMethodNotAllowed( + req=request, headers={"Allow": "GET"})(env, start_response) + + try: + version, account, container, obj = self._parse_path(request) + except ValueError as err: + return HTTPBadRequest(str(err))(env, start_response) + + account = unquote(account) + if container is not None: + container = unquote(container) + if obj is not None: + obj = unquote(obj) + + storage_policy_index = None + if obj is not None: + container_info = get_container_info( + {'PATH_INFO': '/v1/%s/%s' % (account, container)}, + self.app, swift_source='LE') + storage_policy_index = container_info['storage_policy'] + obj_ring = self.get_object_ring(storage_policy_index) + partition, nodes = obj_ring.get_nodes( + account, container, obj) + endpoint_template = 'http://{ip}:{port}/{device}/{partition}/' + \ + '{account}/{container}/{obj}' + elif container is not None: + partition, nodes = self.container_ring.get_nodes( + account, container) + endpoint_template = 'http://{ip}:{port}/{device}/{partition}/' + \ + '{account}/{container}' + else: + partition, nodes = self.account_ring.get_nodes( + account) + endpoint_template = 'http://{ip}:{port}/{device}/{partition}/' + \ + '{account}' + + endpoints = [] + for node in nodes: + endpoint = endpoint_template.format( + ip=node['ip'], + port=node['port'], + device=node['device'], + partition=partition, + account=quote(account), + container=quote(container or ''), + obj=quote(obj or '')) + endpoints.append(endpoint) + + resp = self.response_map[version]( + request, endpoints=endpoints, + storage_policy_index=storage_policy_index) + return resp(env, start_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def list_endpoints_filter(app): + return ListEndpointsMiddleware(app, conf) + + return list_endpoints_filter diff --git a/swift/common/middleware/listing_formats.py b/swift/common/middleware/listing_formats.py new file mode 100644 index 0000000000..290a73152a --- /dev/null +++ b/swift/common/middleware/listing_formats.py @@ -0,0 +1,265 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from xml.etree.cElementTree import Element, SubElement, tostring + +from swift.common.constraints import valid_api_version +from swift.common.header_key_dict import HeaderKeyDict +from swift.common.http import HTTP_NO_CONTENT +from swift.common.request_helpers import get_param +from swift.common.swob import HTTPException, HTTPNotAcceptable, Request, \ + RESPONSE_REASONS, HTTPBadRequest, wsgi_quote, wsgi_to_bytes +from swift.common.utils import RESERVED, get_logger, list_from_csv + + +#: Mapping of query string ``format=`` values to their corresponding +#: content-type values. +FORMAT2CONTENT_TYPE = {'plain': 'text/plain', 'json': 'application/json', + 'xml': 'application/xml'} +#: Maximum size of a valid JSON container listing body. If we receive +#: a container listing response larger than this, assume it's a staticweb +#: response and pass it on to the client. +# Default max object length is 1024, default container listing limit is 1e4; +# add a fudge factor for things like hash, last_modified, etc. +MAX_CONTAINER_LISTING_CONTENT_LENGTH = 1024 * 10000 * 2 + + +def get_listing_content_type(req): + """ + Determine the content type to use for an account or container listing + response. + + :param req: request object + :returns: content type as a string (e.g. text/plain, application/json) + :raises HTTPNotAcceptable: if the requested content type is not acceptable + :raises HTTPBadRequest: if the 'format' query param is provided and + not valid UTF-8 + """ + query_format = get_param(req, 'format') + if query_format: + req.accept = FORMAT2CONTENT_TYPE.get( + query_format.lower(), FORMAT2CONTENT_TYPE['plain']) + try: + out_content_type = req.accept.best_match( + ['text/plain', 'application/json', 'application/xml', 'text/xml']) + except ValueError: + raise HTTPBadRequest(request=req, body=b'Invalid Accept header') + if not out_content_type: + raise HTTPNotAcceptable(request=req) + return out_content_type + + +def to_xml(document_element): + result = tostring(document_element, encoding='UTF-8').replace( + b"", + b'', 1) + if not result.startswith(b'\n' + result + return result + + +def account_to_xml(listing, account_name): + doc = Element('account', name=account_name) + doc.text = '\n' + for record in listing: + if 'subdir' in record: + name = record.pop('subdir') + sub = SubElement(doc, 'subdir', name=name) + else: + sub = SubElement(doc, 'container') + for field in ('name', 'count', 'bytes', 'last_modified'): + SubElement(sub, field).text = str(record.pop(field)) + for field in ('storage_policy',): + if field in record: + SubElement(sub, field).text = str(record.pop(field)) + sub.tail = '\n' + return to_xml(doc) + + +def container_to_xml(listing, base_name): + doc = Element('container', name=base_name) + for record in listing: + if 'subdir' in record: + name = record.pop('subdir') + sub = SubElement(doc, 'subdir', name=name) + SubElement(sub, 'name').text = name + else: + sub = SubElement(doc, 'object') + for field in ('name', 'hash', 'bytes', 'content_type', + 'last_modified'): + SubElement(sub, field).text = str(record.pop(field)) + return to_xml(doc) + + +def listing_to_text(listing): + def get_lines(): + for item in listing: + if 'name' in item: + yield item['name'].encode('utf-8') + b'\n' + else: + yield item['subdir'].encode('utf-8') + b'\n' + return b''.join(get_lines()) + + +class ListingFilter(object): + def __init__(self, app, conf, logger=None): + self.app = app 
+ self.logger = logger or get_logger(conf, log_route='listing-filter') + + def filter_reserved(self, listing, account, container): + new_listing = [] + for entry in list(listing): + for key in ('name', 'subdir'): + value = entry.get(key, '') + if RESERVED in value: + if container: + self.logger.warning( + 'Container listing for %s/%s had ' + 'reserved byte in %s: %r', + wsgi_quote(account), wsgi_quote(container), + key, value) + else: + self.logger.warning( + 'Account listing for %s had ' + 'reserved byte in %s: %r', + wsgi_quote(account), key, value) + break # out of the *key* loop; check next entry + else: + new_listing.append(entry) + return new_listing + + def __call__(self, env, start_response): + req = Request(env) + try: + # account and container only + version, acct, cont = req.split_path(2, 3) + except ValueError: + is_account_or_container_req = False + else: + is_account_or_container_req = True + if not is_account_or_container_req: + return self.app(env, start_response) + + if not valid_api_version(version) or req.method not in ('GET', 'HEAD'): + return self.app(env, start_response) + + # OK, definitely have an account/container request. + # Get the desired content-type, then force it to a JSON request. + try: + out_content_type = get_listing_content_type(req) + except HTTPException as err: + return err(env, start_response) + + params = req.params + can_vary = 'format' not in params + params['format'] = 'json' + req.params = params + + # Give other middlewares a chance to be in charge + env.setdefault('swift.format_listing', True) + status, headers, resp_iter = req.call_application(self.app) + if not env.get('swift.format_listing'): + start_response(status, headers) + return resp_iter + + if not status.startswith(('200 ', '204 ')): + start_response(status, headers) + return resp_iter + + headers_dict = HeaderKeyDict(headers) + resp_content_type = headers_dict.get( + 'content-type', '').partition(';')[0] + resp_length = headers_dict.get('content-length') + + if can_vary: + if 'vary' in headers_dict: + value = headers_dict['vary'] + if 'accept' not in list_from_csv(value.lower()): + headers_dict['vary'] = value + ', Accept' + else: + headers_dict['vary'] = 'Accept' + + if resp_content_type != 'application/json': + start_response(status, list(headers_dict.items())) + return resp_iter + + if req.method == 'HEAD': + headers_dict['content-type'] = out_content_type + '; charset=utf-8' + # proxy logging (and maybe other mw?) seem to be good about + # sticking this on HEAD/204 but we do it here to be responsible + # and explicit + headers_dict['content-length'] = 0 + start_response(status, list(headers_dict.items())) + return resp_iter + + if resp_length is None or \ + int(resp_length) > MAX_CONTAINER_LISTING_CONTENT_LENGTH: + start_response(status, list(headers_dict.items())) + return resp_iter + + body = b''.join(resp_iter) + try: + listing = json.loads(body) + # Do a couple sanity checks + if not isinstance(listing, list): + raise ValueError + if not all(isinstance(item, dict) for item in listing): + raise ValueError + except ValueError: + # Static web listing that's returning invalid JSON? + # Just pass it straight through; that's about all we *can* do. 
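# For reference (annotation, not part of the patch; field values are made
# up): a listing body that passes the sanity checks above is a JSON array of
# dicts, roughly
#     [{"name": "obj1", "hash": "d41d8cd9...", "bytes": 0,
#       "content_type": "text/plain",
#       "last_modified": "2024-01-01T00:00:00.000000"},
#      {"subdir": "photos/"}]
# Anything else -- a bare object, a string, a list of non-dicts -- is passed
# through untouched on the assumption that staticweb or similar middleware
# generated it.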
+ start_response(status, list(headers_dict.items())) + return [body] + + if not req.allow_reserved_names: + listing = self.filter_reserved(listing, acct, cont) + + try: + if out_content_type.endswith('/xml'): + if cont: + body = container_to_xml( + listing, wsgi_to_bytes(cont).decode('utf-8')) + else: + body = account_to_xml( + listing, wsgi_to_bytes(acct).decode('utf-8')) + elif out_content_type == 'text/plain': + body = listing_to_text(listing) + else: + body = json.dumps(listing).encode('ascii') + except KeyError: + # listing was in a bad format -- funky static web listing?? + start_response(status, list(headers_dict.items())) + return [body] + + if not body: + status = '%s %s' % (HTTP_NO_CONTENT, + RESPONSE_REASONS[HTTP_NO_CONTENT][0]) + + headers_dict['content-type'] = out_content_type + '; charset=utf-8' + headers_dict['content-length'] = len(body) + start_response(status, list(headers_dict.items())) + return [body] + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + def listing_filter(app): + return ListingFilter(app, conf) + return listing_filter diff --git a/swift/common/middleware/memcache.py b/swift/common/middleware/memcache.py index 13e16d4c68..1bb142657d 100644 --- a/swift/common/middleware/memcache.py +++ b/swift/common/middleware/memcache.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from ConfigParser import ConfigParser, NoSectionError, NoOptionError - -from swift.common.memcached import MemcacheRing +from swift.common.memcached import load_memcache +from swift.common.utils import get_logger class MemcacheMiddleware(object): @@ -26,39 +24,8 @@ class MemcacheMiddleware(object): def __init__(self, app, conf): self.app = app - self.memcache_servers = conf.get('memcache_servers') - serialization_format = conf.get('memcache_serialization_support') - - if not self.memcache_servers or serialization_format is None: - path = os.path.join(conf.get('swift_dir', '/etc/swift'), - 'memcache.conf') - memcache_conf = ConfigParser() - if memcache_conf.read(path): - if not self.memcache_servers: - try: - self.memcache_servers = \ - memcache_conf.get('memcache', 'memcache_servers') - except (NoSectionError, NoOptionError): - pass - if serialization_format is None: - try: - serialization_format = \ - memcache_conf.get('memcache', - 'memcache_serialization_support') - except (NoSectionError, NoOptionError): - pass - - if not self.memcache_servers: - self.memcache_servers = '127.0.0.1:11211' - if serialization_format is None: - serialization_format = 2 - else: - serialization_format = int(serialization_format) - - self.memcache = MemcacheRing( - [s.strip() for s in self.memcache_servers.split(',') if s.strip()], - allow_pickle=(serialization_format == 0), - allow_unpickle=(serialization_format <= 1)) + self.logger = get_logger(conf, log_route='memcache') + self.memcache = load_memcache(conf, self.logger) def __call__(self, env, start_response): env['swift.cache'] = self.memcache diff --git a/swift/common/middleware/name_check.py b/swift/common/middleware/name_check.py index 39411690c9..b13c5a76bd 100644 --- a/swift/common/middleware/name_check.py +++ b/swift/common/middleware/name_check.py @@ -1,4 +1,4 @@ -# 
Copyright (c) 2012 OpenStack, LLC. +# Copyright (c) 2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ ''' Created on February 27, 2012 -A filter that disallows any paths that contain defined forbidden characters -or that exceed a defined length. +A filter that disallows any paths that contain defined forbidden characters or +that exceed a defined length. -Place in proxy filter before proxy, e.g. +Place early in the proxy-server pipeline after the left-most occurrence of the +``proxy-logging`` middleware (if present) and before the final +``proxy-logging`` middleware (if present) or the ``proxy-serer`` app itself, +e.g.:: [pipeline:main] - pipeline = catch_errors healthcheck name_check cache ratelimit tempauth sos - proxy-logging proxy-server + pipeline = catch_errors healthcheck proxy-logging name_check cache \ +ratelimit tempauth sos proxy-logging proxy-server [filter:name_check] use = egg:swift#name_check @@ -39,14 +42,14 @@ import re from swift.common.utils import get_logger -from urllib2 import unquote +from swift.common.registry import register_swift_info from swift.common.swob import Request, HTTPBadRequest FORBIDDEN_CHARS = "\'\"`<>" MAX_LENGTH = 255 -FORBIDDEN_REGEXP = "/\./|/\.\./|/\.$|/\.\.$" +FORBIDDEN_REGEXP = r"/\./|/\.\./|/\.$|/\.\.$" class NameCheckMiddleware(object): @@ -56,7 +59,7 @@ def __init__(self, app, conf): self.conf = conf self.forbidden_chars = self.conf.get('forbidden_chars', FORBIDDEN_CHARS) - self.maximum_length = self.conf.get('maximum_length', MAX_LENGTH) + self.maximum_length = int(self.conf.get('maximum_length', MAX_LENGTH)) self.forbidden_regexp = self.conf.get('forbidden_regexp', FORBIDDEN_REGEXP) if self.forbidden_regexp: @@ -65,6 +68,15 @@ def __init__(self, app, conf): self.forbidden_regexp_compiled = None self.logger = get_logger(self.conf, log_route='name_check') + self.register_info() + + def register_info(self): + register_swift_info('name_check', + forbidden_chars=self.forbidden_chars, + maximum_length=self.maximum_length, + forbidden_regexp=self.forbidden_regexp + ) + def check_character(self, req): ''' Checks req.path for any forbidden characters @@ -75,12 +87,7 @@ def check_character(self, req): self.logger.debug("name_check: self.forbidden_chars %s" % self.forbidden_chars) - for c in unquote(req.path): - if c in self.forbidden_chars: - return True - else: - pass - return False + return any((c in req.path_info) for c in self.forbidden_chars) def check_length(self, req): ''' @@ -88,11 +95,8 @@ def check_length(self, req): Returns True if the length exceeds the maximum Returns False if the length is <= the maximum ''' - length = len(unquote(req.path)) - if length > self.maximum_length: - return True - else: - return False + length = len(req.path_info) + return length > self.maximum_length def check_regexp(self, req): ''' @@ -107,8 +111,7 @@ def check_regexp(self, req): self.logger.debug("name_check: self.forbidden_regexp %s" % self.forbidden_regexp) - unquoted_path = unquote(req.path) - match = self.forbidden_regexp_compiled.search(unquoted_path) + match = self.forbidden_regexp_compiled.search(req.path_info) return (match is not None) def __call__(self, env, start_response): @@ -117,18 +120,20 @@ def __call__(self, env, start_response): if self.check_character(req): return HTTPBadRequest( request=req, - body=("Object/Container name contains forbidden chars from %s" + body=("Object/Container/Account name contains 
forbidden " + "chars from %s" % self.forbidden_chars))(env, start_response) elif self.check_length(req): return HTTPBadRequest( request=req, - body=("Object/Container name longer than the allowed maximum " + body=("Object/Container/Account name longer than the " + "allowed maximum " "%s" % self.maximum_length))(env, start_response) elif self.check_regexp(req): return HTTPBadRequest( request=req, - body=("Object/Container name contains a forbidden substring " - "from regular expression %s" + body=("Object/Container/Account name contains a forbidden " + "substring from regular expression %s" % self.forbidden_regexp))(env, start_response) else: # Pass on to downstream WSGI component diff --git a/swift/common/middleware/proxy_logging.py b/swift/common/middleware/proxy_logging.py index a63e10436f..5fc46b9715 100644 --- a/swift/common/middleware/proxy_logging.py +++ b/swift/common/middleware/proxy_logging.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2011 OpenStack, LLC. +# Copyright (c) 2010-2011 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,70 +19,170 @@ This serves as both the default logging implementation and an example of how to plug in your own logging format/method. -The logging format implemented below is as follows: +The logging format implemented below is as follows:: -client_ip remote_addr datetime request_method request_path protocol - status_int referer user_agent auth_token bytes_recvd bytes_sent - client_etag transaction_id headers request_time source + client_ip remote_addr end_time.datetime method path protocol + status_int referer user_agent auth_token bytes_recvd bytes_sent + client_etag transaction_id headers request_time source log_info + start_time end_time policy_index These values are space-separated, and each is url-encoded, so that they can -be separated with a simple .split() +be separated with a simple ``.split()``. -* remote_addr is the contents of the REMOTE_ADDR environment variable, while - client_ip is swift's best guess at the end-user IP, extracted variously - from the X-Forwarded-For header, X-Cluster-Ip header, or the REMOTE_ADDR - environment variable. +* ``remote_addr`` is the contents of the REMOTE_ADDR environment variable, + while ``client_ip`` is swift's best guess at the end-user IP, extracted + variously from the X-Forwarded-For header, X-Cluster-Ip header, or the + REMOTE_ADDR environment variable. + +* ``status_int`` is the integer part of the ``status`` string passed to this + middleware's start_response function, unless the WSGI environment has an item + with key ``swift.proxy_logging_status``, in which case the value of that item + is used. Other middleware's may set ``swift.proxy_logging_status`` to + override the logging of ``status_int``. In either case, the logged + ``status_int`` value is forced to 499 if a client disconnect is detected + while this middleware is handling a request, or 500 if an exception is caught + while handling a request. + +* ``source`` (``swift.source`` in the WSGI environment) indicates the code + that generated the request, such as most middleware. (See below for + more detail.) + +* ``log_info`` (``swift.log_info`` in the WSGI environment) is for additional + information that could prove quite useful, such as any ``x-delete-at`` + value or other "behind the scenes" activity that might not + otherwise be detectable from the plain log information. 
Code that + wishes to add additional log information should use code like + ``env.setdefault('swift.log_info', []).append(your_info)`` so as to + not disturb others' log information. * Values that are missing (e.g. due to a header not being present) or zero are generally represented by a single hyphen ('-'). + +.. note:: + The message format may be configured using the ``log_msg_template`` option, + allowing fields to be added, removed, re-ordered, and even anonymized. For + more information, see https://docs.openstack.org/swift/latest/logs.html + +The proxy-logging can be used twice in the proxy server's pipeline when there +is middleware installed that can return custom responses that don't follow the +standard pipeline to the proxy server. + +For example, with staticweb, the middleware might intercept a request to +/v1/AUTH_acc/cont/, make a subrequest to the proxy to retrieve +/v1/AUTH_acc/cont/index.html and, in effect, respond to the client's original +request using the 2nd request's body. In this instance the subrequest will be +logged by the rightmost middleware (with a ``swift.source`` set) and the +outgoing request (with body overridden) will be logged by leftmost middleware. + +Requests that follow the normal pipeline (use the same wsgi environment +throughout) will not be double logged because an environment variable +(``swift.proxy_access_log_made``) is checked/set when a log is made. + +All middleware making subrequests should take care to set ``swift.source`` when +needed. With the doubled proxy logs, any consumer/processor of swift's proxy +logs should look at the ``swift.source`` field, the rightmost log value, to +decide if this is a middleware subrequest or not. A log processor calculating +bandwidth usage will want to only sum up logs with no ``swift.source``. """ +import os import time -from urllib import quote, unquote +from collections import ChainMap +from swift.common.constraints import valid_api_version +from swift.common.middleware.catch_errors import ByteEnforcer +from swift.common.middleware.s3api.utils import extract_bucket_and_key, \ + is_s3_req +from swift.common.request_helpers import get_log_info from swift.common.swob import Request from swift.common.utils import (get_logger, get_remote_client, - get_valid_utf8_str, config_true_value) + config_true_value, reiterate, + close_if_possible, cap_length, + InputProxy, list_from_csv, + get_policy_index, LogStringFormatter, + split_path, StrAnonymizer, StrFormatTime) +from swift.common.statsd_client import get_labeled_statsd_client + +from swift.common.storage_policy import POLICIES +from swift.common.registry import get_sensitive_headers, \ + get_sensitive_params, register_sensitive_header + + +def statsd_metric_resp_labels(base_labels, status_int=None, policy_index=None): + # compose labels used for response metrics + extra_labels = {} + if policy_index is not None: + extra_labels['policy'] = policy_index + if status_int: + extra_labels['status'] = status_int + labels_source = ChainMap(extra_labels, base_labels) + return labels_source -class InputProxy(object): +class CallbackInputProxy(InputProxy): """ - File-like object that counts bytes read. - To be swapped in for wsgi.input for accounting purposes. + :param wsgi_input: file-like object to be wrapped + :param callback: a function or a callable that + accept args (chunk, eof), + and returns chunk or a modified chunk. + eof is ``True`` if there are no more bytes to + read from the wrapped input, ``False`` otherwise. 
""" - def __init__(self, wsgi_input): - """ - :param wsgi_input: file-like object to wrap the functionality of - """ - self.wsgi_input = wsgi_input - self.bytes_received = 0 - self.client_disconnect = False + def __init__(self, wsgi_input, callback): + super().__init__(wsgi_input) + self.callback = callback - def read(self, *args, **kwargs): - """ - Pass read request to the underlying file-like object and - add bytes read to total. - """ - try: - chunk = self.wsgi_input.read(*args, **kwargs) - except Exception: - self.client_disconnect = True - raise - self.bytes_received += len(chunk) - return chunk + def chunk_update(self, chunk, eof, *arg, **kwargs): + return self.callback(chunk, eof) - def readline(self, *args, **kwargs): - """ - Pass readline request to the underlying file-like object and - add bytes read to total. + +class BufferXferEmitCallback(object): + def __init__(self, metric_name, labels, statsd, + emit_buffer_xfer_bytes_sec): + self.metric_name = metric_name + self.labels = labels + self.statsd = statsd + self.emit_buffer_xfer_bytes_sec = emit_buffer_xfer_bytes_sec + self.emit_bytes = 0 + self.next_emit_time = 0 + if self.emit_buffer_xfer_bytes_sec > 0: + self.next_emit_time = (time.time() + + self.emit_buffer_xfer_bytes_sec) + + def __call__(self, buffer, eof=False): + self._maybe_emit_stat(buffer, eof) + return buffer + + def _maybe_emit_stat(self, buffer, eof=False): """ - try: - line = self.wsgi_input.readline(*args, **kwargs) - except Exception: - self.client_disconnect = True - raise - self.bytes_received += len(line) - return line + Accumulate the length of ``buffer`` and periodically emit a stat + with the accumulated length. + + :param buffer: the buffer that has been read. + :param eof: if True, a stat is emitted immediately; otherwise a + stat will be emitted when ``next_emit_time`` has been reached. + """ + + if self.emit_buffer_xfer_bytes_sec < 0: + return + buffer_len = len(buffer) + self.emit_bytes += buffer_len + if not self.labels.get('account', None): + # tolerate no account, maybe it'll be there in time for next stat + return + + now = time.time() + if eof is False and self.next_emit_time > now: + return + + if self.emit_bytes != 0: + self.statsd.update_stats( + self.metric_name, + self.emit_bytes, + labels=self.labels, + ) + self.emit_bytes = 0 + self.next_emit_time = (now + self.emit_buffer_xfer_bytes_sec) class ProxyLoggingMiddleware(object): @@ -90,9 +190,29 @@ class ProxyLoggingMiddleware(object): Middleware that logs Swift proxy requests in the swift log format. """ - def __init__(self, app, conf): + def __init__(self, app, conf, logger=None): self.app = app - self.log_hdrs = config_true_value(conf.get('log_headers', 'no')) + self.pid = os.getpid() + self.log_formatter = LogStringFormatter(default='-', quote=True) + self.log_msg_template = conf.get( + 'log_msg_template', ( + '{client_ip} {remote_addr} {end_time.datetime} {method} ' + '{path} {protocol} {status_int} {referer} {user_agent} ' + '{auth_token} {bytes_recvd} {bytes_sent} {client_etag} ' + '{transaction_id} {headers} {request_time} {source} ' + '{log_info} {start_time} {end_time} {policy_index} ' + '{access_user_id}')) + # The salt is only used in StrAnonymizer. This class requires bytes, + # convert it now to prevent useless convertion later. 
+ self.anonymization_method = conf.get('log_anonymization_method', 'md5') + self.anonymization_salt = conf.get('log_anonymization_salt', '') + self.storage_domains = list_from_csv(conf.get('storage_domain', '')) + self.log_hdrs = config_true_value(conf.get( + 'access_log_headers', + conf.get('log_headers', 'no'))) + log_hdrs_only = list_from_csv(conf.get( + 'access_log_headers_only', '')) + self.log_hdrs_only = [x.title() for x in log_hdrs_only] # The leading access_* check is in case someone assumes that # log_statsd_valid_http_methods behaves like the other log_statsd_* @@ -100,135 +220,483 @@ def __init__(self, app, conf): self.valid_methods = conf.get( 'access_log_statsd_valid_http_methods', conf.get('log_statsd_valid_http_methods', - 'GET,HEAD,POST,PUT,DELETE,COPY,OPTIONS')) + 'GET,HEAD,POST,PUT,DELETE,COPY,OPTIONS,UPDATE')) self.valid_methods = [m.strip().upper() for m in self.valid_methods.split(',') if m.strip()] + + # Copy supported access_log_* options to the corresponding log_* + # option, possibly overriding the log_* option. Note that this includes + # some statsd options that have access_log_* or log_* prefixes. access_log_conf = {} for key in ('log_facility', 'log_name', 'log_level', 'log_udp_host', 'log_udp_port', 'log_statsd_host', 'log_statsd_port', 'log_statsd_default_sample_rate', + 'log_statsd_sample_rate_factor', 'log_statsd_metric_prefix'): value = conf.get('access_' + key, conf.get(key, None)) if value: access_log_conf[key] = value - self.access_logger = get_logger(access_log_conf, - log_route='proxy-access') - self.access_logger.set_statsd_prefix('proxy-server') + for key, value in conf.items(): + if key.startswith('statsd_'): + access_log_conf[key] = value + self.access_logger = logger or get_logger( + access_log_conf, + log_route=conf.get('access_log_route', 'proxy-access'), + statsd_tail_prefix='proxy-server') + self.statsd = get_labeled_statsd_client( + access_log_conf, self.access_logger.logger) + self.reveal_sensitive_prefix = int( + conf.get('reveal_sensitive_prefix', 16)) + self.check_log_msg_template_validity() + self.emit_buffer_xfer_bytes_sec = float( + conf.get('statsd_emit_buffer_xfer_bytes_seconds', -1)) + + def check_log_msg_template_validity(self): + replacements = { + # Time information + 'end_time': StrFormatTime(1000001), + 'start_time': StrFormatTime(1000000), + # Information worth to anonymize + 'client_ip': StrAnonymizer('1.2.3.4', self.anonymization_method, + self.anonymization_salt), + 'remote_addr': StrAnonymizer('4.3.2.1', self.anonymization_method, + self.anonymization_salt), + 'domain': StrAnonymizer('', self.anonymization_method, + self.anonymization_salt), + 'path': StrAnonymizer('/', self.anonymization_method, + self.anonymization_salt), + 'referer': StrAnonymizer('ref', self.anonymization_method, + self.anonymization_salt), + 'user_agent': StrAnonymizer('swift', self.anonymization_method, + self.anonymization_salt), + 'headers': StrAnonymizer('header', self.anonymization_method, + self.anonymization_salt), + 'client_etag': StrAnonymizer('etag', self.anonymization_method, + self.anonymization_salt), + 'account': StrAnonymizer('a', self.anonymization_method, + self.anonymization_salt), + 'container': StrAnonymizer('c', self.anonymization_method, + self.anonymization_salt), + 'object': StrAnonymizer('', self.anonymization_method, + self.anonymization_salt), + # Others information + 'method': 'GET', + 'protocol': '', + 'status_int': '0', + 'auth_token': '1234...', # nosec B105 + 'bytes_recvd': '1', + 'bytes_sent': '0', + 
'transaction_id': 'tx1234', + 'request_time': '0.05', + 'source': '', + 'log_info': '', + 'policy_index': '', + 'ttfb': '0.05', + 'pid': '42', + 'wire_status_int': '200', + 'access_user_id': StrAnonymizer('AKIAIOSFODNN7EXAMPLE', + self.anonymization_method, + self.anonymization_salt), + } + try: + self.log_formatter.format(self.log_msg_template, **replacements) + except Exception as e: + raise ValueError('Cannot interpolate log_msg_template: %s' % e) + + def method_from_req(self, req): + return req.environ.get('swift.orig_req_method', req.method) + + def req_already_logged(self, env): + return env.get('swift.proxy_access_log_made') + + def mark_req_logged(self, env): + env['swift.proxy_access_log_made'] = True + + def obscure_sensitive(self, value): + return cap_length(value, self.reveal_sensitive_prefix) + + def obscure_req(self, req): + for header in get_sensitive_headers(): + if header in req.headers: + req.headers[header] = \ + self.obscure_sensitive(req.headers[header]) + + obscure_params = get_sensitive_params() + new_params = [] + any_obscured = False + for k, v in req.params.items(): + if k in obscure_params: + new_params.append((k, self.obscure_sensitive(v))) + any_obscured = True + else: + new_params.append((k, v)) + if any_obscured: + req.params = new_params + + def get_access_user_id(self, req): + """ + Get access user ID from request environ. + + :param req: swob.Request object for the request + :returns: User ID for logging if available, None otherwise + """ + return req.environ.get('swift.access_logging', {}).get('user_id') - def log_request(self, env, status_int, bytes_received, bytes_sent, - request_time, client_disconnect): + def log_request(self, req, status_int, bytes_received, bytes_sent, + start_time, end_time, resp_headers=None, ttfb=0, + wire_status_int=None): """ Log a request. - :param env: WSGI environment + :param req: swob.Request object for the request :param status_int: integer code for the response status :param bytes_received: bytes successfully read from the request body :param bytes_sent: bytes yielded to the WSGI server - :param request_time: time taken to satisfy the request, in seconds + :param start_time: timestamp request started + :param end_time: timestamp request completed + :param resp_headers: dict of the response headers + :param ttfb: time to first byte + :param wire_status_int: the on the wire status int """ - req = Request(env) - if client_disconnect: # log disconnected clients as '499' status code - status_int = 499 - req_path = get_valid_utf8_str(req.path) - the_request = quote(unquote(req_path)) - if req.query_string: - the_request = the_request + '?' 
+ req.query_string + self.obscure_req(req) + domain = req.environ.get('HTTP_HOST', + req.environ.get('SERVER_NAME', None)) + if ':' in domain: + domain, port = domain.rsplit(':', 1) + resp_headers = resp_headers or {} logged_headers = None if self.log_hdrs: - logged_headers = '\n'.join('%s: %s' % (k, v) - for k, v in req.headers.items()) - method = req.environ.get('swift.orig_req_method', req.method) - self.access_logger.info(' '.join( - quote(str(x) if x else '-') - for x in ( - get_remote_client(req), - req.remote_addr, - time.strftime('%d/%b/%Y/%H/%M/%S', time.gmtime()), - method, - the_request, + if self.log_hdrs_only: + logged_headers = '\n'.join('%s: %s' % (k, v) + for k, v in req.headers.items() + if k in self.log_hdrs_only) + else: + logged_headers = '\n'.join('%s: %s' % (k, v) + for k, v in req.headers.items()) + + method = self.method_from_req(req) + duration_time_str = "%.4f" % (end_time - start_time) + policy_index = get_policy_index(req.headers, resp_headers) + + swift_path = req.environ.get('swift.backend_path', req.path) + acc, cont, obj = self.get_aco_from_path(swift_path) + + replacements = { + # Time information + 'end_time': StrFormatTime(end_time), + 'start_time': StrFormatTime(start_time), + # Information worth to anonymize + 'client_ip': StrAnonymizer(get_remote_client(req), + self.anonymization_method, + self.anonymization_salt), + 'remote_addr': StrAnonymizer(req.remote_addr, + self.anonymization_method, + self.anonymization_salt), + 'domain': StrAnonymizer(domain, self.anonymization_method, + self.anonymization_salt), + 'path': StrAnonymizer(req.path_qs, self.anonymization_method, + self.anonymization_salt), + 'referer': StrAnonymizer(req.referer, self.anonymization_method, + self.anonymization_salt), + 'user_agent': StrAnonymizer(req.user_agent, + self.anonymization_method, + self.anonymization_salt), + 'headers': StrAnonymizer(logged_headers, self.anonymization_method, + self.anonymization_salt), + 'client_etag': StrAnonymizer(req.headers.get('etag'), + self.anonymization_method, + self.anonymization_salt), + 'account': StrAnonymizer(acc, self.anonymization_method, + self.anonymization_salt), + 'container': StrAnonymizer(cont, self.anonymization_method, + self.anonymization_salt), + 'object': StrAnonymizer(obj, self.anonymization_method, + self.anonymization_salt), + # Others information + 'method': method, + 'protocol': req.environ.get('SERVER_PROTOCOL'), - status_int, - req.referer, - req.user_agent, + 'status_int': status_int, + 'auth_token': req.headers.get('x-auth-token'), - bytes_received, - bytes_sent, - req.headers.get('etag', None), - req.environ.get('swift.trans_id'), - logged_headers, - '%.4f' % request_time, - req.environ.get('swift.source'), - ))) - # Log timing and bytes-transfered data to StatsD - if req.path.startswith('/v1/'): - try: - stat_type = [None, 'account', 'container', - 'object'][req.path.strip('/').count('/')] - except IndexError: - stat_type = 'object' - else: - stat_type = env.get('swift.source') - # Only log data for valid controllers (or SOS) to keep the metric count - # down (egregious errors will get logged by the proxy server itself). 
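# Annotation (not part of the patch): the dotted StatsD names kept by the new
# code below follow the pattern <resource>.<METHOD>.<status>, so a successful
# object GET still emits, for example,
#     object.GET.200.timing
#     object.GET.200.xfer
#     object.policy.0.GET.200.timing    (when a policy index is resolved)
# while the labeled statsd client additionally emits metrics such as
# swift_proxy_server_request_timing carrying resource/method/status labels.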
- if stat_type: - stat_method = method if method in self.valid_methods \ - else 'BAD_METHOD' - metric_name = '.'.join((stat_type, stat_method, str(status_int))) - self.access_logger.timing(metric_name + '.timing', - request_time * 1000) - self.access_logger.update_stats(metric_name + '.xfer', + 'bytes_recvd': bytes_received, + 'bytes_sent': bytes_sent, + 'transaction_id': req.environ.get('swift.trans_id'), + 'request_time': duration_time_str, + 'source': req.environ.get('swift.source'), + 'log_info': get_log_info(req.environ), + 'policy_index': policy_index, + 'ttfb': ttfb, + 'pid': self.pid, + 'wire_status_int': wire_status_int or status_int, + 'access_user_id': StrAnonymizer( + self.get_access_user_id(req), self.anonymization_method, + self.anonymization_salt), + } + self.access_logger.info( + self.log_formatter.format(self.log_msg_template, + **replacements)) + + # Log timing and bytes-transferred data to StatsD + metric_method = self.statsd_metric_method(method) + metric_name = self.statsd_metric_name(req, status_int, metric_method) + metric_name_policy = self.statsd_metric_name_policy( + req, status_int, metric_method, policy_index) + + self.access_logger.timing(metric_name + '.timing', + (end_time - start_time) * 1000) + self.access_logger.update_stats(metric_name + '.xfer', + bytes_received + bytes_sent) + if metric_name_policy: + self.access_logger.timing(metric_name_policy + '.timing', + (end_time - start_time) * 1000) + self.access_logger.update_stats(metric_name_policy + '.xfer', bytes_received + bytes_sent) + labels = self.statsd_metric_labels( + req, status_int, metric_method, + acc=acc, cont=cont, policy_index=policy_index) + self.statsd.timing( + 'swift_proxy_server_request_timing', + (end_time - start_time) * 1000, + labels=labels, + ) + self.statsd.update_stats( + 'swift_proxy_server_request_body_bytes', + bytes_received, + labels=labels, + ) + self.statsd.update_stats( + 'swift_proxy_server_response_body_bytes', + bytes_sent, + labels=labels, + ) + + def get_aco_from_path(self, swift_path): + try: + version, acc, cont, obj = split_path(swift_path, 1, 4, True) + if not valid_api_version(version): + raise ValueError + except ValueError: + acc, cont, obj = None, None, None + return acc, cont, obj + + def get_resource_type_from_aco(self, req, acc, cont, obj): + if obj: + return 'object' + if cont: + return 'container' + if acc: + return 'account' + return req.environ.get('swift.source') or 'UNKNOWN' + + def get_resource_type(self, req): + swift_path = req.environ.get('swift.backend_path', req.path) + acc, cont, obj = self.get_aco_from_path(swift_path) + return self.get_resource_type_from_aco(req, acc, cont, obj) + + def statsd_metric_method(self, method): + return method if method in self.valid_methods else 'BAD_METHOD' + + def statsd_metric_name(self, req, status_int, metric_method): + resource_type = self.get_resource_type(req) + return '.'.join((resource_type, metric_method, str(status_int))) + + def update_swift_base_labels(self, req): + acc, cont, obj = self.get_aco_from_path(req.path) + base_labels = req.environ.get('swift.base_labels') + if base_labels is None: + # expected in the left-most proxy_logging instance + if acc is None and is_s3_req(req): + cont, obj = extract_bucket_and_key( + req, self.storage_domains, False) + + method = self.method_from_req(req) + metric_method = self.statsd_metric_method(method) + resource_type = self.get_resource_type_from_aco( + req, acc, cont, obj) + base_labels = { + 'method': metric_method, + } + base_labels['api'] = 'S3' if 
is_s3_req(req) else 'swift' + if resource_type != 'UNKNOWN' or not is_s3_req(req): + base_labels['resource'] = resource_type + if acc: + base_labels['account'] = acc + if cont: + base_labels['container'] = cont + req.environ['swift.base_labels'] = base_labels + elif acc: + # expected in the right-most proxy_logging instance + resource_type = self.get_resource_type_from_aco( + req, acc, cont, obj) + base_labels.setdefault('account', acc) + base_labels.setdefault('resource', resource_type) + + def statsd_metric_name_policy(self, req, status_int, metric_method, + policy_index): + if policy_index is None: + return None + resource_type = self.get_resource_type(req) + if resource_type == 'object': + # The policy may not exist + policy = POLICIES.get_by_index(policy_index) + if policy: + return '.'.join((resource_type, 'policy', str(policy_index), + metric_method, str(status_int))) + else: + return None + else: + return None + + def statsd_metric_labels(self, req, status_int, metric_method, acc=None, + cont=None, policy_index=None): + # overlay freshly derived labels onto base_labels just in case any + # changed w.r.t. base labels while the request was being handled (in + # particular, container may be different in swift.backend_path) + # TODO: remove unnecessary duplication in the overlay e.g. method, + # account + resource_type = self.get_resource_type(req) + + labels = { + 'resource': resource_type, + 'method': metric_method, + 'status': status_int, + } + if acc: + labels['account'] = acc + if cont: + labels['container'] = cont + if resource_type == 'object' and \ + policy_index is not None and \ + POLICIES.get_by_index(policy_index) is not None: + labels['policy'] = policy_index + return ChainMap(labels, req.environ['swift.base_labels']) + def __call__(self, env, start_response): + req = Request(env) + self.update_swift_base_labels(req) + + if self.req_already_logged(env): + return self.app(env, start_response) + + self.mark_req_logged(env) + start_response_args = [None] - input_proxy = InputProxy(env['wsgi.input']) + + xfer_metric_name = 'swift_proxy_server_request_body_streaming_bytes' + base_labels = req.environ.get('swift.base_labels') + + statsd_emit_callback = BufferXferEmitCallback( + xfer_metric_name, base_labels, self.statsd, + self.emit_buffer_xfer_bytes_sec) + input_proxy = CallbackInputProxy(env['wsgi.input'], + statsd_emit_callback) env['wsgi.input'] = input_proxy start_time = time.time() def my_start_response(status, headers, exc_info=None): start_response_args[0] = (status, list(headers), exc_info) + def status_int_for_logging(): + # log disconnected clients as '499' status code + if input_proxy.client_disconnect: + return 499 + return env.get('swift.proxy_logging_status') + def iter_response(iterable): - iterator = iter(iterable) - try: - chunk = iterator.next() - while not chunk: - chunk = iterator.next() - except StopIteration: - chunk = '' + iterator = reiterate(iterable) + content_length = None for h, v in start_response_args[0][1]: - if h.lower() in ('content-length', 'transfer-encoding'): + if h.lower() == 'content-length': + content_length = int(v) + break + elif h.lower() == 'transfer-encoding': break else: - if not chunk: - start_response_args[0][1].append(('content-length', '0')) - elif isinstance(iterable, list): + if isinstance(iterator, list): + content_length = sum(len(i) for i in iterator) start_response_args[0][1].append( - ('content-length', str(sum(len(i) for i in iterable)))) + ('Content-Length', str(content_length))) + + method = self.method_from_req(req) + 
if method == 'HEAD': + content_length = 0 + if content_length is not None: + iterator = ByteEnforcer(iterator, content_length) + + wire_status_int = int(start_response_args[0][0].split(' ', 1)[0]) + resp_headers = dict(start_response_args[0][1]) start_response(*start_response_args[0]) + + policy_index = get_policy_index(req.headers, resp_headers) + + # Log timing information for time-to-first-byte (GET requests only) + ttfb = 0.0 + if method == 'GET': + swift_path = req.environ.get('swift.backend_path', req.path) + acc, cont, _ = self.get_aco_from_path(swift_path) + labels = self.statsd_metric_labels( + req, wire_status_int, method, + acc=acc, cont=cont, policy_index=policy_index) + metric_name = self.statsd_metric_name( + req, wire_status_int, method) + metric_name_policy = self.statsd_metric_name_policy( + req, wire_status_int, method, policy_index) + + ttfb = time.time() - start_time + if metric_name: + self.access_logger.timing( + metric_name + '.first-byte.timing', ttfb * 1000) + if metric_name_policy: + self.access_logger.timing( + metric_name_policy + '.first-byte.timing', ttfb * 1000) + + self.statsd.timing( + 'swift_proxy_server_request_ttfb', + ttfb * 1000, + labels=labels, + ) + + resp_xfer_labels = statsd_metric_resp_labels( + base_labels, status_int=wire_status_int, + policy_index=policy_index) + bytes_sent = 0 - client_disconnect = False + statsd_emit_callback = BufferXferEmitCallback( + 'swift_proxy_server_response_body_streaming_bytes', + resp_xfer_labels, + self.statsd, self.emit_buffer_xfer_bytes_sec) try: - while chunk: + for chunk in iterator: bytes_sent += len(chunk) + statsd_emit_callback(chunk) yield chunk - chunk = iterator.next() except GeneratorExit: # generator was closed before we finished - client_disconnect = True + env['swift.proxy_logging_status'] = 499 + raise + except Exception: + env['swift.proxy_logging_status'] = 500 raise finally: - status_int = int(start_response_args[0][0].split(' ', 1)[0]) + statsd_emit_callback(b'', eof=True) + env.setdefault('swift.proxy_logging_status', wire_status_int) + status_int = status_int_for_logging() self.log_request( - env, status_int, input_proxy.bytes_received, bytes_sent, - time.time() - start_time, - client_disconnect or input_proxy.client_disconnect) + req, status_int, input_proxy.bytes_received, bytes_sent, + start_time, time.time(), resp_headers=resp_headers, + ttfb=ttfb, wire_status_int=wire_status_int) + close_if_possible(iterator) try: iterable = self.app(env, my_start_response) except Exception: + req = Request(env) + env['swift.proxy_logging_status'] = 500 + status_int = status_int_for_logging() self.log_request( - env, 500, input_proxy.bytes_received, 0, - time.time() - start_time, input_proxy.client_disconnect) + req, status_int, input_proxy.bytes_received, 0, start_time, + time.time()) raise else: return iter_response(iterable) @@ -238,6 +706,12 @@ def filter_factory(global_conf, **local_conf): conf = global_conf.copy() conf.update(local_conf) + # Normally it would be the middleware that uses the header that + # would register it, but because there could be 3rd party auth middlewares + # that use 'x-auth-token' or 'x-storage-token' we special case it here. 
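# Annotation (not part of the patch; example token is made up): headers
# registered as sensitive are not dropped from the access log, they are
# truncated by obscure_req() to reveal_sensitive_prefix characters (16 by
# default), so a value such as
#     X-Auth-Token: AUTH_tk0123456789abcdef0123456789abcdef
# is logged as something like
#     X-Auth-Token: AUTH_tk012345678...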
+ register_sensitive_header('x-auth-token') + register_sensitive_header('x-storage-token') + def proxy_logger(app): return ProxyLoggingMiddleware(app, conf) return proxy_logger diff --git a/swift/common/middleware/ratelimit.py b/swift/common/middleware/ratelimit.py index cee3e0637d..365f2dfcf2 100644 --- a/swift/common/middleware/ratelimit.py +++ b/swift/common/middleware/ratelimit.py @@ -1,3 +1,4 @@ +# Copyright (c) 2010-2013 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +13,74 @@ # See the License for the specific language governing permissions and # limitations under the License. import time + import eventlet -from swift.common.utils import split_path, cache_from_env, get_logger -from swift.proxy.controllers.base import get_container_memcache_key +from swift.common.utils import cache_from_env, get_logger +from swift.common.registry import register_swift_info +from swift.proxy.controllers.base import get_account_info, get_container_info +from swift.common.constraints import valid_api_version from swift.common.memcached import MemcacheConnectionError from swift.common.swob import Request, Response +def interpret_conf_limits(conf, name_prefix, info=None): + """ + Parses general parms for rate limits looking for things that + start with the provided name_prefix within the provided conf + and returns lists for both internal use and for /info + + :param conf: conf dict to parse + :param name_prefix: prefix of config parms to look for + :param info: set to return extra stuff for /info registration + """ + conf_limits = [] + for conf_key in conf: + if conf_key.startswith(name_prefix): + cont_size = int(conf_key[len(name_prefix):]) + rate = float(conf[conf_key]) + conf_limits.append((cont_size, rate)) + + conf_limits.sort() + ratelimits = [] + conf_limits_info = list(conf_limits) + while conf_limits: + cur_size, cur_rate = conf_limits.pop(0) + if conf_limits: + next_size, next_rate = conf_limits[0] + slope = (float(next_rate) - float(cur_rate)) \ + / (next_size - cur_size) + + def new_scope(cur_size, slope, cur_rate): + # making new scope for variables + return lambda x: (x - cur_size) * slope + cur_rate + line_func = new_scope(cur_size, slope, cur_rate) + else: + line_func = lambda x: cur_rate + + ratelimits.append((cur_size, cur_rate, line_func)) + if info is None: + return ratelimits + else: + return ratelimits, conf_limits_info + + +def get_maxrate(ratelimits, size): + """ + Returns number of requests allowed per second for given size. 
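# Annotation (not part of the patch): interpret_conf_limits()/get_maxrate()
# interpolate linearly between the configured sizes. For example, with
#     container_ratelimit_100 = 100
#     container_ratelimit_200 = 50
# a container holding 150 objects is limited to roughly 75 write requests per
# second, containers below 100 objects are not limited at all, and containers
# at or above 200 objects are held to 50 requests per second.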
+ """ + last_func = None + if size: + size = int(size) + for ratesize, rate, func in ratelimits: + if size < ratesize: + break + last_func = func + if last_func: + return last_func(size) + return None + + class MaxSleepTimeHitError(Exception): pass @@ -35,11 +96,10 @@ class RateLimitMiddleware(object): BLACK_LIST_SLEEP = 1 def __init__(self, app, conf, logger=None): + self.app = app - if logger: - self.logger = logger - else: - self.logger = get_logger(conf, log_route='ratelimit') + self.logger = logger or get_logger(conf, log_route='ratelimit') + self.memcache_client = None self.account_ratelimit = float(conf.get('account_ratelimit', 0)) self.max_sleep_time_seconds = \ float(conf.get('max_sleep_time_seconds', 60)) @@ -50,94 +110,98 @@ def __init__(self, app, conf, logger=None): self.ratelimit_whitelist = \ [acc.strip() for acc in conf.get('account_whitelist', '').split(',') if acc.strip()] + if self.ratelimit_whitelist: + self.logger.warning('Option account_whitelist is deprecated. Use ' + 'an internal client to POST a `X-Account-' + 'Sysmeta-Global-Write-Ratelimit: WHITELIST` ' + 'header to the specific accounts instead.') self.ratelimit_blacklist = \ [acc.strip() for acc in conf.get('account_blacklist', '').split(',') if acc.strip()] - self.memcache_client = None - conf_limits = [] - for conf_key in conf.keys(): - if conf_key.startswith('container_ratelimit_'): - cont_size = int(conf_key[len('container_ratelimit_'):]) - rate = float(conf[conf_key]) - conf_limits.append((cont_size, rate)) - - conf_limits.sort() - self.container_ratelimits = [] - while conf_limits: - cur_size, cur_rate = conf_limits.pop(0) - if conf_limits: - next_size, next_rate = conf_limits[0] - slope = (float(next_rate) - float(cur_rate)) \ - / (next_size - cur_size) - - def new_scope(cur_size, slope, cur_rate): - # making new scope for variables - return lambda x: (x - cur_size) * slope + cur_rate - line_func = new_scope(cur_size, slope, cur_rate) - else: - line_func = lambda x: cur_rate - - self.container_ratelimits.append((cur_size, cur_rate, line_func)) + if self.ratelimit_blacklist: + self.logger.warning('Option account_blacklist is deprecated. Use ' + 'an internal client to POST a `X-Account-' + 'Sysmeta-Global-Write-Ratelimit: BLACKLIST` ' + 'header to the specific accounts instead.') + self.container_ratelimits = interpret_conf_limits( + conf, 'container_ratelimit_') + self.container_listing_ratelimits = interpret_conf_limits( + conf, 'container_listing_ratelimit_') - def get_container_maxrate(self, container_size): - """ - Returns number of requests allowed per second for given container size. - """ - last_func = None - if container_size: - container_size = int(container_size) - for size, rate, func in self.container_ratelimits: - if container_size < size: - break - last_func = func - if last_func: - return last_func(container_size) - return None + def get_container_size(self, env): + rv = 0 + container_info = get_container_info( + env, self.app, swift_source='RL') + if isinstance(container_info, dict): + rv = container_info.get( + 'object_count', container_info.get('container_size', 0)) + return rv - def get_ratelimitable_key_tuples(self, req_method, account_name, - container_name=None, obj_name=None): + def get_ratelimitable_key_tuples(self, req, account_name, + container_name=None, obj_name=None, + global_ratelimit=None): """ Returns a list of key (used in memcache), ratelimit tuples. Keys should be checked in order. 
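# Annotation (not part of the patch): the memcache keys built below look like
#     ratelimit/<account>                      (account-wide write limit)
#     ratelimit/<account>/<container>          (per-container write limit)
#     ratelimit_listing/<account>/<container>  (container listing GET limit)
#     ratelimit/global-write/<account>         (sysmeta-driven account limit)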
- :param req_method: HTTP method + :param req: swob request :param account_name: account name from path :param container_name: container name from path :param obj_name: object name from path + :param global_ratelimit: this account has an account wide + ratelimit on all writes combined """ keys = [] # COPYs are not limited + if self.account_ratelimit and \ account_name and container_name and not obj_name and \ - req_method in ('PUT', 'DELETE'): + req.method in ('PUT', 'DELETE'): keys.append(("ratelimit/%s" % account_name, self.account_ratelimit)) if account_name and container_name and obj_name and \ - req_method in ('PUT', 'DELETE', 'POST'): - container_size = None - memcache_key = get_container_memcache_key(account_name, - container_name) - container_info = self.memcache_client.get(memcache_key) - if isinstance(container_info, dict): - container_size = container_info.get( - 'count', container_info.get('container_size', 0)) - container_rate = self.get_container_maxrate(container_size) - if container_rate: - keys.append(("ratelimit/%s/%s" % (account_name, - container_name), - container_rate)) + req.method in ('PUT', 'DELETE', 'POST', 'COPY'): + container_size = self.get_container_size(req.environ) + container_rate = get_maxrate( + self.container_ratelimits, container_size) + if container_rate: + keys.append(( + "ratelimit/%s/%s" % (account_name, container_name), + container_rate)) + + if account_name and container_name and not obj_name and \ + req.method == 'GET': + container_size = self.get_container_size(req.environ) + container_rate = get_maxrate( + self.container_listing_ratelimits, container_size) + if container_rate: + keys.append(( + "ratelimit_listing/%s/%s" % (account_name, container_name), + container_rate)) + + if account_name and req.method in ('PUT', 'DELETE', 'POST', 'COPY'): + if global_ratelimit: + try: + global_ratelimit = float(global_ratelimit) + if global_ratelimit > 0: + keys.append(( + "ratelimit/global-write/%s" % account_name, + global_ratelimit)) + except ValueError: + pass + return keys def _get_sleep_time(self, key, max_rate): - ''' + """ Returns the amount of time (a float in seconds) that the app should sleep. :param key: a memcache key :param max_rate: maximum rate allowed in requests per second - :raises: MaxSleepTimeHitError if max sleep time is exceeded. - ''' + :raises MaxSleepTimeHitError: if max sleep time is exceeded. + """ try: now_m = int(round(time.time() * self.clock_accuracy)) time_per_request_m = int(round(self.clock_accuracy / max_rate)) @@ -166,43 +230,66 @@ def _get_sleep_time(self, key, max_rate): return 0 def handle_ratelimit(self, req, account_name, container_name, obj_name): - ''' + """ Performs rate limiting and account white/black listing. Sleeps - if necessary. + if necessary. If self.memcache_client is not set, immediately returns + None. 
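# Annotation (not part of the patch): the account-wide override consulted
# below lives in account sysmeta and is normally set with an internal client,
# e.g. a POST to the account carrying one of
#     X-Account-Sysmeta-Global-Write-Ratelimit: WHITELIST   (never limited)
#     X-Account-Sysmeta-Global-Write-Ratelimit: BLACKLIST   (always 497)
#     X-Account-Sysmeta-Global-Write-Ratelimit: 10.0        (cap combined
#                                                            writes to ~10/sec)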
:param account_name: account name from path :param container_name: container name from path :param obj_name: object name from path - ''' - if account_name in self.ratelimit_blacklist: - self.logger.error(_('Returning 497 because of blacklisting: %s'), + """ + if not self.memcache_client: + return None + + if req.environ.get('swift.ratelimit.handled'): + return None + req.environ['swift.ratelimit.handled'] = True + + try: + account_info = get_account_info(req.environ, self.app, + swift_source='RL') + account_global_ratelimit = \ + account_info.get('sysmeta', {}).get('global-write-ratelimit') + except ValueError: + account_global_ratelimit = None + + if account_name in self.ratelimit_whitelist or \ + account_global_ratelimit == 'WHITELIST': + return None + + if account_name in self.ratelimit_blacklist or \ + account_global_ratelimit == 'BLACKLIST': + self.logger.error('Returning 497 because of blacklisting: %s', account_name) eventlet.sleep(self.BLACK_LIST_SLEEP) return Response(status='497 Blacklisted', body='Your account has been blacklisted', request=req) - if account_name in self.ratelimit_whitelist: - return None + for key, max_rate in self.get_ratelimitable_key_tuples( - req.method, account_name, container_name=container_name, - obj_name=obj_name): + req, account_name, container_name=container_name, + obj_name=obj_name, global_ratelimit=account_global_ratelimit): try: need_to_sleep = self._get_sleep_time(key, max_rate) if self.log_sleep_time_seconds and \ need_to_sleep > self.log_sleep_time_seconds: self.logger.warning( - _("Ratelimit sleep log: %(sleep)s for " - "%(account)s/%(container)s/%(object)s"), + "Ratelimit sleep log: %(sleep)s for " + "%(account)s/%(container)s/%(object)s", {'sleep': need_to_sleep, 'account': account_name, 'container': container_name, 'object': obj_name}) if need_to_sleep > 0: eventlet.sleep(need_to_sleep) - except MaxSleepTimeHitError, e: + except MaxSleepTimeHitError as e: + if obj_name: + path = '/'.join((account_name, container_name, obj_name)) + else: + path = '/'.join((account_name, container_name)) self.logger.error( - _('Returning 498 for %(meth)s to %(acc)s/%(cont)s/%(obj)s ' - '. Ratelimit (Max Sleep) %(e)s'), - {'meth': req.method, 'acc': account_name, - 'cont': container_name, 'obj': obj_name, 'e': str(e)}) + 'Returning 498 for %(meth)s to %(path)s. 
' + 'Ratelimit (Max Sleep) %(e)s', + {'meth': req.method, 'path': path, 'e': str(e)}) error_resp = Response(status='498 Rate Limited', body='Slow down', request=req) return error_resp @@ -221,12 +308,14 @@ def __call__(self, env, start_response): self.memcache_client = cache_from_env(env) if not self.memcache_client: self.logger.warning( - _('Warning: Cannot ratelimit without a memcached client')) + 'Cannot ratelimit without a memcached client') return self.app(env, start_response) try: - version, account, container, obj = split_path(req.path, 1, 4, True) + version, account, container, obj = req.split_path(1, 4, True) except ValueError: return self.app(env, start_response) + if not valid_api_version(version): + return self.app(env, start_response) ratelimit_resp = self.handle_ratelimit(req, account, container, obj) if ratelimit_resp is None: return self.app(env, start_response) @@ -241,6 +330,20 @@ def filter_factory(global_conf, **local_conf): conf = global_conf.copy() conf.update(local_conf) + account_ratelimit = float(conf.get('account_ratelimit', 0)) + max_sleep_time_seconds = float(conf.get('max_sleep_time_seconds', 60)) + container_ratelimits, cont_limit_info = interpret_conf_limits( + conf, 'container_ratelimit_', info=1) + container_listing_ratelimits, cont_list_limit_info = \ + interpret_conf_limits(conf, 'container_listing_ratelimit_', info=1) + # not all limits are exposed (intentionally) + register_swift_info('ratelimit', + account_ratelimit=account_ratelimit, + max_sleep_time_seconds=max_sleep_time_seconds, + container_ratelimits=cont_limit_info, + container_listing_ratelimits=cont_list_limit_info) + def limit_filter(app): return RateLimitMiddleware(app, conf) + return limit_filter diff --git a/swift/common/middleware/read_only.py b/swift/common/middleware/read_only.py new file mode 100644 index 0000000000..b905ff1c97 --- /dev/null +++ b/swift/common/middleware/read_only.py @@ -0,0 +1,125 @@ +# Copyright (c) 2010-2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from swift.common.constraints import check_account_format, valid_api_version +from swift.common.swob import HTTPMethodNotAllowed, Request +from swift.common.utils import get_logger, config_true_value +from swift.common.registry import register_swift_info +from swift.proxy.controllers.base import get_info + +""" +========= +Read Only +========= + +The ability to make an entire cluster or individual accounts read only is +implemented as pluggable middleware. When a cluster or an account is in read +only mode, requests that would result in writes to the cluser are not allowed. +A 405 is returned on such requests. "COPY", "DELETE", "POST", and +"PUT" are the HTTP methods that are considered writes. + +------------- +Configuration +------------- + +All configuration is optional. 
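Annotation, not part of the patch: a minimal proxy-server.conf sketch for the
options in the table below, assuming the middleware is registered under the
usual ``egg:swift#read_only`` entry point (pipeline abbreviated)::

    [pipeline:main]
    pipeline = catch_errors proxy-logging cache read_only proxy-server

    [filter:read_only]
    use = egg:swift#read_only
    read_only = true
    allow_deletes = false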
+ +============= ======= ==================================================== +Option Default Description +------------- ------- ---------------------------------------------------- +read_only false Set to 'true' to put the entire cluster in read only + mode. +allow_deletes false Set to 'true' to allow deletes. +============= ======= ==================================================== + +--------------------------- +Marking Individual Accounts +--------------------------- + +If a system administrator wants to mark individual accounts as read only, +he/she can set X-Account-Sysmeta-Read-Only on an account to 'true'. + +If a system administrator wants to allow writes to individual accounts, +when a cluster is in read only mode, he/she can set +X-Account-Sysmeta-Read-Only on an account to 'false'. + +This header will be hidden from the user, because of the gatekeeper middleware, +and can only be set using a direct client to the account nodes. +""" + + +class ReadOnlyMiddleware(object): + """ + Middleware that make an entire cluster or individual accounts read only. + """ + + def __init__(self, app, conf, logger=None): + self.app = app + self.logger = logger or get_logger(conf, log_route='read_only') + self.read_only = config_true_value(conf.get('read_only')) + self.write_methods = {'COPY', 'POST', 'PUT'} + if not config_true_value(conf.get('allow_deletes')): + self.write_methods.add('DELETE') + + def __call__(self, env, start_response): + req = Request(env) + + if req.method not in self.write_methods: + return self.app(env, start_response) + + try: + version, account, container, obj = req.split_path(2, 4, True) + if not valid_api_version(version): + raise ValueError + except ValueError: + return self.app(env, start_response) + + if req.method == 'COPY' and 'Destination-Account' in req.headers: + dest_account = req.headers.get('Destination-Account') + account = check_account_format(req, dest_account) + + if self.account_read_only(req, account): + msg = 'Writes are disabled for this account.' + return HTTPMethodNotAllowed(body=msg)(env, start_response) + + return self.app(env, start_response) + + def account_read_only(self, req, account): + """ + Check whether an account should be read-only. + + This considers both the cluster-wide config value as well as the + per-account override in X-Account-Sysmeta-Read-Only. + """ + info = get_info(self.app, req.environ, account, swift_source='RO') + read_only = info.get('sysmeta', {}).get('read-only', '') + if not read_only: + return self.read_only + return config_true_value(read_only) + + +def filter_factory(global_conf, **local_conf): + """ + paste.deploy app factory for creating WSGI proxy apps. + """ + conf = global_conf.copy() + conf.update(local_conf) + + if config_true_value(conf.get('read_only')): + register_swift_info('read_only') + + def read_only_filter(app): + return ReadOnlyMiddleware(app, conf) + + return read_only_filter diff --git a/swift/common/middleware/recon.py b/swift/common/middleware/recon.py index 4bcb8f6db6..0a6d35d07c 100644 --- a/swift/common/middleware/recon.py +++ b/swift/common/middleware/recon.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,19 @@ # limitations under the License. 
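# Annotation (not part of the patch; the sample cache contents are made up):
# the recon data served by this middleware comes from small JSON cache files
# that the background daemons drop under recon_cache_path (/var/cache/swift
# by default). For instance, if object.recon contains {"async_pending": 3},
# then
#     self._from_recon_cache(['async_pending', 'async_pending_last'],
#                            self.object_recon_cache)
# as used by get_async_info() below returns
#     {'async_pending': 3, 'async_pending_last': None}
# -- keys missing from the cache file simply come back as None.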
import errno +import json import os +import time +from resource import getpagesize -from swift.common.swob import Request, Response -from swift.common.utils import split_path, get_logger, config_true_value +from swift import __version__ as swiftver from swift.common.constraints import check_mount -from resource import getpagesize -from hashlib import md5 -try: - import simplejson as json -except ImportError: - import json +from swift.common.storage_policy import POLICIES +from swift.common.swob import Request, Response +from swift.common.utils import get_logger, SWIFT_CONF_FILE, md5_hash_for_file +from swift.common.recon import RECON_OBJECT_FILE, RECON_CONTAINER_FILE, \ + RECON_ACCOUNT_FILE, RECON_DRIVE_FILE, RECON_RELINKER_FILE, \ + DEFAULT_RECON_CACHE_PATH class ReconMiddleware(object): @@ -33,8 +35,8 @@ class ReconMiddleware(object): /recon/load|mem|async... will return various system metrics. - Needs to be added to the pipeline and a requires a filter - declaration in the object-server.conf: + Needs to be added to the pipeline and requires a filter + declaration in the [account|container|object]-server conf file: [filter:recon] use = egg:swift#recon @@ -43,44 +45,61 @@ class ReconMiddleware(object): def __init__(self, app, conf, *args, **kwargs): self.app = app - self.devices = conf.get('devices', '/srv/node/') + self.devices = conf.get('devices', '/srv/node') swift_dir = conf.get('swift_dir', '/etc/swift') self.logger = get_logger(conf, log_route='recon') self.recon_cache_path = conf.get('recon_cache_path', - '/var/cache/swift') + DEFAULT_RECON_CACHE_PATH) self.object_recon_cache = os.path.join(self.recon_cache_path, - 'object.recon') + RECON_OBJECT_FILE) self.container_recon_cache = os.path.join(self.recon_cache_path, - 'container.recon') + RECON_CONTAINER_FILE) self.account_recon_cache = os.path.join(self.recon_cache_path, - 'account.recon') + RECON_ACCOUNT_FILE) + self.drive_recon_cache = os.path.join(self.recon_cache_path, + RECON_DRIVE_FILE) + self.relink_recon_cache = os.path.join(self.recon_cache_path, + RECON_RELINKER_FILE) self.account_ring_path = os.path.join(swift_dir, 'account.ring.gz') self.container_ring_path = os.path.join(swift_dir, 'container.ring.gz') - self.object_ring_path = os.path.join(swift_dir, 'object.ring.gz') - self.rings = [self.account_ring_path, self.container_ring_path, - self.object_ring_path] - self.mount_check = config_true_value(conf.get('mount_check', 'true')) - def _from_recon_cache(self, cache_keys, cache_file, openr=open): + self.rings = [self.account_ring_path, self.container_ring_path] + # include all object ring files (for all policies) + for policy in POLICIES: + self.rings.append(os.path.join(swift_dir, + policy.ring_name + '.ring.gz')) + + def _from_recon_cache(self, cache_keys, cache_file, openr=open, + ignore_missing=False): """retrieve values from a recon cache file :params cache_keys: list of cache items to retrieve :params cache_file: cache file to retrieve items from. :params openr: open to use [for unittests] - :return: dict of cache items and their value or none if not found + :params ignore_missing: Some recon stats are very temporary, in this + case it would be better to not log if things are missing. 
+ :return: dict of cache items and their values or none if not found """ try: with openr(cache_file, 'r') as f: recondata = json.load(f) - return dict((key, recondata.get(key)) for key in cache_keys) - except IOError: - self.logger.exception(_('Error reading recon cache file')) + return {key: recondata.get(key) for key in cache_keys} + except IOError as err: + if err.errno == errno.ENOENT and ignore_missing: + pass + else: + self.logger.exception('Error reading recon cache file') except ValueError: - self.logger.exception(_('Error parsing recon cache file')) + self.logger.exception('Error parsing recon cache file') except Exception: - self.logger.exception(_('Error retrieving recon data')) + self.logger.exception('Error retrieving recon data') return dict((key, None) for key in cache_keys) + def get_version(self): + """get swift version""" + verinfo = {'version': swiftver} + return verinfo + def get_mounted(self, openr=open): """get ALL mounted fs from /proc/mounts""" mounts = [] @@ -115,31 +134,53 @@ def get_mem(self, openr=open): def get_async_info(self): """get # of async pendings""" - return self._from_recon_cache(['async_pending'], + return self._from_recon_cache(['async_pending', 'async_pending_last'], self.object_recon_cache) + def get_driveaudit_error(self): + """get # of drive audit errors""" + return self._from_recon_cache(['drive_audit_errors'], + self.drive_recon_cache) + + def get_sharding_info(self): + """get sharding info""" + return self._from_recon_cache(["sharding_stats", + "sharding_time", + "sharding_last"], + self.container_recon_cache) + def get_replication_info(self, recon_type): """get replication info""" + replication_list = ['replication_time', + 'replication_stats', + 'replication_last'] if recon_type == 'account': - return self._from_recon_cache(['replication_time', - 'replication_stats'], + return self._from_recon_cache(replication_list, self.account_recon_cache) elif recon_type == 'container': - return self._from_recon_cache(['replication_time', - 'replication_stats'], + return self._from_recon_cache(replication_list, self.container_recon_cache) elif recon_type == 'object': - return self._from_recon_cache(['object_replication_time'], + replication_list += ['object_replication_time', + 'object_replication_last'] + return self._from_recon_cache(replication_list, self.object_recon_cache) else: return None + def get_reconstruction_info(self): + """get reconstruction info""" + reconstruction_list = ['object_reconstruction_last', + 'object_reconstruction_time'] + return self._from_recon_cache(reconstruction_list, + self.object_recon_cache) + def get_device_info(self): """get devices""" try: return {self.devices: os.listdir(self.devices)} except Exception: - self.logger.exception(_('Error listing devices')) + self.logger.exception('Error listing devices') return {self.devices: None} def get_updater_info(self, recon_type): @@ -148,7 +189,9 @@ def get_updater_info(self, recon_type): return self._from_recon_cache(['container_updater_sweep'], self.container_recon_cache) elif recon_type == 'object': - return self._from_recon_cache(['object_updater_sweep'], + return self._from_recon_cache(['object_updater_sweep', + 'object_updater_stats', + 'object_updater_last'], self.object_recon_cache) else: return None @@ -185,17 +228,36 @@ def get_unmounted(self): """list unmounted (failed?) 
devices""" mountlist = [] for entry in os.listdir(self.devices): - mpoint = {'device': entry, - 'mounted': check_mount(self.devices, entry)} - if not mpoint['mounted']: - mountlist.append(mpoint) + if not os.path.isdir(os.path.join(self.devices, entry)): + continue + + try: + check_mount(self.devices, entry) + except OSError as err: + mounted = str(err) + except ValueError: + mounted = False + else: + continue + mountlist.append({'device': entry, 'mounted': mounted}) return mountlist def get_diskusage(self): """get disk utilization statistics""" devices = [] for entry in os.listdir(self.devices): - if check_mount(self.devices, entry): + if not os.path.isdir(os.path.join(self.devices, entry)): + continue + + try: + check_mount(self.devices, entry) + except OSError as err: + devices.append({'device': entry, 'mounted': str(err), + 'size': '', 'used': '', 'avail': ''}) + except ValueError: + devices.append({'device': entry, 'mounted': False, + 'size': '', 'used': '', 'avail': ''}) + else: path = os.path.join(self.devices, entry) disk = os.statvfs(path) capacity = disk.f_bsize * disk.f_blocks @@ -204,41 +266,55 @@ def get_diskusage(self): devices.append({'device': entry, 'mounted': True, 'size': capacity, 'used': used, 'avail': available}) - else: - devices.append({'device': entry, 'mounted': False, - 'size': '', 'used': '', 'avail': ''}) return devices - def get_ring_md5(self, openr=open): + def get_ring_md5(self): """get all ring md5sum's""" sums = {} for ringfile in self.rings: - md5sum = md5() if os.path.exists(ringfile): try: - with openr(ringfile, 'rb') as f: - block = f.read(4096) - while block: - md5sum.update(block) - block = f.read(4096) - sums[ringfile] = md5sum.hexdigest() - except IOError, err: + sums[ringfile] = md5_hash_for_file(ringfile) + except IOError as err: sums[ringfile] = None if err.errno != errno.ENOENT: - self.logger.exception(_('Error reading ringfile')) + self.logger.exception('Error reading ringfile') return sums + def get_swift_conf_md5(self): + """get md5 of swift.conf""" + hexsum = None + try: + hexsum = md5_hash_for_file(SWIFT_CONF_FILE) + except IOError as err: + if err.errno != errno.ENOENT: + self.logger.exception('Error reading swift.conf') + return {SWIFT_CONF_FILE: hexsum} + def get_quarantine_count(self): """get obj/container/account quarantine counts""" - qcounts = {"objects": 0, "containers": 0, "accounts": 0} + qcounts = {"objects": 0, "containers": 0, "accounts": 0, + "policies": {}} qdir = "quarantined" for device in os.listdir(self.devices): - for qtype in qcounts: - qtgt = os.path.join(self.devices, device, qdir, qtype) - if os.path.exists(qtgt): + qpath = os.path.join(self.devices, device, qdir) + if os.path.exists(qpath): + for qtype in os.listdir(qpath): + qtgt = os.path.join(qpath, qtype) linkcount = os.lstat(qtgt).st_nlink if linkcount > 2: - qcounts[qtype] += linkcount - 2 + if qtype.startswith('objects'): + if '-' in qtype: + pkey = qtype.split('-', 1)[1] + else: + pkey = '0' + qcounts['policies'].setdefault(pkey, + {'objects': 0}) + qcounts['policies'][pkey]['objects'] \ + += linkcount - 2 + qcounts['objects'] += linkcount - 2 + else: + qcounts[qtype] += linkcount - 2 return qcounts def get_socket_info(self, openr=open): @@ -272,8 +348,21 @@ def get_socket_info(self, openr=open): raise return sockstat + def get_time(self): + """get current time""" + + return time.time() + + def get_relinker_info(self): + """get relinker info, if any""" + + stat_keys = ['devices', 'workers'] + return self._from_recon_cache(stat_keys, + 
self.relink_recon_cache, + ignore_missing=True) + def GET(self, req): - root, rcheck, rtype = split_path(req.path, 1, 3, True) + root, rcheck, rtype = req.split_path(1, 3, True) all_rtypes = ['account', 'container', 'object'] if rcheck == "mem": content = self.get_mem() @@ -284,7 +373,7 @@ def GET(self, req): elif rcheck == 'replication' and rtype in all_rtypes: content = self.get_replication_info(rtype) elif rcheck == 'replication' and rtype is None: - #handle old style object replication requests + # handle old style object replication requests content = self.get_replication_info('object') elif rcheck == "devices": content = self.get_device_info() @@ -302,10 +391,24 @@ def GET(self, req): content = self.get_diskusage() elif rcheck == "ringmd5": content = self.get_ring_md5() + elif rcheck == "swiftconfmd5": + content = self.get_swift_conf_md5() elif rcheck == "quarantined": content = self.get_quarantine_count() elif rcheck == "sockstat": content = self.get_socket_info() + elif rcheck == "version": + content = self.get_version() + elif rcheck == "driveaudit": + content = self.get_driveaudit_error() + elif rcheck == "time": + content = self.get_time() + elif rcheck == "sharding": + content = self.get_sharding_info() + elif rcheck == "relinker": + content = self.get_relinker_info() + elif rcheck == "reconstruction" and rtype == 'object': + content = self.get_reconstruction_info() else: content = "Invalid path: %s" % req.path return Response(request=req, status="404 Not Found", diff --git a/swift/common/middleware/s3api/__init__.py b/swift/common/middleware/s3api/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/swift/common/middleware/s3api/acl_handlers.py b/swift/common/middleware/s3api/acl_handlers.py new file mode 100644 index 0000000000..f5b8c587b9 --- /dev/null +++ b/swift/common/middleware/s3api/acl_handlers.py @@ -0,0 +1,485 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +------------ +Acl Handlers +------------ + +Why do we need this +^^^^^^^^^^^^^^^^^^^ + +To make controller classes clean, we need these handlers. +It is really useful for customizing acl checking algorithms for +each controller. + +Basic Information +^^^^^^^^^^^^^^^^^ + +BaseAclHandler wraps basic Acl handling. +(i.e. it will check acl from ACL_MAP by using HEAD) + +How to extend +^^^^^^^^^^^^^ + +Make a handler with the name of the controller. +(e.g. BucketAclHandler is for BucketController) +It consists of method(s) for actual S3 method on controllers as follows. + +Example:: + + class BucketAclHandler(BaseAclHandler): + def PUT: + << put acl handling algorithms here for PUT bucket >> + +.. note:: + If the method DON'T need to recall _get_response in outside of + acl checking, the method have to return the response it needs at + the end of method. 
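As a concrete counterpart to the pseudocode example above, a handler for a hypothetical controller could look like the sketch below. The class name and the permission choice are illustrative assumptions; only BaseAclHandler and _handle_acl come from this module.

# hypothetical handler, not part of the patch
class WidgetAclHandler(BaseAclHandler):
    def PUT(self, app):
        # check WRITE on the bucket (via a HEAD with an empty object name)
        # before the backend PUT goes out; falling through (returning None)
        # lets the caller continue with the real request
        self._handle_acl(app, 'HEAD', obj='', permission='WRITE')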
+ +""" +from swift.common.middleware.s3api.subresource import ACL, Owner, encode_acl +from swift.common.middleware.s3api.s3response import MissingSecurityHeader, \ + MalformedACLError, UnexpectedContent, AccessDenied +from swift.common.middleware.s3api.etree import fromstring, XMLSyntaxError, \ + DocumentInvalid +from swift.common.middleware.s3api.utils import MULTIUPLOAD_SUFFIX, \ + sysmeta_header + + +def get_acl_handler(controller_name): + for base_klass in [BaseAclHandler, MultiUploadAclHandler]: + # pylint: disable-msg=E1101 + for handler in base_klass.__subclasses__(): + handler_suffix_len = len('AclHandler') \ + if not handler.__name__ == 'S3AclHandler' else len('Handler') + if handler.__name__[:-handler_suffix_len] == controller_name: + return handler + return BaseAclHandler + + +class BaseAclHandler(object): + """ + BaseAclHandler: Handling ACL for basic requests mapped on ACL_MAP + """ + def __init__(self, req, logger, container=None, obj=None, headers=None): + self.req = req + self.container = req.container_name if container is None else container + self.obj = req.object_name if obj is None else obj + self.method = req.environ['REQUEST_METHOD'] + self.user_id = self.req.user_id + self.headers = req.headers if headers is None else headers + self.logger = logger + + def request_with(self, container, obj, headers): + return type(self)(self.req, self.logger, + container=container, obj=obj, headers=headers) + + def handle_acl(self, app, method, container=None, obj=None, headers=None): + method = method or self.method + + ah = self.request_with(container, obj, headers) + if hasattr(ah, method): + return getattr(ah, method)(app) + else: + return ah._handle_acl(app, method) + + def _handle_acl(self, app, sw_method, container=None, obj=None, + permission=None, headers=None): + """ + General acl handling method. + This method expects to call Request._get_response() in outside of + this method so that this method returns response only when sw_method + is HEAD. + """ + + container = self.container if container is None else container + obj = self.obj if obj is None else obj + sw_method = sw_method or self.req.environ['REQUEST_METHOD'] + resource = 'object' if obj else 'container' + headers = self.headers if headers is None else headers + + self.logger.debug( + 'checking permission: %s %s %s %s' % + (container, obj, sw_method, dict(headers))) + + if not container: + return + + if not permission and (self.method, sw_method, resource) in ACL_MAP: + acl_check = ACL_MAP[(self.method, sw_method, resource)] + resource = acl_check.get('Resource') or resource + permission = acl_check['Permission'] + + if not permission: + self.logger.debug( + '%s %s %s %s' % (container, obj, sw_method, headers)) + raise Exception('No permission to be checked exists') + + if resource == 'object': + version_id = self.req.params.get('versionId') + if version_id is None: + query = {} + else: + query = {'version-id': version_id} + if self.req.method == 'HEAD': + # This HEAD for ACL is going to also be the definitive response + # to the client so we need to include client params. We don't + # do this for other client request methods because they may + # have invalid combinations of params and headers for a swift + # HEAD request. 
+ part_number = self.req.params.get('partNumber') + if part_number is not None: + query['part-number'] = part_number + resp = self.req.get_acl_response(app, 'HEAD', + container, obj, + headers, query=query) + acl = resp.object_acl + elif resource == 'container': + resp = self.req.get_acl_response(app, 'HEAD', + container, '') + acl = resp.bucket_acl + + try: + acl.check_permission(self.user_id, permission) + except Exception as e: + self.logger.debug(acl) + self.logger.debug('permission denined: %s %s %s' % + (e, self.user_id, permission)) + raise + + if sw_method == 'HEAD': + return resp + + def get_acl(self, headers, body, bucket_owner, object_owner=None): + """ + Get ACL instance from S3 (e.g. x-amz-grant) headers or S3 acl xml body. + """ + acl = ACL.from_headers(headers, bucket_owner, object_owner, + as_private=False) + + if acl is None: + # Get acl from request body if possible. + if not body: + raise MissingSecurityHeader(missing_header_name='x-amz-acl') + try: + elem = fromstring(body, ACL.root_tag) + acl = ACL.from_elem( + elem, True, self.req.conf.allow_no_owner) + except (XMLSyntaxError, DocumentInvalid): + raise MalformedACLError() + except Exception as e: + self.logger.error(e) + raise + else: + if body: + # Specifying grant with both header and xml is not allowed. + raise UnexpectedContent() + + return acl + + +class BucketAclHandler(BaseAclHandler): + """ + BucketAclHandler: Handler for BucketController + """ + def DELETE(self, app): + if self.container.endswith(MULTIUPLOAD_SUFFIX): + # anyways, delete multiupload container doesn't need acls + # because it depends on GET segment container result for + # cleanup + pass + else: + return self._handle_acl(app, 'DELETE') + + def HEAD(self, app): + if self.method == 'DELETE': + return self._handle_acl(app, 'DELETE') + else: + return self._handle_acl(app, 'HEAD') + + def GET(self, app): + if self.method == 'DELETE' and \ + self.container.endswith(MULTIUPLOAD_SUFFIX): + pass + else: + return self._handle_acl(app, 'GET') + + def PUT(self, app): + req_acl = ACL.from_headers(self.req.headers, + Owner(self.user_id, self.user_id)) + + if not self.req.environ.get('swift_owner'): + raise AccessDenied() + + # To avoid overwriting the existing bucket's ACL, we send PUT + # request first before setting the ACL to make sure that the target + # container does not exist. + self.req.get_acl_response(app, 'PUT', self.container) + + # update metadata + self.req.bucket_acl = req_acl + + # FIXME If this request is failed, there is a possibility that the + # bucket which has no ACL is left. 
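        # Clarifying note (not part of the original patch): bucket creation
        # (the PUT above) and ACL application (the POST below) are two
        # separate backend requests, which is why the window described in
        # the FIXME exists.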
+ return self.req.get_acl_response(app, 'POST') + + +class ObjectAclHandler(BaseAclHandler): + """ + ObjectAclHandler: Handler for ObjectController + """ + def HEAD(self, app): + # No check object permission needed at DELETE Object + if self.method != 'DELETE': + return self._handle_acl(app, 'HEAD') + + def PUT(self, app): + b_resp = self._handle_acl(app, 'HEAD', obj='') + req_acl = ACL.from_headers(self.req.headers, + b_resp.bucket_acl.owner, + Owner(self.user_id, self.user_id)) + self.req.object_acl = req_acl + + +class S3AclHandler(BaseAclHandler): + """ + S3AclHandler: Handler for S3AclController + """ + def HEAD(self, app): + self._handle_acl(app, 'HEAD', permission='READ_ACP') + + def GET(self, app): + self._handle_acl(app, 'HEAD', permission='READ_ACP') + + def PUT(self, app): + if self.req.is_object_request: + b_resp = self.req.get_acl_response(app, 'HEAD', obj='') + o_resp = self._handle_acl(app, 'HEAD', permission='WRITE_ACP') + req_acl = self.get_acl(self.req.headers, + self.req.xml(ACL.max_xml_length), + b_resp.bucket_acl.owner, + o_resp.object_acl.owner) + + # Don't change the owner of the resource by PUT acl request. + o_resp.object_acl.check_owner(req_acl.owner.id) + + for g in req_acl.grants: + self.logger.debug( + 'Grant %s %s permission on the object /%s/%s' % + (g.grantee, g.permission, self.req.container_name, + self.req.object_name)) + self.req.object_acl = req_acl + else: + self._handle_acl(app, self.method) + + def POST(self, app): + if self.req.is_bucket_request: + resp = self._handle_acl(app, 'HEAD', permission='WRITE_ACP') + + req_acl = self.get_acl(self.req.headers, + self.req.xml(ACL.max_xml_length), + resp.bucket_acl.owner) + + # Don't change the owner of the resource by PUT acl request. + resp.bucket_acl.check_owner(req_acl.owner.id) + + for g in req_acl.grants: + self.logger.debug( + 'Grant %s %s permission on the bucket /%s' % + (g.grantee, g.permission, self.req.container_name)) + self.req.bucket_acl = req_acl + else: + self._handle_acl(app, self.method) + + +class MultiObjectDeleteAclHandler(BaseAclHandler): + """ + MultiObjectDeleteAclHandler: Handler for MultiObjectDeleteController + """ + def HEAD(self, app): + # Only bucket write acl is required + if not self.obj: + return self._handle_acl(app, 'HEAD') + + def DELETE(self, app): + # Only bucket write acl is required + pass + + +class MultiUploadAclHandler(BaseAclHandler): + """ + MultiUpload stuff requires acl checking just once for BASE container + so that MultiUploadAclHandler extends BaseAclHandler to check acl only + when the verb defined. We should define the verb as the first step to + request to backend Swift at incoming request. + + Basic Rules: + - BASE container name is always w/o 'MULTIUPLOAD_SUFFIX' + - Any check timing is ok but we should check it as soon as possible. + + ========== ====== ============= ========== + Controller Verb CheckResource Permission + ========== ====== ============= ========== + Part PUT Container WRITE + Uploads GET Container READ + Uploads POST Container WRITE + Upload GET Container READ + Upload DELETE Container WRITE + Upload POST Container WRITE + ========== ====== ============= ========== + + """ + def __init__(self, req, logger, **kwargs): + super(MultiUploadAclHandler, self).__init__(req, logger, **kwargs) + self.acl_checked = False + + def handle_acl(self, app, method, container=None, obj=None, headers=None): + method = method or self.method + ah = self.request_with(container, obj, headers) + # MultiUpload stuffs don't need acl check basically. 
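        # Clarifying note (not part of the original patch): unlike
        # BaseAclHandler.handle_acl there is no _handle_acl fallback below,
        # so Swift methods without an explicit handler method are passed
        # through without an ACL check.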
+ if hasattr(ah, method): + return getattr(ah, method)(app) + + def HEAD(self, app): + # For _check_upload_info + self._handle_acl(app, 'HEAD', self.container, '') + + +class PartAclHandler(MultiUploadAclHandler): + """ + PartAclHandler: Handler for PartController + """ + def __init__(self, req, logger, **kwargs): + # pylint: disable-msg=E1003 + super(MultiUploadAclHandler, self).__init__(req, logger, **kwargs) + + def HEAD(self, app): + if self.container.endswith(MULTIUPLOAD_SUFFIX): + # For _check_upload_info + container = self.container[:-len(MULTIUPLOAD_SUFFIX)] + self._handle_acl(app, 'HEAD', container, '') + else: + # For check_copy_source + return self._handle_acl(app, 'HEAD', self.container, self.obj) + + +class UploadsAclHandler(MultiUploadAclHandler): + """ + UploadsAclHandler: Handler for UploadsController + """ + def handle_acl(self, app, method, *args, **kwargs): + method = method or self.method + if hasattr(self, method): + return getattr(self, method)(app) + else: + pass + + def GET(self, app): + # List Multipart Upload + self._handle_acl(app, 'GET', self.container, '') + + def PUT(self, app): + if not self.acl_checked: + resp = self._handle_acl(app, 'HEAD', obj='') + req_acl = ACL.from_headers(self.req.headers, + resp.bucket_acl.owner, + Owner(self.user_id, self.user_id)) + acl_headers = encode_acl('object', req_acl) + self.req.headers[sysmeta_header('object', 'tmpacl')] = \ + acl_headers[sysmeta_header('object', 'acl')] + self.acl_checked = True + + +class UploadAclHandler(MultiUploadAclHandler): + """ + UploadAclHandler: Handler for UploadController + """ + def handle_acl(self, app, method, *args, **kwargs): + method = method or self.method + if hasattr(self, method): + return getattr(self, method)(app) + else: + pass + + def HEAD(self, app): + # FIXME: GET HEAD case conflicts with GET service + method = 'GET' if self.method == 'GET' else 'HEAD' + self._handle_acl(app, method, self.container, '') + + def PUT(self, app): + container = self.req.container_name + MULTIUPLOAD_SUFFIX + obj = '%s/%s' % (self.obj, self.req.params['uploadId']) + resp = self.req._get_response(app, 'HEAD', container, obj) + self.req.headers[sysmeta_header('object', 'acl')] = \ + resp.sysmeta_headers.get(sysmeta_header('object', 'tmpacl')) + + +""" +ACL_MAP = + { + ('', '', ''): + {'Resource': '', + 'Permission': ''}, + ... 
+ } + +s3_method: Method of S3 Request from user to s3api +swift_method: Method of Swift Request from s3api to swift +swift_resource: Resource of Swift Request from s3api to swift +check_resource: +check_permission: +""" +ACL_MAP = { + # HEAD Bucket + ('HEAD', 'HEAD', 'container'): + {'Permission': 'READ'}, + # GET Service + ('GET', 'HEAD', 'container'): + {'Permission': 'OWNER'}, + # GET Bucket, List Parts, List Multipart Upload + ('GET', 'GET', 'container'): + {'Permission': 'READ'}, + # PUT Object, PUT Object Copy + ('PUT', 'HEAD', 'container'): + {'Permission': 'WRITE'}, + # DELETE Bucket + ('DELETE', 'DELETE', 'container'): + {'Permission': 'OWNER'}, + # HEAD Object + ('HEAD', 'HEAD', 'object'): + {'Permission': 'READ'}, + # GET Object + ('GET', 'GET', 'object'): + {'Permission': 'READ'}, + # PUT Object Copy, Upload Part Copy + ('PUT', 'HEAD', 'object'): + {'Permission': 'READ'}, + # Abort Multipart Upload + ('DELETE', 'HEAD', 'container'): + {'Permission': 'WRITE'}, + # Delete Object + ('DELETE', 'DELETE', 'object'): + {'Resource': 'container', + 'Permission': 'WRITE'}, + # Complete Multipart Upload, DELETE Multiple Objects, + # Initiate Multipart Upload + ('POST', 'HEAD', 'container'): + {'Permission': 'WRITE'}, + # Versioning + ('PUT', 'POST', 'container'): + {'Permission': 'WRITE'}, + ('DELETE', 'GET', 'container'): + {'Permission': 'WRITE'}, +} diff --git a/swift/common/middleware/s3api/acl_utils.py b/swift/common/middleware/s3api/acl_utils.py new file mode 100644 index 0000000000..b2821a3d2b --- /dev/null +++ b/swift/common/middleware/s3api/acl_utils.py @@ -0,0 +1,100 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.middleware.s3api.exception import ACLError +from swift.common.middleware.s3api.etree import fromstring, XMLSyntaxError, \ + DocumentInvalid, XMLNS_XSI +from swift.common.middleware.s3api.s3response import S3NotImplemented, \ + MalformedACLError, InvalidArgument + + +def swift_acl_translate(acl, group='', user='', xml=False): + """ + Takes an S3 style ACL and returns a list of header/value pairs that + implement that ACL in Swift, or "NotImplemented" if there isn't a way to do + that yet. 
+ """ + swift_acl = {} + swift_acl['public-read'] = [['X-Container-Read', '.r:*,.rlistings']] + # Swift does not support public write: + # https://answers.launchpad.net/swift/+question/169541 + swift_acl['public-read-write'] = [['X-Container-Write', '.r:*'], + ['X-Container-Read', + '.r:*,.rlistings']] + + # TODO: if there's a way to get group and user, this should work for + # private: + # swift_acl['private'] = \ + # [['HTTP_X_CONTAINER_WRITE', group + ':' + user], \ + # ['HTTP_X_CONTAINER_READ', group + ':' + user]] + swift_acl['private'] = [['X-Container-Write', '.'], + ['X-Container-Read', '.']] + + # Swift doesn't have per-object ACLs, so this is best-effort + swift_acl['bucket-owner-full-control'] = swift_acl['private'] + swift_acl['bucket-owner-read'] = swift_acl['private'] + + if xml: + # We are working with XML and need to parse it + try: + elem = fromstring(acl, 'AccessControlPolicy') + except (XMLSyntaxError, DocumentInvalid): + raise MalformedACLError() + acl = 'unknown' + for grant in elem.findall('./AccessControlList/Grant'): + permission = grant.find('./Permission').text + grantee = grant.find('./Grantee').get('{%s}type' % XMLNS_XSI) + if permission == "FULL_CONTROL" and grantee == 'CanonicalUser' and\ + acl != 'public-read' and acl != 'public-read-write': + acl = 'private' + elif permission == "READ" and grantee == 'Group' and\ + acl != 'public-read-write': + acl = 'public-read' + elif permission == "WRITE" and grantee == 'Group': + acl = 'public-read-write' + else: + acl = 'unsupported' + + if acl in ('authenticated-read', 'log-delivery-write'): + raise S3NotImplemented() + elif acl not in swift_acl: + raise ACLError() + + return swift_acl[acl] + + +def handle_acl_header(req): + """ + Handle the x-amz-acl header. + Note that this header currently used for only normal-acl + (not implemented) on s3acl. + TODO: add translation to swift acl like as x-container-read to s3acl + """ + + amz_acl = req.environ['HTTP_X_AMZ_ACL'] + # Translate the Amazon ACL to something that can be + # implemented in Swift, 501 otherwise. Swift uses POST + # for ACLs, whereas S3 uses PUT. + del req.environ['HTTP_X_AMZ_ACL'] + if req.query_string: + req.query_string = '' + + try: + translated_acl = swift_acl_translate(amz_acl) + except ACLError: + raise InvalidArgument('x-amz-acl', amz_acl) + + for header, acl in translated_acl: + req.headers[header] = acl diff --git a/swift/common/middleware/s3api/controllers/__init__.py b/swift/common/middleware/s3api/controllers/__init__.py new file mode 100644 index 0000000000..9e14bd0035 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
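For orientation, the canned-ACL translation in acl_utils.py above maps roughly as follows. This is an illustrative REPL-style sketch; the return values are taken from the swift_acl mapping in the code.

from swift.common.middleware.s3api.acl_utils import swift_acl_translate

swift_acl_translate('public-read')
# -> [['X-Container-Read', '.r:*,.rlistings']]
swift_acl_translate('private')
# -> [['X-Container-Write', '.'], ['X-Container-Read', '.']]
swift_acl_translate('authenticated-read')
# -> raises S3NotImplemented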
+ +from swift.common.middleware.s3api.controllers.base import Controller, \ + UnsupportedController +from swift.common.middleware.s3api.controllers.service import ServiceController +from swift.common.middleware.s3api.controllers.bucket import BucketController +from swift.common.middleware.s3api.controllers.obj import ObjectController + +from swift.common.middleware.s3api.controllers.acl import AclController +from swift.common.middleware.s3api.controllers.s3_acl import S3AclController +from swift.common.middleware.s3api.controllers.multi_delete import \ + MultiObjectDeleteController +from swift.common.middleware.s3api.controllers.multi_upload import \ + UploadController, PartController, UploadsController +from swift.common.middleware.s3api.controllers.location import \ + LocationController +from swift.common.middleware.s3api.controllers.logging import \ + LoggingStatusController +from swift.common.middleware.s3api.controllers.versioning import \ + VersioningController +from swift.common.middleware.s3api.controllers.tagging import \ + TaggingController +from swift.common.middleware.s3api.controllers.object_lock import \ + ObjectLockController + +__all__ = [ + 'Controller', + 'ServiceController', + 'BucketController', + 'ObjectController', + + 'AclController', + 'S3AclController', + 'MultiObjectDeleteController', + 'PartController', + 'UploadsController', + 'UploadController', + 'LocationController', + 'LoggingStatusController', + 'VersioningController', + 'TaggingController', + 'ObjectLockController', + + 'UnsupportedController', +] diff --git a/swift/common/middleware/s3api/controllers/acl.py b/swift/common/middleware/s3api/controllers/acl.py new file mode 100644 index 0000000000..09d49a0bb1 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/acl.py @@ -0,0 +1,131 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
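The controller names exported above are also the keys that get_acl_handler() in acl_handlers.py matches against. A rough sketch of that lookup (hypothetical session, not part of the patch):

from swift.common.middleware.s3api.acl_handlers import get_acl_handler

get_acl_handler('Bucket')    # -> BucketAclHandler
get_acl_handler('S3Acl')     # -> S3AclHandler (only 'Handler' is stripped)
get_acl_handler('Location')  # -> BaseAclHandler, the default when no
                             #    dedicated handler exists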
+ +from swift.common.http import HTTP_OK +from swift.common.middleware.acl import parse_acl, referrer_allowed +from swift.common.utils import public + +from swift.common.middleware.s3api.exception import ACLError +from swift.common.middleware.s3api.controllers.base import Controller +from swift.common.middleware.s3api.s3response import ( + HTTPOk, S3NotImplemented, MalformedACLError, UnexpectedContent, + MissingSecurityHeader) +from swift.common.middleware.s3api.etree import Element, SubElement, tostring +from swift.common.middleware.s3api.acl_utils import swift_acl_translate, \ + XMLNS_XSI + + +MAX_ACL_BODY_SIZE = 200 * 1024 + + +def get_acl(account_name, headers): + """ + Attempts to construct an S3 ACL based on what is found in the swift headers + """ + + elem = Element('AccessControlPolicy') + owner = SubElement(elem, 'Owner') + SubElement(owner, 'ID').text = account_name + SubElement(owner, 'DisplayName').text = account_name + access_control_list = SubElement(elem, 'AccessControlList') + + # grant FULL_CONTROL to myself by default + grant = SubElement(access_control_list, 'Grant') + grantee = SubElement(grant, 'Grantee', nsmap={'xsi': XMLNS_XSI}) + grantee.set('{%s}type' % XMLNS_XSI, 'CanonicalUser') + SubElement(grantee, 'ID').text = account_name + SubElement(grantee, 'DisplayName').text = account_name + SubElement(grant, 'Permission').text = 'FULL_CONTROL' + + referrers, _ = parse_acl(headers.get('x-container-read')) + if referrer_allowed('unknown', referrers): + # grant public-read access + grant = SubElement(access_control_list, 'Grant') + grantee = SubElement(grant, 'Grantee', nsmap={'xsi': XMLNS_XSI}) + grantee.set('{%s}type' % XMLNS_XSI, 'Group') + SubElement(grantee, 'URI').text = \ + 'http://acs.amazonaws.com/groups/global/AllUsers' + SubElement(grant, 'Permission').text = 'READ' + + referrers, _ = parse_acl(headers.get('x-container-write')) + if referrer_allowed('unknown', referrers): + # grant public-write access + grant = SubElement(access_control_list, 'Grant') + grantee = SubElement(grant, 'Grantee', nsmap={'xsi': XMLNS_XSI}) + grantee.set('{%s}type' % XMLNS_XSI, 'Group') + SubElement(grantee, 'URI').text = \ + 'http://acs.amazonaws.com/groups/global/AllUsers' + SubElement(grant, 'Permission').text = 'WRITE' + + body = tostring(elem) + + return HTTPOk(body=body, content_type="text/plain") + + +class AclController(Controller): + """ + Handles the following APIs: + + * GET Bucket acl + * PUT Bucket acl + * GET Object acl + * PUT Object acl + + Those APIs are logged as ACL operations in the S3 server log. + """ + @public + def GET(self, req): + """ + Handles GET Bucket acl and GET Object acl. + """ + resp = req.get_response(self.app, method='HEAD') + + return get_acl(req.user_id, resp.headers) + + @public + def PUT(self, req): + """ + Handles PUT Bucket acl and PUT Object acl. + """ + if req.is_object_request: + # Handle Object ACL + raise S3NotImplemented() + else: + # Handle Bucket ACL + xml = req.xml(MAX_ACL_BODY_SIZE) + if all(['HTTP_X_AMZ_ACL' in req.environ, xml]): + # S3 doesn't allow to give ACL with both ACL header and body. + raise UnexpectedContent() + elif not any(['HTTP_X_AMZ_ACL' in req.environ, xml]): + # Both canned ACL header and xml body are missing + raise MissingSecurityHeader(missing_header_name='x-amz-acl') + else: + # correct ACL exists in the request + if xml: + # We very likely have an XML-based ACL request. 
+ # let's try to translate to the request header + try: + translated_acl = swift_acl_translate(xml, xml=True) + except ACLError: + raise MalformedACLError() + + for header, acl in translated_acl: + req.headers[header] = acl + + resp = req.get_response(self.app, 'POST') + resp.status = HTTP_OK + resp.headers.update({'Location': req.container_name}) + + return resp diff --git a/swift/common/middleware/s3api/controllers/base.py b/swift/common/middleware/s3api/controllers/base.py new file mode 100644 index 0000000000..3652b151fa --- /dev/null +++ b/swift/common/middleware/s3api/controllers/base.py @@ -0,0 +1,100 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + +from swift.common.middleware.s3api.s3response import S3NotImplemented, \ + InvalidRequest +from swift.common.middleware.s3api.utils import camel_to_snake + + +def bucket_operation(func=None, err_resp=None, err_msg=None): + """ + A decorator to ensure that the request is a bucket operation. If the + target resource is an object, this decorator updates the request by default + so that the controller handles it as a bucket operation. If 'err_resp' is + specified, this raises it on error instead. + """ + def _bucket_operation(func): + @functools.wraps(func) + def wrapped(self, req): + if not req.is_bucket_request: + if err_resp: + raise err_resp(msg=err_msg) + + self.logger.debug('A key is specified for bucket API.') + req.object_name = None + + return func(self, req) + + return wrapped + + if func: + return _bucket_operation(func) + else: + return _bucket_operation + + +def object_operation(func): + """ + A decorator to ensure that the request is an object operation. If the + target resource is not an object, this raises an error response. + """ + @functools.wraps(func) + def wrapped(self, req): + if not req.is_object_request: + raise InvalidRequest('A key must be specified') + + return func(self, req) + + return wrapped + + +def check_container_existence(func): + """ + A decorator to ensure the container existence. + """ + @functools.wraps(func) + def check_container(self, req): + req.get_container_info(self.app) + return func(self, req) + + return check_container + + +class Controller(object): + """ + Base WSGI controller class for the middleware + """ + def __init__(self, app, conf, logger, **kwargs): + self.app = app + self.conf = conf + self.logger = logger + + @classmethod + def resource_type(cls): + """ + Returns the target resource type of this controller. + """ + name = cls.__name__[:-len('Controller')] + return camel_to_snake(name).upper() + + +class UnsupportedController(Controller): + """ + Handles unsupported requests. 
+ """ + def __init__(self, app, conf, logger, **kwargs): + raise S3NotImplemented('The requested resource is not implemented') diff --git a/swift/common/middleware/s3api/controllers/bucket.py b/swift/common/middleware/s3api/controllers/bucket.py new file mode 100644 index 0000000000..c4c8530c16 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/bucket.py @@ -0,0 +1,415 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from base64 import standard_b64encode as b64encode +from base64 import standard_b64decode as b64decode + +from urllib.parse import quote + +from swift.common import swob +from swift.common.http import HTTP_OK +from swift.common.middleware.versioned_writes.object_versioning import \ + DELETE_MARKER_CONTENT_TYPE +from swift.common.utils import json, public, config_true_value, Timestamp, \ + cap_length +from swift.common.registry import get_swift_info + +from swift.common.middleware.s3api.controllers.base import Controller +from swift.common.middleware.s3api.etree import Element, SubElement, \ + tostring, fromstring, XMLSyntaxError, DocumentInvalid +from swift.common.middleware.s3api.s3response import \ + HTTPOk, S3NotImplemented, InvalidArgument, \ + MalformedXML, InvalidLocationConstraint, NoSuchBucket, \ + BucketNotEmpty, VersionedBucketNotEmpty, InternalError, \ + ServiceUnavailable, NoSuchKey +from swift.common.middleware.s3api.utils import MULTIUPLOAD_SUFFIX, S3Timestamp + +MAX_PUT_BUCKET_BODY_SIZE = 10240 + + +class BucketController(Controller): + """ + Handles bucket request. + """ + def _delete_segments_bucket(self, req): + """ + Before delete bucket, delete segments bucket if existing. + """ + container = req.container_name + MULTIUPLOAD_SUFFIX + marker = '' + seg = '' + + try: + resp = req.get_response(self.app, 'HEAD') + if int(resp.sw_headers['X-Container-Object-Count']) > 0: + if resp.sw_headers.get('X-Container-Sysmeta-Versions-Enabled'): + raise VersionedBucketNotEmpty() + else: + raise BucketNotEmpty() + # FIXME: This extra HEAD saves unexpected segment deletion + # but if a complete multipart upload happen while cleanup + # segment container below, completed object may be missing its + # segments unfortunately. To be safer, it might be good + # to handle if the segments can be deleted for each object. 
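            # Clarifying note (not part of the original patch): the HEAD
            # above targets the user-visible bucket, while 'container' is
            # the hidden segments bucket, i.e. the bucket name plus
            # MULTIUPLOAD_SUFFIX ('+segments').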
+ except NoSuchBucket: + pass + + try: + while True: + # delete all segments + resp = req.get_response(self.app, 'GET', container, + query={'format': 'json', + 'marker': marker}) + segments = json.loads(resp.body) + for seg in segments: + try: + req.get_response( + self.app, 'DELETE', container, + swob.bytes_to_wsgi(seg['name'].encode('utf8'))) + except NoSuchKey: + pass + except InternalError: + raise ServiceUnavailable() + if segments: + marker = seg['name'] + else: + break + req.get_response(self.app, 'DELETE', container) + except NoSuchBucket: + return + except (BucketNotEmpty, InternalError): + raise ServiceUnavailable() + + @public + def HEAD(self, req): + """ + Handle HEAD Bucket (Get Metadata) request + """ + resp = req.get_response(self.app) + + return HTTPOk(headers=resp.headers) + + def _parse_request_options(self, req, max_keys): + encoding_type = req.params.get('encoding-type') + if encoding_type is not None and encoding_type != 'url': + err_msg = 'Invalid Encoding Method specified in Request' + raise InvalidArgument('encoding-type', encoding_type, err_msg) + + # in order to judge that truncated is valid, check whether + # max_keys + 1 th element exists in swift. + query = { + 'limit': max_keys + 1, + } + if 'prefix' in req.params: + query['prefix'] = swob.wsgi_to_str(req.params['prefix']) + if 'delimiter' in req.params: + query['delimiter'] = swob.wsgi_to_str(req.params['delimiter']) + fetch_owner = False + if 'versions' in req.params: + query['versions'] = swob.wsgi_to_str(req.params['versions']) + listing_type = 'object-versions' + version_marker = swob.wsgi_to_str(req.params.get( + 'version-id-marker')) + if 'key-marker' in req.params: + query['marker'] = swob.wsgi_to_str(req.params['key-marker']) + if version_marker is not None: + if version_marker != 'null': + try: + Timestamp(version_marker) + except ValueError: + raise InvalidArgument( + 'version-id-marker', version_marker, + 'Invalid version id specified') + query['version_marker'] = version_marker + elif version_marker is not None: + err_msg = ('A version-id marker cannot be specified without ' + 'a key marker.') + raise InvalidArgument('version-id-marker', + version_marker, err_msg) + elif int(req.params.get('list-type', '1')) == 2: + listing_type = 'version-2' + if 'start-after' in req.params: + query['marker'] = swob.wsgi_to_str(req.params['start-after']) + # continuation-token overrides start-after + if 'continuation-token' in req.params: + decoded = b64decode( + req.params['continuation-token']).decode('utf8') + query['marker'] = decoded + if 'fetch-owner' in req.params: + fetch_owner = config_true_value(req.params['fetch-owner']) + else: + listing_type = 'version-1' + if 'marker' in req.params: + query['marker'] = swob.wsgi_to_str(req.params['marker']) + + return encoding_type, query, listing_type, fetch_owner + + def _build_versions_result(self, req, objects, encoding_type, + tag_max_keys, is_truncated): + elem = Element('ListVersionsResult') + SubElement(elem, 'Name').text = req.container_name + prefix = swob.wsgi_to_str(req.params.get('prefix')) + if prefix and encoding_type == 'url': + prefix = quote(prefix) + SubElement(elem, 'Prefix').text = prefix + key_marker = swob.wsgi_to_str(req.params.get('key-marker')) + if key_marker and encoding_type == 'url': + key_marker = quote(key_marker) + SubElement(elem, 'KeyMarker').text = key_marker + SubElement(elem, 'VersionIdMarker').text = swob.wsgi_to_str( + req.params.get('version-id-marker')) + if is_truncated: + if 'name' in objects[-1]: + SubElement(elem, 
'NextKeyMarker').text = \ + objects[-1]['name'] + SubElement(elem, 'NextVersionIdMarker').text = \ + objects[-1].get('version') or 'null' + if 'subdir' in objects[-1]: + SubElement(elem, 'NextKeyMarker').text = \ + objects[-1]['subdir'] + SubElement(elem, 'NextVersionIdMarker').text = 'null' + SubElement(elem, 'MaxKeys').text = str(tag_max_keys) + delimiter = swob.wsgi_to_str(req.params.get('delimiter')) + if delimiter is not None: + if encoding_type == 'url': + delimiter = quote(delimiter) + SubElement(elem, 'Delimiter').text = delimiter + if encoding_type == 'url': + SubElement(elem, 'EncodingType').text = encoding_type + SubElement(elem, 'IsTruncated').text = \ + 'true' if is_truncated else 'false' + return elem + + def _build_base_listing_element(self, req, encoding_type): + elem = Element('ListBucketResult') + SubElement(elem, 'Name').text = req.container_name + prefix = swob.wsgi_to_str(req.params.get('prefix')) + if prefix and encoding_type == 'url': + prefix = quote(prefix) + SubElement(elem, 'Prefix').text = prefix + return elem + + def _build_list_bucket_result_type_one(self, req, objects, encoding_type, + tag_max_keys, is_truncated): + elem = self._build_base_listing_element(req, encoding_type) + marker = swob.wsgi_to_str(req.params.get('marker')) + if marker and encoding_type == 'url': + marker = quote(marker) + SubElement(elem, 'Marker').text = marker + if is_truncated and 'delimiter' in req.params: + if 'name' in objects[-1]: + name = objects[-1]['name'] + else: + name = objects[-1]['subdir'] + if encoding_type == 'url': + name = quote(name.encode('utf-8')) + SubElement(elem, 'NextMarker').text = name + # XXX: really? no NextMarker when no delimiter?? + SubElement(elem, 'MaxKeys').text = str(tag_max_keys) + delimiter = swob.wsgi_to_str(req.params.get('delimiter')) + if delimiter: + if encoding_type == 'url': + delimiter = quote(delimiter) + SubElement(elem, 'Delimiter').text = delimiter + if encoding_type == 'url': + SubElement(elem, 'EncodingType').text = encoding_type + SubElement(elem, 'IsTruncated').text = \ + 'true' if is_truncated else 'false' + return elem + + def _build_list_bucket_result_type_two(self, req, objects, encoding_type, + tag_max_keys, is_truncated): + elem = self._build_base_listing_element(req, encoding_type) + if is_truncated: + if 'name' in objects[-1]: + SubElement(elem, 'NextContinuationToken').text = \ + b64encode(objects[-1]['name'].encode('utf8')) + if 'subdir' in objects[-1]: + SubElement(elem, 'NextContinuationToken').text = \ + b64encode(objects[-1]['subdir'].encode('utf8')) + if 'continuation-token' in req.params: + SubElement(elem, 'ContinuationToken').text = \ + swob.wsgi_to_str(req.params['continuation-token']) + start_after = swob.wsgi_to_str(req.params.get('start-after')) + if start_after is not None: + if encoding_type == 'url': + start_after = quote(start_after) + SubElement(elem, 'StartAfter').text = start_after + SubElement(elem, 'KeyCount').text = str(len(objects)) + SubElement(elem, 'MaxKeys').text = str(tag_max_keys) + delimiter = swob.wsgi_to_str(req.params.get('delimiter')) + if delimiter: + if encoding_type == 'url': + delimiter = quote(delimiter) + SubElement(elem, 'Delimiter').text = delimiter + if encoding_type == 'url': + SubElement(elem, 'EncodingType').text = encoding_type + SubElement(elem, 'IsTruncated').text = \ + 'true' if is_truncated else 'false' + return elem + + def _add_subdir(self, elem, o, encoding_type): + common_prefixes = SubElement(elem, 'CommonPrefixes') + name = o['subdir'] + if encoding_type == 'url': + 
name = quote(name.encode('utf-8')) + SubElement(common_prefixes, 'Prefix').text = name + + def _add_object(self, req, elem, o, encoding_type, listing_type, + fetch_owner): + name = o['name'] + if encoding_type == 'url': + name = quote(name.encode('utf-8')) + + if listing_type == 'object-versions': + if o['content_type'] == DELETE_MARKER_CONTENT_TYPE: + contents = SubElement(elem, 'DeleteMarker') + else: + contents = SubElement(elem, 'Version') + SubElement(contents, 'Key').text = name + SubElement(contents, 'VersionId').text = o.get( + 'version_id') or 'null' + if 'object_versioning' in get_swift_info(): + SubElement(contents, 'IsLatest').text = ( + 'true' if o['is_latest'] else 'false') + else: + SubElement(contents, 'IsLatest').text = 'true' + else: + contents = SubElement(elem, 'Contents') + SubElement(contents, 'Key').text = name + SubElement(contents, 'LastModified').text = \ + S3Timestamp.from_isoformat(o['last_modified']).s3xmlformat + if contents.tag != 'DeleteMarker': + if 's3_etag' in o: + # New-enough MUs are already in the right format + etag = o['s3_etag'] + elif 'slo_etag' in o: + # SLOs may be in something *close* to the MU format + etag = '"%s-N"' % o['slo_etag'].strip('"') + else: + # Normal objects just use the MD5 + etag = o['hash'] + if len(etag) < 2 or etag[::len(etag) - 1] != '""': + # Normal objects just use the MD5 + etag = '"%s"' % o['hash'] + # This also catches sufficiently-old SLOs, but we have + # no way to identify those from container listings + # Otherwise, somebody somewhere (proxyfs, maybe?) made this + # look like an RFC-compliant ETag; we don't need to + # quote-wrap. + SubElement(contents, 'ETag').text = etag + SubElement(contents, 'Size').text = str(o['bytes']) + if fetch_owner or listing_type != 'version-2': + owner = SubElement(contents, 'Owner') + SubElement(owner, 'ID').text = req.user_id + SubElement(owner, 'DisplayName').text = req.user_id + if contents.tag != 'DeleteMarker': + SubElement(contents, 'StorageClass').text = 'STANDARD' + + def _add_objects_to_result(self, req, elem, objects, encoding_type, + listing_type, fetch_owner): + for o in objects: + if 'subdir' in o: + self._add_subdir(elem, o, encoding_type) + else: + self._add_object(req, elem, o, encoding_type, listing_type, + fetch_owner) + + @public + def GET(self, req): + """ + Handle GET Bucket (List Objects) request + """ + tag_max_keys = req.get_validated_param( + 'max-keys', self.conf.max_bucket_listing) + # TODO: Separate max_bucket_listing and default_bucket_listing + max_keys = min(tag_max_keys, self.conf.max_bucket_listing) + + encoding_type, query, listing_type, fetch_owner = \ + self._parse_request_options(req, max_keys) + + resp = req.get_response(self.app, query=query) + + try: + objects = json.loads(resp.body) + except (TypeError, ValueError): + self.logger.error('Got non-JSON response trying to list %s: %r', + req.path, cap_length(resp.body, 60)) + raise + + is_truncated = max_keys > 0 and len(objects) > max_keys + objects = objects[:max_keys] + + if listing_type == 'object-versions': + func = self._build_versions_result + elif listing_type == 'version-2': + func = self._build_list_bucket_result_type_two + else: + func = self._build_list_bucket_result_type_one + elem = func(req, objects, encoding_type, tag_max_keys, is_truncated) + self._add_objects_to_result( + req, elem, objects, encoding_type, listing_type, fetch_owner) + + body = tostring(elem) + + return HTTPOk(body=body, content_type='application/xml') + + @public + def PUT(self, req): + """ + Handle PUT Bucket 
request + """ + xml = req.xml(MAX_PUT_BUCKET_BODY_SIZE) + if xml: + # check location + try: + elem = fromstring( + xml, 'CreateBucketConfiguration', self.logger) + location = elem.find('./LocationConstraint').text + except (XMLSyntaxError, DocumentInvalid): + raise MalformedXML() + except Exception as e: + self.logger.error(e) + raise + + if location not in (self.conf.location, + self.conf.location.lower()): + # s3api cannot support multiple regions currently. + raise InvalidLocationConstraint() + + resp = req.get_response(self.app) + + resp.status = HTTP_OK + resp.location = '/' + req.container_name + + return resp + + @public + def DELETE(self, req): + """ + Handle DELETE Bucket request + """ + # NB: object_versioning is responsible for cleaning up its container + if self.conf.allow_multipart_uploads: + self._delete_segments_bucket(req) + resp = req.get_response(self.app) + return resp + + @public + def POST(self, req): + """ + Handle POST Bucket request + """ + raise S3NotImplemented() diff --git a/swift/common/middleware/s3api/controllers/location.py b/swift/common/middleware/s3api/controllers/location.py new file mode 100644 index 0000000000..b4b288d833 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/location.py @@ -0,0 +1,42 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.utils import public + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation +from swift.common.middleware.s3api.etree import Element, tostring +from swift.common.middleware.s3api.s3response import HTTPOk + + +class LocationController(Controller): + """ + Handles GET Bucket location, which is logged as a LOCATION operation in the + S3 server log. + """ + @public + @bucket_operation + def GET(self, req): + """ + Handles GET Bucket location. + """ + req.get_response(self.app, method='HEAD') + + elem = Element('LocationConstraint') + if self.conf.location != 'us-east-1': + elem.text = self.conf.location + body = tostring(elem) + + return HTTPOk(body=body, content_type='application/xml') diff --git a/swift/common/middleware/s3api/controllers/logging.py b/swift/common/middleware/s3api/controllers/logging.py new file mode 100644 index 0000000000..5eec0151bc --- /dev/null +++ b/swift/common/middleware/s3api/controllers/logging.py @@ -0,0 +1,54 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
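Both the LocationConstraint check in the bucket PUT handler above and LocationController read the same location option. A minimal sketch of the relevant proxy-server.conf stanza follows; the egg:swift#s3api entry point and the rest of the filter section are assumptions, while the location option and its 'us-east-1' default come from the code.

[filter:s3api]
use = egg:swift#s3api
# region reported by GET Bucket location and accepted in
# CreateBucketConfiguration on bucket PUT
location = us-east-1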
+ +from swift.common.utils import public + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation +from swift.common.middleware.s3api.etree import Element, tostring +from swift.common.middleware.s3api.s3response import ( + HTTPOk, S3NotImplemented, NoLoggingStatusForKey) + + +class LoggingStatusController(Controller): + """ + Handles the following APIs: + + * GET Bucket logging + * PUT Bucket logging + + Those APIs are logged as LOGGING_STATUS operations in the S3 server log. + """ + @public + @bucket_operation(err_resp=NoLoggingStatusForKey) + def GET(self, req): + """ + Handles GET Bucket logging. + """ + req.get_response(self.app, method='HEAD') + + # logging disabled + elem = Element('BucketLoggingStatus') + body = tostring(elem) + + return HTTPOk(body=body, content_type='application/xml') + + @public + @bucket_operation(err_resp=NoLoggingStatusForKey) + def PUT(self, req): + """ + Handles PUT Bucket logging. + """ + raise S3NotImplemented() diff --git a/swift/common/middleware/s3api/controllers/multi_delete.py b/swift/common/middleware/s3api/controllers/multi_delete.py new file mode 100644 index 0000000000..e6d00dbd0d --- /dev/null +++ b/swift/common/middleware/s3api/controllers/multi_delete.py @@ -0,0 +1,181 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json + +from swift.common.constraints import MAX_OBJECT_NAME_LENGTH +from swift.common.http import HTTP_NO_CONTENT +from swift.common.swob import str_to_wsgi +from swift.common.utils import public, StreamingPile +from swift.common.registry import get_swift_info + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation +from swift.common.middleware.s3api.etree import Element, SubElement, \ + fromstring, tostring, XMLSyntaxError, DocumentInvalid +from swift.common.middleware.s3api.s3response import HTTPOk, \ + S3NotImplemented, NoSuchKey, ErrorResponse, MalformedXML, \ + UserKeyMustBeSpecified, AccessDenied, MissingRequestBodyError + + +class MultiObjectDeleteController(Controller): + """ + Handles Delete Multiple Objects, which is logged as a MULTI_OBJECT_DELETE + operation in the S3 server log. + """ + def _gen_error_body(self, error, elem, delete_list): + for key, version in delete_list: + error_elem = SubElement(elem, 'Error') + SubElement(error_elem, 'Key').text = key + if version is not None: + SubElement(error_elem, 'VersionId').text = version + SubElement(error_elem, 'Code').text = error.__class__.__name__ + SubElement(error_elem, 'Message').text = error._msg + + return tostring(elem) + + @public + @bucket_operation + def POST(self, req): + """ + Handles Delete Multiple Objects. 
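The request body parsed below has this general shape; the keys are hypothetical, a VersionId is optional per Object, and Quiet suppresses the per-key Deleted elements in the response::

    <Delete>
      <Quiet>true</Quiet>
      <Object>
        <Key>photos/cat.jpg</Key>
      </Object>
      <Object>
        <Key>photos/dog.jpg</Key>
        <VersionId>null</VersionId>
      </Object>
    </Delete>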
+ """ + def object_key_iter(elem): + for obj in elem.iterchildren('Object'): + key = obj.find('./Key').text + if not key: + raise UserKeyMustBeSpecified() + version = obj.find('./VersionId') + if version is not None: + version = version.text + + yield key, version + + max_body_size = min( + # FWIW, AWS limits multideletes to 1000 keys, and swift limits + # object names to 1024 bytes (by default). Add a factor of two to + # allow some slop. + 2 * self.conf.max_multi_delete_objects * MAX_OBJECT_NAME_LENGTH, + # But, don't let operators shoot themselves in the foot + 10 * 1024 * 1024) + + try: + xml = req.xml(max_body_size) + if not xml: + raise MissingRequestBodyError() + + req.require_md5(xml) + elem = fromstring(xml, 'Delete', self.logger) + + quiet = elem.find('./Quiet') + self.quiet = quiet is not None and quiet.text.lower() == 'true' + + delete_list = list(object_key_iter(elem)) + if len(delete_list) > self.conf.max_multi_delete_objects: + raise MalformedXML() + except (XMLSyntaxError, DocumentInvalid): + raise MalformedXML() + except ErrorResponse: + raise + except Exception as e: + self.logger.error(e) + raise + + elem = Element('DeleteResult') + + # check bucket existence + try: + req.get_response(self.app, 'HEAD') + except AccessDenied as error: + body = self._gen_error_body(error, elem, delete_list) + return HTTPOk(body=body) + + if 'object_versioning' not in get_swift_info() and any( + version not in ('null', None) + for _key, version in delete_list): + raise S3NotImplemented() + + def do_delete(base_req, key, version): + req = copy.copy(base_req) + req.environ = copy.copy(base_req.environ) + req.object_name = str_to_wsgi(key) + if version: + req.params = {'version-id': version, 'symlink': 'get'} + + try: + try: + query = req.gen_multipart_manifest_delete_query( + self.app, version=version) + except NoSuchKey: + query = {} + if version: + query['version-id'] = version + query['symlink'] = 'get' + + resp = req.get_response(self.app, method='DELETE', query=query, + headers={'Accept': 'application/json'}) + # If async segment cleanup is available, we expect to get + # back a 204; otherwise, the delete is synchronous and we + # have to read the response to actually do the SLO delete + if query.get('multipart-manifest') and \ + resp.status_int != HTTP_NO_CONTENT: + try: + delete_result = json.loads(resp.body) + if delete_result['Errors']: + # NB: bulk includes 404s in "Number Not Found", + # not "Errors" + msg_parts = [delete_result['Response Status']] + msg_parts.extend( + '%s: %s' % (obj, status) + for obj, status in delete_result['Errors']) + return key, {'code': 'SLODeleteError', + 'message': '\n'.join(msg_parts)} + # else, all good + except (ValueError, TypeError, KeyError): + # Logs get all the gory details + self.logger.exception( + 'Could not parse SLO delete response (%s): %s', + resp.status, resp.body) + # Client gets something more generic + return key, {'code': 'SLODeleteError', + 'message': 'Unexpected swift response'} + except NoSuchKey: + pass + except ErrorResponse as e: + return key, {'code': e.__class__.__name__, 'message': e._msg} + except Exception: + self.logger.exception( + 'Unexpected Error handling DELETE of %r %r' % ( + req.container_name, key)) + return key, {'code': 'Server Error', 'message': 'Server Error'} + + return key, None + + with StreamingPile(self.conf.multi_delete_concurrency) as pile: + for key, err in pile.asyncstarmap(do_delete, ( + (req, key, version) for key, version in delete_list)): + if err: + error = SubElement(elem, 'Error') + 
SubElement(error, 'Key').text = key + SubElement(error, 'Code').text = err['code'] + SubElement(error, 'Message').text = err['message'] + elif not self.quiet: + deleted = SubElement(elem, 'Deleted') + SubElement(deleted, 'Key').text = key + + body = tostring(elem) + + return HTTPOk(body=body) diff --git a/swift/common/middleware/s3api/controllers/multi_upload.py b/swift/common/middleware/s3api/controllers/multi_upload.py new file mode 100644 index 0000000000..657d9f30df --- /dev/null +++ b/swift/common/middleware/s3api/controllers/multi_upload.py @@ -0,0 +1,858 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Implementation of S3 Multipart Upload. + +This module implements S3 Multipart Upload APIs with the Swift SLO feature. +The following explains how S3api uses swift container and objects to store S3 +upload information: + +----------------- +[bucket]+segments +----------------- + +A container to store upload information. [bucket] is the original bucket +where multipart upload is initiated. + +----------------------------- +[bucket]+segments/[upload_id] +----------------------------- + +An object of the ongoing upload id. The object is empty and used for +checking the target upload status. If the object exists, it means that the +upload is initiated but not either completed or aborted. + +------------------------------------------- +[bucket]+segments/[upload_id]/[part_number] +------------------------------------------- + +The last suffix is the part number under the upload id. When the client uploads +the parts, they will be stored in the namespace with +[bucket]+segments/[upload_id]/[part_number]. + +Example listing result in the [bucket]+segments container:: + + [bucket]+segments/[upload_id1] # upload id object for upload_id1 + [bucket]+segments/[upload_id1]/1 # part object for upload_id1 + [bucket]+segments/[upload_id1]/2 # part object for upload_id1 + [bucket]+segments/[upload_id1]/3 # part object for upload_id1 + [bucket]+segments/[upload_id2] # upload id object for upload_id2 + [bucket]+segments/[upload_id2]/1 # part object for upload_id2 + [bucket]+segments/[upload_id2]/2 # part object for upload_id2 + . + . + +Those part objects are directly used as segments of a Swift +Static Large Object when the multipart upload is completed. 
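As a minimal sketch (an illustration, not code added by this patch; the bucket, key and upload id are hypothetical), the naming scheme above and its relation to the SLO manifest can be expressed as::

    MULTIUPLOAD_SUFFIX = '+segments'

    def part_path(bucket, key, upload_id, part_number):
        # container: mybucket+segments
        # object:    obj.dat/<upload_id>/1, obj.dat/<upload_id>/2, ...
        container = bucket + MULTIUPLOAD_SUFFIX
        obj = '%s/%s/%d' % (key, upload_id, part_number)
        return '/%s/%s' % (container, obj)

    part_path('mybucket', 'obj.dat', 'X', 1)
    # -> '/mybucket+segments/obj.dat/X/1', which is the same 'path' value that
    # the Complete Multipart Upload handler later writes into the SLO manifest.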
+ +""" + +import binascii +import copy +import os +import re +import time + +from swift.common import constraints +from swift.common.swob import Range, bytes_to_wsgi, normalize_etag, \ + wsgi_to_str +from swift.common.utils import json, public, reiterate, md5, Timestamp +from swift.common.request_helpers import get_container_update_override_key, \ + get_param + +from urllib.parse import quote, urlparse + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation, object_operation, check_container_existence +from swift.common.middleware.s3api.s3response import InvalidArgument, \ + ErrorResponse, MalformedXML, KeyTooLongError, InvalidPart, \ + BucketAlreadyExists, EntityTooSmall, InvalidPartOrder, InvalidRequest, \ + HTTPOk, HTTPNoContent, NoSuchKey, NoSuchUpload, NoSuchBucket, \ + BucketAlreadyOwnedByYou, ServiceUnavailable, PreconditionFailed, \ + S3NotImplemented +from swift.common.middleware.s3api.utils import unique_id, \ + MULTIUPLOAD_SUFFIX, S3Timestamp, sysmeta_header +from swift.common.middleware.s3api.etree import Element, SubElement, \ + fromstring, tostring, XMLSyntaxError, DocumentInvalid +from swift.common.storage_policy import POLICIES + +DEFAULT_MAX_PARTS_LISTING = 1000 +DEFAULT_MAX_UPLOADS = 1000 + +MAX_COMPLETE_UPLOAD_BODY_SIZE = 2048 * 1024 + + +def _get_upload_info(req, app, upload_id): + """ + Make a HEAD request for existing upload object metadata. Tries the upload + marker first, and then falls back to the manifest object. + + :param req: an S3Request object. + :param app: the wsgi app. + :param upload_id: the upload id. + :returns: a tuple of (S3Response, boolean) where the boolean is True if the + response is from the upload marker and False otherwise. + :raises: NoSuchUpload if neither the marker nor the manifest were found. + """ + + container = req.container_name + MULTIUPLOAD_SUFFIX + obj = '%s/%s' % (req.object_name, upload_id) + + # XXX: if we leave the copy-source header, somewhere later we might + # drop in a ?version-id=... query string that's utterly inappropriate + # for the upload marker. Until we get around to fixing that, just pop + # it off for now... + copy_source = req.headers.pop('X-Amz-Copy-Source', None) + try: + resp = req.get_response(app, 'HEAD', container=container, obj=obj) + return resp, True + except NoSuchKey: + # ensure consistent path and policy are logged despite manifest HEAD + upload_marker_path = req.environ.get('s3api.backend_path') + policy_index = req.policy_index + try: + resp = req.get_response(app, 'HEAD') + if resp.sysmeta_headers.get(sysmeta_header( + 'object', 'upload-id')) == upload_id: + return resp, False + except NoSuchKey: + pass + finally: + # Ops often find it more useful for us to log the upload marker + # path, so put it back + if upload_marker_path is not None: + req.environ['s3api.backend_path'] = upload_marker_path + if policy_index is not None: + req.policy_index = policy_index + raise NoSuchUpload(upload_id=upload_id) + finally: + # ...making sure to restore any copy-source before returning + if copy_source is not None: + req.headers['X-Amz-Copy-Source'] = copy_source + + +def _make_complete_body(req, s3_etag, yielded_anything): + result_elem = Element('CompleteMultipartUploadResult') + + # NOTE: boto with sig v4 appends port to HTTP_HOST value at + # the request header when the port is non default value and it + # makes req.host_url like as http://localhost:8080:8080/path + # that obviously invalid. 
Probably it should be resolved at + # swift.common.swob though, tentatively we are parsing and + # reconstructing the correct host_url info here. + # in detail, https://github.com/boto/boto/pull/3513 + parsed_url = urlparse(req.host_url) + host_url = '%s://%s' % (parsed_url.scheme, parsed_url.hostname) + # Why are we doing our own port parsing? Because py3 decided + # to start raising ValueErrors on access after parsing such + # an invalid port + netloc = parsed_url.netloc.split('@')[-1].split(']')[-1] + if ':' in netloc: + port = netloc.split(':', 2)[1] + host_url += ':%s' % port + + SubElement(result_elem, 'Location').text = host_url + req.path + SubElement(result_elem, 'Bucket').text = req.container_name + SubElement(result_elem, 'Key').text = wsgi_to_str(req.object_name) + SubElement(result_elem, 'ETag').text = '"%s"' % s3_etag + body = tostring(result_elem, xml_declaration=not yielded_anything) + if yielded_anything: + return b'\n' + body + return body + + +class PartController(Controller): + """ + Handles the following APIs: + + * Upload Part + * Upload Part - Copy + + Those APIs are logged as PART operations in the S3 server log. + """ + @public + @object_operation + @check_container_existence + def PUT(self, req): + """ + Handles Upload Part and Upload Part Copy. + """ + + if 'uploadId' not in req.params: + raise InvalidArgument('ResourceType', 'partNumber', + 'Unexpected query string parameter') + + part_number = req.validate_part_number() + + upload_id = get_param(req, 'uploadId') + _get_upload_info(req, self.app, upload_id) + + req.container_name += MULTIUPLOAD_SUFFIX + req.object_name = '%s/%s/%d' % (req.object_name, upload_id, + part_number) + + req_timestamp = S3Timestamp.now() + req.headers['X-Timestamp'] = req_timestamp.internal + source_resp = req.check_copy_source(self.app) + if 'X-Amz-Copy-Source' in req.headers and \ + 'X-Amz-Copy-Source-Range' in req.headers: + rng = req.headers['X-Amz-Copy-Source-Range'] + + header_valid = True + try: + rng_obj = Range(rng) + if len(rng_obj.ranges) != 1: + header_valid = False + except ValueError: + header_valid = False + if not header_valid: + err_msg = ('The x-amz-copy-source-range value must be of the ' + 'form bytes=first-last where first and last are ' + 'the zero-based offsets of the first and last ' + 'bytes to copy') + raise InvalidArgument('x-amz-source-range', rng, err_msg) + + source_size = int(source_resp.headers['Content-Length']) + if not rng_obj.ranges_for_length(source_size): + err_msg = ('Range specified is not valid for source object ' + 'of size: %s' % source_size) + raise InvalidArgument('x-amz-source-range', rng, err_msg) + + req.headers['Range'] = rng + del req.headers['X-Amz-Copy-Source-Range'] + if 'X-Amz-Copy-Source' in req.headers: + # Clear some problematic headers that might be on the source + req.headers.update({ + sysmeta_header('object', 'etag'): '', + 'X-Object-Sysmeta-Swift3-Etag': '', # for legacy data + 'X-Object-Sysmeta-Slo-Etag': '', + 'X-Object-Sysmeta-Slo-Size': '', + get_container_update_override_key('etag'): '', + }) + resp = req.get_response(self.app) + + if 'X-Amz-Copy-Source' in req.headers: + resp.append_copy_resp_body(req.controller_name, + req_timestamp.s3xmlformat) + + resp.status = 200 + return resp + + +class UploadsController(Controller): + """ + Handles the following APIs: + + * List Multipart Uploads + * Initiate Multipart Upload + + Those APIs are logged as UPLOADS operations in the S3 server log. 
+ """ + @public + @bucket_operation(err_resp=InvalidRequest, + err_msg="Key is not expected for the GET method " + "?uploads subresource") + @check_container_existence + def GET(self, req): + """ + Handles List Multipart Uploads + """ + + def separate_uploads(uploads, prefix, delimiter): + """ + separate_uploads will separate uploads into non_delimited_uploads + (a subset of uploads) and common_prefixes according to the + specified delimiter. non_delimited_uploads is a list of uploads + which exclude the delimiter. common_prefixes is a set of prefixes + prior to the specified delimiter. Note that the prefix in the + common_prefixes includes the delimiter itself. + + i.e. if '/' delimiter specified and then the uploads is consists of + ['foo', 'foo/bar'], this function will return (['foo'], ['foo/']). + + :param uploads: A list of uploads dictionary + :param prefix: A string of prefix reserved on the upload path. + (i.e. the delimiter must be searched behind the + prefix) + :param delimiter: A string of delimiter to split the path in each + upload + + :return (non_delimited_uploads, common_prefixes) + """ + non_delimited_uploads = [] + common_prefixes = set() + for upload in uploads: + key = upload['key'] + end = key.find(delimiter, len(prefix)) + if end >= 0: + common_prefix = key[:end + len(delimiter)] + common_prefixes.add(common_prefix) + else: + non_delimited_uploads.append(upload) + return non_delimited_uploads, sorted(common_prefixes) + + encoding_type = get_param(req, 'encoding-type') + if encoding_type is not None and encoding_type != 'url': + err_msg = 'Invalid Encoding Method specified in Request' + raise InvalidArgument('encoding-type', encoding_type, err_msg) + + keymarker = get_param(req, 'key-marker', '') + uploadid = get_param(req, 'upload-id-marker', '') + maxuploads = req.get_validated_param( + 'max-uploads', DEFAULT_MAX_UPLOADS, DEFAULT_MAX_UPLOADS) + + query = { + 'format': 'json', + 'marker': '', + } + + if uploadid and keymarker: + query.update({'marker': '%s/%s' % (keymarker, uploadid)}) + elif keymarker: + query.update({'marker': '%s/~' % (keymarker)}) + if 'prefix' in req.params: + query.update({'prefix': get_param(req, 'prefix')}) + + container = req.container_name + MULTIUPLOAD_SUFFIX + uploads = [] + prefixes = [] + + def object_to_upload(object_info): + obj, upid = object_info['name'].rsplit('/', 1) + obj_dict = {'key': obj, + 'upload_id': upid, + 'last_modified': object_info['last_modified']} + return obj_dict + + is_segment = re.compile('.*/[0-9]+$') + + while len(uploads) < maxuploads: + try: + resp = req.get_response(self.app, container=container, + query=query) + objects = json.loads(resp.body) + except NoSuchBucket: + # Assume NoSuchBucket as no uploads + objects = [] + if not objects: + break + + new_uploads = [object_to_upload(obj) for obj in objects + if not is_segment.match(obj.get('name', ''))] + new_prefixes = [] + if 'delimiter' in req.params: + prefix = get_param(req, 'prefix', '') + delimiter = get_param(req, 'delimiter') + new_uploads, new_prefixes = separate_uploads( + new_uploads, prefix, delimiter) + uploads.extend(new_uploads) + prefixes.extend(new_prefixes) + query['marker'] = objects[-1]['name'] + + truncated = len(uploads) >= maxuploads + if len(uploads) > maxuploads: + uploads = uploads[:maxuploads] + + nextkeymarker = '' + nextuploadmarker = '' + if len(uploads) > 1: + nextuploadmarker = uploads[-1]['upload_id'] + nextkeymarker = uploads[-1]['key'] + + result_elem = Element('ListMultipartUploadsResult') + SubElement(result_elem, 
'Bucket').text = req.container_name + SubElement(result_elem, 'KeyMarker').text = keymarker + SubElement(result_elem, 'UploadIdMarker').text = uploadid + SubElement(result_elem, 'NextKeyMarker').text = nextkeymarker + SubElement(result_elem, 'NextUploadIdMarker').text = nextuploadmarker + if 'delimiter' in req.params: + SubElement(result_elem, 'Delimiter').text = \ + get_param(req, 'delimiter') + if 'prefix' in req.params: + SubElement(result_elem, 'Prefix').text = get_param(req, 'prefix') + SubElement(result_elem, 'MaxUploads').text = str(maxuploads) + if encoding_type is not None: + SubElement(result_elem, 'EncodingType').text = encoding_type + SubElement(result_elem, 'IsTruncated').text = \ + 'true' if truncated else 'false' + + # TODO: don't show uploads which are initiated before this bucket is + # created. + for u in uploads: + upload_elem = SubElement(result_elem, 'Upload') + name = u['key'] + if encoding_type == 'url': + name = quote(name) + SubElement(upload_elem, 'Key').text = name + SubElement(upload_elem, 'UploadId').text = u['upload_id'] + initiator_elem = SubElement(upload_elem, 'Initiator') + SubElement(initiator_elem, 'ID').text = req.user_id + SubElement(initiator_elem, 'DisplayName').text = req.user_id + owner_elem = SubElement(upload_elem, 'Owner') + SubElement(owner_elem, 'ID').text = req.user_id + SubElement(owner_elem, 'DisplayName').text = req.user_id + SubElement(upload_elem, 'StorageClass').text = 'STANDARD' + SubElement(upload_elem, 'Initiated').text = \ + S3Timestamp.from_isoformat(u['last_modified']).s3xmlformat + + for p in prefixes: + elem = SubElement(result_elem, 'CommonPrefixes') + SubElement(elem, 'Prefix').text = p + + body = tostring(result_elem) + + return HTTPOk(body=body, content_type='application/xml') + + @public + @object_operation + @check_container_existence + def POST(self, req): + """ + Handles Initiate Multipart Upload. + """ + if len(req.object_name) > constraints.MAX_OBJECT_NAME_LENGTH: + # Note that we can still run into trouble where the MPU is just + # within the limit, which means the segment names will go over + raise KeyTooLongError() + + # Create a unique S3 upload id from UUID to avoid duplicates. 
+ upload_id = unique_id() + + seg_container = req.container_name + MULTIUPLOAD_SUFFIX + content_type = req.headers.get('Content-Type') + if content_type: + req.headers[sysmeta_header('object', 'has-content-type')] = 'yes' + req.headers[ + sysmeta_header('object', 'content-type')] = content_type + else: + req.headers[sysmeta_header('object', 'has-content-type')] = 'no' + req.headers['Content-Type'] = 'application/directory' + + try: + seg_req = copy.copy(req) + seg_req.environ = copy.copy(req.environ) + seg_req.container_name = seg_container + seg_req.get_container_info(self.app) + except NoSuchBucket: + try: + # multi-upload bucket doesn't exist, create one with + # same storage policy and acls as the primary bucket + info = req.get_container_info(self.app) + policy_name = POLICIES[info['storage_policy']].name + hdrs = {'X-Storage-Policy': policy_name} + if info.get('read_acl'): + hdrs['X-Container-Read'] = info['read_acl'] + if info.get('write_acl'): + hdrs['X-Container-Write'] = info['write_acl'] + seg_req.get_response(self.app, 'PUT', seg_container, '', + headers=hdrs) + except (BucketAlreadyExists, BucketAlreadyOwnedByYou): + pass + + obj = '%s/%s' % (req.object_name, upload_id) + + req.headers.pop('Etag', None) + req.headers.pop('Content-Md5', None) + + req.get_response(self.app, 'PUT', seg_container, obj, body='') + + result_elem = Element('InitiateMultipartUploadResult') + SubElement(result_elem, 'Bucket').text = req.container_name + SubElement(result_elem, 'Key').text = wsgi_to_str(req.object_name) + SubElement(result_elem, 'UploadId').text = upload_id + + body = tostring(result_elem) + + return HTTPOk(body=body, content_type='application/xml') + + +class UploadController(Controller): + """ + Handles the following APIs: + + * List Parts + * Abort Multipart Upload + * Complete Multipart Upload + + Those APIs are logged as UPLOAD operations in the S3 server log. + """ + @public + @object_operation + @check_container_existence + def GET(self, req): + """ + Handles List Parts. + """ + def filter_part_num_marker(o): + try: + num = int(os.path.basename(o['name'])) + return num > part_num_marker + except ValueError: + return False + + encoding_type = get_param(req, 'encoding-type') + if encoding_type is not None and encoding_type != 'url': + err_msg = 'Invalid Encoding Method specified in Request' + raise InvalidArgument('encoding-type', encoding_type, err_msg) + + upload_id = get_param(req, 'uploadId') + _get_upload_info(req, self.app, upload_id) + + maxparts = req.get_validated_param( + 'max-parts', DEFAULT_MAX_PARTS_LISTING, + self.conf.max_parts_listing) + part_num_marker = req.get_validated_param( + 'part-number-marker', 0) + + object_name = wsgi_to_str(req.object_name) + query = { + 'format': 'json', + 'prefix': '%s/%s/' % (object_name, upload_id), + 'delimiter': '/', + 'marker': '', + } + + container = req.container_name + MULTIUPLOAD_SUFFIX + # Because the parts are out of order in Swift, we list up to the + # maximum number of parts and then apply the marker and limit options. + objects = [] + while True: + resp = req.get_response(self.app, container=container, obj='', + query=query) + new_objects = json.loads(resp.body) + if not new_objects: + break + objects.extend(new_objects) + query['marker'] = new_objects[-1]['name'] + + last_part = 0 + + # If the caller requested a list starting at a specific part number, + # construct a sub-set of the object list. 
+ objList = [obj for obj in objects if filter_part_num_marker(obj)] + + # pylint: disable-msg=E1103 + objList.sort(key=lambda o: int(o['name'].split('/')[-1])) + + if len(objList) > maxparts: + objList = objList[:maxparts] + truncated = True + else: + truncated = False + # TODO: We have to retrieve object list again when truncated is True + # and some objects filtered by invalid name because there could be no + # enough objects for limit defined by maxparts. + + if objList: + o = objList[-1] + last_part = os.path.basename(o['name']) + + result_elem = Element('ListPartsResult') + SubElement(result_elem, 'Bucket').text = req.container_name + if encoding_type == 'url': + object_name = quote(object_name) + SubElement(result_elem, 'Key').text = object_name + SubElement(result_elem, 'UploadId').text = upload_id + + initiator_elem = SubElement(result_elem, 'Initiator') + SubElement(initiator_elem, 'ID').text = req.user_id + SubElement(initiator_elem, 'DisplayName').text = req.user_id + owner_elem = SubElement(result_elem, 'Owner') + SubElement(owner_elem, 'ID').text = req.user_id + SubElement(owner_elem, 'DisplayName').text = req.user_id + + SubElement(result_elem, 'StorageClass').text = 'STANDARD' + SubElement(result_elem, 'PartNumberMarker').text = str(part_num_marker) + SubElement(result_elem, 'NextPartNumberMarker').text = str(last_part) + SubElement(result_elem, 'MaxParts').text = str(maxparts) + if 'encoding-type' in req.params: + SubElement(result_elem, 'EncodingType').text = \ + get_param(req, 'encoding-type') + SubElement(result_elem, 'IsTruncated').text = \ + 'true' if truncated else 'false' + + for i in objList: + part_elem = SubElement(result_elem, 'Part') + SubElement(part_elem, 'PartNumber').text = i['name'].split('/')[-1] + SubElement(part_elem, 'LastModified').text = \ + S3Timestamp.from_isoformat(i['last_modified']).s3xmlformat + SubElement(part_elem, 'ETag').text = '"%s"' % i['hash'] + SubElement(part_elem, 'Size').text = str(i['bytes']) + + body = tostring(result_elem) + + return HTTPOk(body=body, content_type='application/xml') + + @public + @object_operation + @check_container_existence + def DELETE(self, req): + """ + Handles Abort Multipart Upload. + """ + upload_id = get_param(req, 'uploadId') + _get_upload_info(req, self.app, upload_id) + + # First check to see if this multi-part upload was already + # completed. Look in the primary container, if the object exists, + # then it was completed and we return an error here. + container = req.container_name + MULTIUPLOAD_SUFFIX + obj = '%s/%s' % (req.object_name, upload_id) + req.get_response(self.app, container=container, obj=obj) + + # The completed object was not found so this + # must be a multipart upload abort. 
+ # We must delete any uploaded segments for this UploadID and then + # delete the object in the main container as well + object_name = wsgi_to_str(req.object_name) + query = { + 'format': 'json', + 'prefix': '%s/%s/' % (object_name, upload_id), + 'delimiter': '/', + } + + resp = req.get_response(self.app, 'GET', container, '', query=query) + + # Iterate over the segment objects and delete them individually + objects = json.loads(resp.body) + while objects: + for o in objects: + container = req.container_name + MULTIUPLOAD_SUFFIX + obj = bytes_to_wsgi(o['name'].encode('utf-8')) + req.get_response(self.app, container=container, obj=obj) + query['marker'] = objects[-1]['name'] + resp = req.get_response(self.app, 'GET', container, '', + query=query) + objects = json.loads(resp.body) + + return HTTPNoContent() + + @public + @object_operation + @check_container_existence + def POST(self, req): + """ + Handles Complete Multipart Upload. + """ + upload_id = get_param(req, 'uploadId') + # Check for conditional requests before getting upload info so the + # headers can't bleed into the HEAD + if req.headers.get('If-None-Match', '*') != '*' or any( + h in req.headers for h in ( + 'If-Match', 'If-Modified-Since', 'If-Unmodified-Since')): + raise S3NotImplemented( + 'Conditional uploads are not supported.') + + resp, is_marker = _get_upload_info(req, self.app, upload_id) + if (is_marker and + resp.sw_headers.get('X-Backend-Timestamp') >= Timestamp.now()): + # Somehow the marker was created in the future w.r.t. this thread's + # clock. The manifest PUT may succeed but the subsequent marker + # DELETE will fail, so don't attempt either. + raise ServiceUnavailable + + headers = {'Accept': 'application/json', + sysmeta_header('object', 'upload-id'): upload_id} + for key, val in resp.headers.items(): + _key = key.lower() + if _key.startswith('x-amz-meta-'): + headers['x-object-meta-' + _key[11:]] = val + elif _key in ('content-encoding', 'content-language', + 'content-disposition', 'expires', 'cache-control'): + headers[key] = val + + hct_header = sysmeta_header('object', 'has-content-type') + if resp.sysmeta_headers.get(hct_header) == 'yes': + content_type = resp.sysmeta_headers.get( + sysmeta_header('object', 'content-type')) + elif hct_header in resp.sysmeta_headers: + # has-content-type is present but false, so no content type was + # set on initial upload. In that case, we won't set one on our + # PUT request. Swift will end up guessing one based on the + # object name. + content_type = None + else: + content_type = resp.headers.get('Content-Type') + + if content_type: + headers['Content-Type'] = content_type + + container = req.container_name + MULTIUPLOAD_SUFFIX + s3_etag_hasher = md5(usedforsecurity=False) + manifest = [] + previous_number = 0 + try: + xml = req.xml(MAX_COMPLETE_UPLOAD_BODY_SIZE) + if not xml: + raise InvalidRequest(msg='You must specify at least one part') + # If an MD5 was provided, we need to verify it. 
+ if req.check_md5(xml): + # We're only interested in the body here, in the + # multipart-upload controller -- *don't* let etag get + # plumbed down to the object-server + req.headers.pop('etag', None) + + complete_elem = fromstring( + xml, 'CompleteMultipartUpload', self.logger) + for part_elem in complete_elem.iterchildren('Part'): + part_number = int(part_elem.find('./PartNumber').text) + + if part_number <= previous_number: + raise InvalidPartOrder(upload_id=upload_id) + previous_number = part_number + + etag = normalize_etag(part_elem.find('./ETag').text) + if etag is None: + raise InvalidPart(upload_id=upload_id, + part_number=part_number, + e_tag=etag) + if len(etag) != 32 or any(c not in '0123456789abcdef' + for c in etag): + raise InvalidPart(upload_id=upload_id, + part_number=part_number, + e_tag=etag) + manifest.append({ + 'path': '/%s/%s/%s/%d' % ( + wsgi_to_str(container), wsgi_to_str(req.object_name), + upload_id, part_number), + 'etag': etag}) + s3_etag_hasher.update(binascii.a2b_hex(etag)) + except (XMLSyntaxError, DocumentInvalid): + # NB: our schema definitions catch uploads with no parts here + raise MalformedXML() + except ErrorResponse: + raise + except Exception as e: + self.logger.error(e) + raise + + s3_etag = '%s-%d' % (s3_etag_hasher.hexdigest(), len(manifest)) + s3_etag_header = sysmeta_header('object', 'etag') + # This header should only already be present if the upload marker + # has been cleaned up and the current target uses the same upload-id + already_uploaded_s3_etag = resp.sysmeta_headers.get(s3_etag_header) + if already_uploaded_s3_etag == s3_etag: + # If the segments to use haven't changed, the work is already done + return HTTPOk(body=_make_complete_body(req, s3_etag, False), + content_type='application/xml') + elif already_uploaded_s3_etag: + # If the header's present but *doesn't* match, upload-id is + # no longer valid + raise NoSuchUpload(upload_id=upload_id) + headers[s3_etag_header] = s3_etag + # Leave base header value blank; SLO will populate + c_etag = '; s3_etag=%s' % s3_etag + headers[get_container_update_override_key('etag')] = c_etag + + too_small_message = ('s3api requires that each segment be at least ' + '%d bytes' % self.conf.min_segment_size) + + def size_checker(manifest): + # Check the size of each segment except the last and make sure + # they are all more than the minimum upload chunk size. + # Note that we need to use the *internal* keys, since we're + # looking at the manifest that's about to be written. + return [ + (item['name'], too_small_message) + for item in manifest[:-1] + if item and item['bytes'] < self.conf.min_segment_size] + + req.environ['swift.callback.slo_manifest_hook'] = size_checker + start_time = time.time() + + def response_iter(): + # NB: XML requires that the XML declaration, if present, be at the + # very start of the document. Clients *will* call us out on not + # being valid XML if we pass through whitespace before it. 
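The S3-style multipart ETag computed above can be reproduced with a standalone sketch (hashlib.md5 stands in for swift's md5 wrapper; the part ETag used here is just the well-known MD5 of an empty body)::

    import binascii
    from hashlib import md5

    def s3_multipart_etag(part_etags):
        # MD5 over the concatenated binary MD5s of the parts, plus '-<count>'
        hasher = md5()
        for etag in part_etags:
            hasher.update(binascii.a2b_hex(etag))
        return '%s-%d' % (hasher.hexdigest(), len(part_etags))

    s3_multipart_etag(['d41d8cd98f00b204e9800998ecf8427e'] * 3)
    # -> a 32-character hex digest followed by '-3'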
+ # Track whether we've sent anything yet so we can yield out that + # declaration *first* + yielded_anything = False + + try: + try: + # TODO: add support for versioning + put_resp = req.get_response( + self.app, 'PUT', body=json.dumps(manifest), + query={'multipart-manifest': 'put', + 'heartbeat': 'on'}, + headers=headers) + if put_resp.status_int == 202: + body = [] + put_resp.fix_conditional_response() + for chunk in put_resp.response_iter: + if not chunk.strip(): + if time.time() - start_time < 10: + # Include some grace period to keep + # ceph-s3tests happy + continue + if not yielded_anything: + yield (b'\n') + yielded_anything = True + yield chunk + continue + body.append(chunk) + body = json.loads(b''.join(body)) + if body['Response Status'] == \ + '412 Precondition Failed': + raise PreconditionFailed + elif body['Response Status'] != '201 Created': + for seg, err in body['Errors']: + if err == too_small_message: + raise EntityTooSmall() + elif err in ('Etag Mismatch', '404 Not Found'): + raise InvalidPart(upload_id=upload_id) + raise InvalidRequest( + status=body['Response Status'], + msg='\n'.join(': '.join(err) + for err in body['Errors'])) + except InvalidRequest as err_resp: + msg = err_resp._msg + if too_small_message in msg: + raise EntityTooSmall(msg) + elif ', Etag Mismatch' in msg: + raise InvalidPart(upload_id=upload_id) + elif ', 404 Not Found' in msg: + raise InvalidPart(upload_id=upload_id) + else: + raise + + # clean up the multipart-upload record + obj = '%s/%s' % (req.object_name, upload_id) + try: + req.get_response(self.app, 'DELETE', container, obj) + except NoSuchKey: + # The important thing is that we wrote out a tombstone to + # make sure the marker got cleaned up. If it's already + # gone (e.g., because of concurrent completes or a retried + # complete), so much the better. + pass + + yield _make_complete_body(req, s3_etag, yielded_anything) + except ErrorResponse as err_resp: + if yielded_anything: + err_resp.xml_declaration = False + yield b'\n' + else: + # Oh good, we can still change HTTP status code, too! + resp.status = err_resp.status + for chunk in err_resp({}, lambda *a: None): + yield chunk + + resp = HTTPOk() # assume we're good for now... but see above! + resp.app_iter = reiterate(response_iter()) + resp.content_type = "application/xml" + + return resp diff --git a/swift/common/middleware/s3api/controllers/obj.py b/swift/common/middleware/s3api/controllers/obj.py new file mode 100644 index 0000000000..930beb0fe3 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/obj.py @@ -0,0 +1,269 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from io import BytesIO +import json + +from swift.common import constraints +from swift.common.http import HTTP_OK, HTTP_PARTIAL_CONTENT, HTTP_NO_CONTENT +from swift.common.request_helpers import update_etag_is_at_header +from swift.common.swob import Range, content_range_header_value, \ + normalize_etag +from swift.common.utils import public, list_from_csv +from swift.common.registry import get_swift_info + +from swift.common.middleware.versioned_writes.object_versioning import \ + DELETE_MARKER_CONTENT_TYPE +from swift.common.middleware.s3api.utils import S3Timestamp, sysmeta_header +from swift.common.middleware.s3api.controllers.base import Controller +from swift.common.middleware.s3api.s3response import S3NotImplemented, \ + InvalidRange, NoSuchKey, NoSuchVersion, InvalidArgument, HTTPNoContent, \ + PreconditionFailed, KeyTooLongError + + +class ObjectController(Controller): + """ + Handles requests on objects + """ + def _gen_head_range_resp(self, req_range, resp): + """ + Swift doesn't handle Range header for HEAD requests. + So, this method generates HEAD range response from HEAD response. + S3 return HEAD range response, if the value of range satisfies the + conditions which are described in the following document. + - http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + """ + length = int(resp.headers.get('Content-Length')) + + try: + content_range = Range(req_range) + except ValueError: + return resp + + ranges = content_range.ranges_for_length(length) + if ranges == []: + raise InvalidRange() + elif ranges: + if len(ranges) == 1: + start, end = ranges[0] + resp.headers['Content-Range'] = \ + content_range_header_value(start, end, length) + resp.headers['Content-Length'] = (end - start) + resp.status = HTTP_PARTIAL_CONTENT + return resp + else: + # TODO: It is necessary to confirm whether need to respond to + # multi-part response.(e.g. bytes=0-10,20-30) + pass + + return resp + + def GETorHEAD(self, req): + had_match = False + for match_header in ('if-match', 'if-none-match'): + if match_header not in req.headers: + continue + had_match = True + for value in list_from_csv(req.headers[match_header]): + value = normalize_etag(value) + if value.endswith('-N'): + # Deal with fake S3-like etags for SLOs uploaded via Swift + req.headers[match_header] += ', ' + value[:-2] + + if had_match: + # Update where to look + update_etag_is_at_header(req, sysmeta_header('object', 'etag')) + + object_name = req.object_name + version_id = req.params.get('versionId') + if version_id not in ('null', None) and \ + 'object_versioning' not in get_swift_info(): + raise S3NotImplemented() + part_number = req.validate_part_number(check_max=False) + + query = {} + if version_id is not None: + query['version-id'] = version_id + if part_number is not None: + query['part-number'] = part_number + + if version_id not in ('null', None): + container_info = req.get_container_info(self.app) + if not container_info.get( + 'sysmeta', {}).get('versions-container', ''): + # Versioning has never been enabled + raise NoSuchVersion(object_name, version_id) + + resp = req.get_response(self.app, query=query) + + if not resp.is_slo: + # SLO ignores part_number for non-slo objects, but s3api only + # allows the query param for non-MPU if it's exactly 1. + part_number = req.validate_part_number(parts_count=1) + if part_number == 1: + # When the query param *is* exactly 1 the response status code + # and headers are updated. 
+ resp.status = HTTP_PARTIAL_CONTENT + resp.headers['Content-Range'] = \ + 'bytes 0-%d/%s' % (int(resp.headers['Content-Length']) - 1, + resp.headers['Content-Length']) + # else: part_number is None + + if req.method == 'HEAD': + resp.app_iter = None + + if 'x-amz-meta-deleted' in resp.headers: + raise NoSuchKey(object_name) + + for key in ('content-type', 'content-language', 'expires', + 'cache-control', 'content-disposition', + 'content-encoding'): + if 'response-' + key in req.params: + resp.headers[key] = req.params['response-' + key] + + return resp + + @public + def HEAD(self, req): + """ + Handle HEAD Object request + """ + resp = self.GETorHEAD(req) + + if 'range' in req.headers: + req_range = req.headers['range'] + resp = self._gen_head_range_resp(req_range, resp) + + return resp + + @public + def GET(self, req): + """ + Handle GET Object request + """ + return self.GETorHEAD(req) + + @public + def PUT(self, req): + """ + Handle PUT Object and PUT Object (Copy) request + """ + if len(req.object_name) > constraints.MAX_OBJECT_NAME_LENGTH: + raise KeyTooLongError() + # set X-Timestamp by s3api to use at copy resp body + req_timestamp = S3Timestamp.now() + req.headers['X-Timestamp'] = req_timestamp.internal + if all(h in req.headers + for h in ('X-Amz-Copy-Source', 'X-Amz-Copy-Source-Range')): + raise InvalidArgument('x-amz-copy-source-range', + req.headers['X-Amz-Copy-Source-Range'], + 'Illegal copy header') + req.check_copy_source(self.app) + if not req.headers.get('Content-Type'): + # can't setdefault because it can be None for some reason + req.headers['Content-Type'] = 'binary/octet-stream' + resp = req.get_response(self.app) + + if 'X-Amz-Copy-Source' in req.headers: + resp.append_copy_resp_body(req.controller_name, + req_timestamp.s3xmlformat) + # delete object metadata from response + for key in list(resp.headers.keys()): + if key.lower().startswith('x-amz-meta-'): + del resp.headers[key] + + resp.status = HTTP_OK + return resp + + @public + def POST(self, req): + raise S3NotImplemented() + + def _restore_on_delete(self, req): + resp = req.get_response(self.app, 'GET', req.container_name, '', + query={'prefix': req.object_name, + 'versions': True}) + if resp.status_int != HTTP_OK: + return resp + old_versions = json.loads(resp.body) + resp = None + for item in old_versions: + if item['content_type'] == DELETE_MARKER_CONTENT_TYPE: + resp = None + break + try: + resp = req.get_response(self.app, 'PUT', query={ + 'version-id': item['version_id']}) + except PreconditionFailed: + self.logger.debug('skipping failed PUT?version-id=%s' % + item['version_id']) + continue + # if that worked, we'll go ahead and fix up the status code + resp.status_int = HTTP_NO_CONTENT + break + return resp + + @public + def DELETE(self, req): + """ + Handle DELETE Object request + """ + if 'versionId' in req.params and \ + req.params['versionId'] != 'null' and \ + 'object_versioning' not in get_swift_info(): + raise S3NotImplemented() + + version_id = req.params.get('versionId') + if version_id not in ('null', None): + container_info = req.get_container_info(self.app) + if not container_info.get( + 'sysmeta', {}).get('versions-container', ''): + # Versioning has never been enabled + return HTTPNoContent(headers={'x-amz-version-id': version_id}) + + try: + try: + query = req.gen_multipart_manifest_delete_query( + self.app, version=version_id) + except NoSuchKey: + query = {} + + req.headers['Content-Type'] = None # Ignore client content-type + + if version_id is not None: + query['version-id'] = 
version_id + query['symlink'] = 'get' + + resp = req.get_response(self.app, query=query) + # If we're going to continue using this request, we need to + # replace the now-spent body + req.environ['wsgi.input'] = BytesIO(b'') + req.headers['content-length'] = '0' + req.headers.pop('transfer-encoding', None) + if query.get('multipart-manifest') and resp.status_int == HTTP_OK: + for chunk in resp.app_iter: + pass # drain the bulk-deleter response + resp.status = HTTP_NO_CONTENT + resp.body = b'' + if resp.sw_headers.get('X-Object-Current-Version-Id') == 'null': + new_resp = self._restore_on_delete(req) + if new_resp: + resp = new_resp + except NoSuchKey: + # expect to raise NoSuchBucket when the bucket doesn't exist + req.get_container_info(self.app) + # else -- it's gone! Success. + return HTTPNoContent() + return resp diff --git a/swift/common/middleware/s3api/controllers/object_lock.py b/swift/common/middleware/s3api/controllers/object_lock.py new file mode 100644 index 0000000000..69a5295727 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/object_lock.py @@ -0,0 +1,44 @@ +# Copyright (c) 2010-2023 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.utils import public + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation, S3NotImplemented +from swift.common.middleware.s3api.s3response import \ + ObjectLockConfigurationNotFoundError + + +class ObjectLockController(Controller): + """ + Handles GET object-lock request, which always returns + Disabled + """ + @public + @bucket_operation + def GET(self, req): + """ + Handles GET object-lock param calls. + """ + raise ObjectLockConfigurationNotFoundError(req.container_name) + + @public + @bucket_operation + def PUT(self, req): + """ + Handles PUT object-lock param calls. + """ + # Basically we don't support it, so return a 501 + raise S3NotImplemented('The requested resource is not implemented') diff --git a/swift/common/middleware/s3api/controllers/s3_acl.py b/swift/common/middleware/s3api/controllers/s3_acl.py new file mode 100644 index 0000000000..ddd7fbe3d2 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/s3_acl.py @@ -0,0 +1,67 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from urllib.parse import quote +from swift.common.utils import public + +from swift.common.middleware.s3api.controllers.base import Controller +from swift.common.middleware.s3api.s3response import HTTPOk +from swift.common.middleware.s3api.etree import tostring + + +class S3AclController(Controller): + """ + Handles the following APIs: + + * GET Bucket acl + * PUT Bucket acl + * GET Object acl + * PUT Object acl + + Those APIs are logged as ACL operations in the S3 server log. + """ + @public + def GET(self, req): + """ + Handles GET Bucket acl and GET Object acl. + """ + resp = req.get_response(self.app, method='HEAD') + + acl = resp.object_acl if req.is_object_request else resp.bucket_acl + + resp = HTTPOk() + resp.body = tostring(acl.elem()) + + return resp + + @public + def PUT(self, req): + """ + Handles PUT Bucket acl and PUT Object acl. + """ + if req.is_object_request: + headers = {} + src_path = '/%s/%s' % (req.container_name, req.object_name) + + # object-sysmeta' can be updated by 'Copy' method, + # but can not be by 'POST' method. + # So headers['X-Copy-From'] for copy request is added here. + headers['X-Copy-From'] = quote(src_path) + headers['Content-Length'] = 0 + req.get_response(self.app, 'PUT', headers=headers) + else: + req.get_response(self.app, 'POST') + + return HTTPOk() diff --git a/swift/common/middleware/s3api/controllers/service.py b/swift/common/middleware/s3api/controllers/service.py new file mode 100644 index 0000000000..1a6564eb88 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/service.py @@ -0,0 +1,70 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.swob import bytes_to_wsgi +from swift.common.utils import json, public + +from swift.common.middleware.s3api.controllers.base import Controller +from swift.common.middleware.s3api.etree import Element, SubElement, tostring +from swift.common.middleware.s3api.s3response import HTTPOk, AccessDenied, \ + NoSuchBucket +from swift.common.middleware.s3api.utils import validate_bucket_name + + +class ServiceController(Controller): + """ + Handles account level requests. + """ + @public + def GET(self, req): + """ + Handle GET Service request + """ + resp = req.get_response(self.app, query={'format': 'json'}) + + containers = json.loads(resp.body) + + containers = filter( + lambda item: validate_bucket_name( + item['name'], self.conf.dns_compliant_bucket_names), + containers) + + # we don't keep the creation time of a bucket (s3cmd doesn't + # work without that) so we use something bogus. 
+ elem = Element('ListAllMyBucketsResult') + + owner = SubElement(elem, 'Owner') + SubElement(owner, 'ID').text = req.user_id + SubElement(owner, 'DisplayName').text = req.user_id + + buckets = SubElement(elem, 'Buckets') + for c in containers: + if self.conf.s3_acl and self.conf.check_bucket_owner: + container = bytes_to_wsgi(c['name'].encode('utf8')) + try: + req.get_response(self.app, 'HEAD', container) + except AccessDenied: + continue + except NoSuchBucket: + continue + + bucket = SubElement(buckets, 'Bucket') + SubElement(bucket, 'Name').text = c['name'] + SubElement(bucket, 'CreationDate').text = \ + '2009-02-03T16:45:09.000Z' + + body = tostring(elem) + + return HTTPOk(content_type='application/xml', body=body) diff --git a/swift/common/middleware/s3api/controllers/tagging.py b/swift/common/middleware/s3api/controllers/tagging.py new file mode 100644 index 0000000000..ca7bc853d2 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/tagging.py @@ -0,0 +1,57 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.common.utils import public + +from swift.common.middleware.s3api.controllers.base import Controller, \ + S3NotImplemented +from swift.common.middleware.s3api.s3response import HTTPOk +from swift.common.middleware.s3api.etree import Element, tostring, \ + SubElement + + +class TaggingController(Controller): + """ + Handles the following APIs: + + * GET Bucket and Object tagging + * PUT Bucket and Object tagging + * DELETE Bucket and Object tagging + + """ + @public + def GET(self, req): + """ + Handles GET Bucket and Object tagging. + """ + elem = Element('Tagging') + SubElement(elem, 'TagSet') + body = tostring(elem) + + return HTTPOk(body=body, content_type=None) + + @public + def PUT(self, req): + """ + Handles PUT Bucket and Object tagging. + """ + raise S3NotImplemented('The requested resource is not implemented') + + @public + def DELETE(self, req): + """ + Handles DELETE Bucket and Object tagging. + """ + raise S3NotImplemented('The requested resource is not implemented') diff --git a/swift/common/middleware/s3api/controllers/versioning.py b/swift/common/middleware/s3api/controllers/versioning.py new file mode 100644 index 0000000000..2d31d2af50 --- /dev/null +++ b/swift/common/middleware/s3api/controllers/versioning.py @@ -0,0 +1,82 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from swift.common.utils import public, config_true_value +from swift.common.registry import get_swift_info + +from swift.common.middleware.s3api.controllers.base import Controller, \ + bucket_operation +from swift.common.middleware.s3api.etree import Element, tostring, \ + fromstring, XMLSyntaxError, DocumentInvalid, SubElement +from swift.common.middleware.s3api.s3response import HTTPOk, \ + S3NotImplemented, MalformedXML + +MAX_PUT_VERSIONING_BODY_SIZE = 10240 + + +class VersioningController(Controller): + """ + Handles the following APIs: + + * GET Bucket versioning + * PUT Bucket versioning + + Those APIs are logged as VERSIONING operations in the S3 server log. + """ + @public + @bucket_operation + def GET(self, req): + """ + Handles GET Bucket versioning. + """ + sysmeta = req.get_container_info(self.app).get('sysmeta', {}) + + elem = Element('VersioningConfiguration') + if sysmeta.get('versions-enabled'): + SubElement(elem, 'Status').text = ( + 'Enabled' if config_true_value(sysmeta['versions-enabled']) + else 'Suspended') + body = tostring(elem) + + return HTTPOk(body=body, content_type=None) + + @public + @bucket_operation + def PUT(self, req): + """ + Handles PUT Bucket versioning. + """ + if 'object_versioning' not in get_swift_info(): + raise S3NotImplemented() + + xml = req.xml(MAX_PUT_VERSIONING_BODY_SIZE) + try: + elem = fromstring(xml, 'VersioningConfiguration') + status = elem.find('./Status').text + except (XMLSyntaxError, DocumentInvalid): + raise MalformedXML() + except Exception as e: + self.logger.error(e) + raise + + if status not in ['Enabled', 'Suspended']: + raise MalformedXML() + + # Set up versioning + # NB: object_versioning responsible for ensuring its container exists + req.headers['X-Versions-Enabled'] = str(status == 'Enabled').lower() + req.get_response(self.app, 'POST') + + return HTTPOk() diff --git a/swift/common/middleware/s3api/etree.py b/swift/common/middleware/s3api/etree.py new file mode 100644 index 0000000000..e5d4112b8f --- /dev/null +++ b/swift/common/middleware/s3api/etree.py @@ -0,0 +1,150 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
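The body accepted by the PUT Bucket versioning handler above is the standard S3 versioning document; a Status other than Enabled or Suspended is rejected as MalformedXML::

    <VersioningConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
      <Status>Enabled</Status>
    </VersioningConfiguration>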
+ +import lxml.etree +from copy import deepcopy +try: + # importlib.resources was introduced in py37, but couldn't handle + # resources in subdirectories (which we use); files() added support + from importlib.resources import files + del files +except ImportError: + # python < 3.9 + from pkg_resources import resource_stream # pylint: disable-msg=E0611 +else: + import importlib.resources + resource_stream = None + +from swift.common.utils import get_logger +from swift.common.middleware.s3api.exception import S3Exception +from swift.common.middleware.s3api.utils import camel_to_snake, \ + utf8decode + +XMLNS_S3 = 'http://s3.amazonaws.com/doc/2006-03-01/' +XMLNS_XSI = 'http://www.w3.org/2001/XMLSchema-instance' + + +class XMLSyntaxError(S3Exception): + pass + + +class DocumentInvalid(S3Exception): + pass + + +def cleanup_namespaces(elem): + def remove_ns(tag, ns): + if tag.startswith('{%s}' % ns): + tag = tag[len('{%s}' % ns):] + return tag + + if not isinstance(elem.tag, str): + # elem is a comment element. + return + + # remove s3 namespace + elem.tag = remove_ns(elem.tag, XMLNS_S3) + + # remove default namespace + if elem.nsmap and None in elem.nsmap: + elem.tag = remove_ns(elem.tag, elem.nsmap[None]) + + for e in elem.iterchildren(): + cleanup_namespaces(e) + + +def fromstring(text, root_tag=None, logger=None): + try: + elem = lxml.etree.fromstring(text, parser) + except lxml.etree.XMLSyntaxError as e: + if logger: + logger.debug(e) + raise XMLSyntaxError(e) + + cleanup_namespaces(elem) + + if root_tag is not None: + # validate XML + try: + path = 'schema/%s.rng' % camel_to_snake(root_tag) + if resource_stream: + # python < 3.9 + stream = resource_stream(__name__, path) + else: + stream = importlib.resources.files( + __name__.rsplit('.', 1)[0]).joinpath(path).open('rb') + with stream as rng: + lxml.etree.RelaxNG(file=rng).assertValid(elem) + except IOError as e: + # Probably, the schema file doesn't exist. + logger = logger or get_logger({}, log_route='s3api') + logger.error(e) + raise + except lxml.etree.DocumentInvalid as e: + if logger: + logger.debug(e) + raise DocumentInvalid(e) + + return elem + + +def tostring(tree, use_s3ns=True, xml_declaration=True): + if use_s3ns: + nsmap = tree.nsmap.copy() + nsmap[None] = XMLNS_S3 + + root = Element(tree.tag, attrib=tree.attrib, nsmap=nsmap) + root.text = tree.text + root.extend(deepcopy(list(tree))) + tree = root + + return lxml.etree.tostring(tree, xml_declaration=xml_declaration, + encoding='UTF-8') + + +class _Element(lxml.etree.ElementBase): + """ + Wrapper Element class of lxml.etree.Element to support + a utf-8 encoded non-ascii string as a text. + + Why we need this?: + Original lxml.etree.Element supports only unicode for the text. + It declines maintainability because we have to call a lot of encode/decode + methods to apply account/container/object name (i.e. PATH_INFO) to each + Element instance. When using this class, we can remove such a redundant + codes from swift.common.middleware.s3api middleware. 
+ """ + def __init__(self, *args, **kwargs): + # pylint: disable-msg=E1002 + super(_Element, self).__init__(*args, **kwargs) + + @property + def text(self): + """ + utf-8 wrapper property of lxml.etree.Element.text + """ + return lxml.etree.ElementBase.text.__get__(self) + + @text.setter + def text(self, value): + lxml.etree.ElementBase.text.__set__(self, utf8decode(value)) + + +parser_lookup = lxml.etree.ElementDefaultClassLookup(element=_Element) +parser = lxml.etree.XMLParser(resolve_entities=False, no_network=True) +parser.set_element_class_lookup(parser_lookup) + +Element = parser.makeelement +SubElement = lxml.etree.SubElement diff --git a/swift/common/middleware/s3api/exception.py b/swift/common/middleware/s3api/exception.py new file mode 100644 index 0000000000..c12ee3f3ac --- /dev/null +++ b/swift/common/middleware/s3api/exception.py @@ -0,0 +1,122 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class S3Exception(Exception): + pass + + +class NotS3Request(S3Exception): + pass + + +class ACLError(S3Exception): + pass + + +class InvalidBucketNameParseError(S3Exception): + + def __init__(self, bucket): + self.bucket_name = bucket + + +class InvalidURIParseError(S3Exception): + + def __init__(self, uri): + self.uri = uri + + +class InvalidSubresource(S3Exception): + def __init__(self, resource, cause): + self.resource = resource + self.cause = cause + + +class S3InputError(BaseException): + """ + There was an error with the client input detected on read(). + + Inherit from BaseException (rather than Exception) so it cuts from the + proxy-server app (which will presumably be the one reading the input) + through all the layers of the pipeline back to s3api. It should never + escape the s3api middleware. + """ + + +class S3InputIncomplete(S3InputError): + pass + + +class S3InputSizeError(S3InputError): + def __init__(self, expected, provided): + self.expected = expected + self.provided = provided + + +class S3InputChunkTooSmall(S3InputError): + def __init__(self, bad_chunk_size, chunk_number): + self.bad_chunk_size = bad_chunk_size + self.chunk_number = chunk_number + + +class S3InputMalformedTrailer(S3InputError): + pass + + +class S3InputChunkSignatureMismatch(S3InputError): + """ + Client provided a chunk-signature, but it doesn't match the data. + + This should result in a 403 going back to the client. + """ + + +class S3InputMissingSecret(S3InputError): + """ + Client provided per-chunk signatures, but we have no secret with which to + verify them. + + This happens if the auth middleware responsible for the user never called + the provided ``check_signature`` callback. + """ + + +class S3InputSHA256Mismatch(S3InputError): + """ + Client provided a X-Amz-Content-SHA256, but it doesn't match the data. + + This should result in a BadDigest going back to the client. 
+ """ + def __init__(self, expected, computed): + self.expected = expected + self.computed = computed + + +class S3InputChecksumMismatch(S3InputError): + """ + Client provided a X-Amz-Checksum-* header, but it doesn't match the data. + + This should result in a InvalidRequest going back to the client. + """ + + +class S3InputChecksumTrailerInvalid(S3InputError): + """ + Client provided a X-Amz-Checksum-* trailer, but it is not a valid format. + + This should result in a InvalidRequest going back to the client. + """ + def __init__(self, trailer_name): + self.trailer = trailer_name diff --git a/swift/common/middleware/s3api/s3api.py b/swift/common/middleware/s3api/s3api.py new file mode 100644 index 0000000000..bd928049e0 --- /dev/null +++ b/swift/common/middleware/s3api/s3api.py @@ -0,0 +1,600 @@ +# Copyright (c) 2010-2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The s3api middleware will emulate the S3 REST api on top of swift. + +To enable this middleware to your configuration, add the s3api middleware +in front of the auth middleware. See ``proxy-server.conf-sample`` for more +detail and configurable options. + +To set up your client, ensure you are using the tempauth or keystone auth +system for swift project. +When your swift on a SAIO environment, make sure you have setting the tempauth +middleware configuration in ``proxy-server.conf``, and the access key will be +the concatenation of the account and user strings that should look like +test:tester, and the secret access key is the account password. The host should +also point to the swift storage hostname. + +The tempauth option example: + +.. code-block:: ini + + [filter:tempauth] + use = egg:swift#tempauth + user_admin_admin = admin .admin .reseller_admin + user_test_tester = testing + +An example client using tempauth with the python boto library is as follows: + +.. code-block:: python + + from boto.s3.connection import S3Connection + connection = S3Connection( + aws_access_key_id='test:tester', + aws_secret_access_key='testing', + port=8080, + host='127.0.0.1', + is_secure=False, + calling_format=boto.s3.connection.OrdinaryCallingFormat()) + +And if you using keystone auth, you need the ec2 credentials, which can +be downloaded from the API Endpoints tab of the dashboard or by openstack +ec2 command. + +Here is showing to create an EC2 credential: + +.. code-block:: console + + # openstack ec2 credentials create + +------------+---------------------------------------------------+ + | Field | Value | + +------------+---------------------------------------------------+ + | access | c2e30f2cd5204b69a39b3f1130ca8f61 | + | links | {u'self': u'http://controller:5000/v3/......'} | + | project_id | 407731a6c2d0425c86d1e7f12a900488 | + | secret | baab242d192a4cd6b68696863e07ed59 | + | trust_id | None | + | user_id | 00f0ee06afe74f81b410f3fe03d34fbc | + +------------+---------------------------------------------------+ + +An example client using keystone auth with the python boto library will be: + +.. 
code-block:: python + + from boto.s3.connection import S3Connection + connection = S3Connection( + aws_access_key_id='c2e30f2cd5204b69a39b3f1130ca8f61', + aws_secret_access_key='baab242d192a4cd6b68696863e07ed59', + port=8080, + host='127.0.0.1', + is_secure=False, + calling_format=boto.s3.connection.OrdinaryCallingFormat()) + +---------- +Deployment +---------- + +Proxy-Server Setting +^^^^^^^^^^^^^^^^^^^^ + +Set s3api before your auth in your pipeline in ``proxy-server.conf`` file. +To enable all compatibility currently supported, you should make sure that +bulk, slo, and your auth middleware are also included in your proxy +pipeline setting. + +Using tempauth, the minimum example config is: + +.. code-block:: ini + + [pipeline:main] + pipeline = proxy-logging cache s3api tempauth bulk slo proxy-logging \ +proxy-server + +When using keystone, the config will be: + +.. code-block:: ini + + [pipeline:main] + pipeline = proxy-logging cache authtoken s3api s3token keystoneauth bulk \ +slo proxy-logging proxy-server + +Finally, add the s3api middleware section: + +.. code-block:: ini + + [filter:s3api] + use = egg:swift#s3api + +.. note:: + ``keystonemiddleware.authtoken`` can be located before/after s3api but + we recommend to put it before s3api because when authtoken is after s3api, + both authtoken and s3token will issue the acceptable token to keystone + (i.e. authenticate twice). And in the ``keystonemiddleware.authtoken`` + middleware , you should set ``delay_auth_decision`` option to ``True``. + +----------- +Constraints +----------- +Currently, the s3api is being ported from https://github.com/openstack/swift3 +so any existing issues in swift3 are still remaining. Please make sure +descriptions in the example ``proxy-server.conf`` and what happens with the +config, before enabling the options. + +------------- +Supported API +------------- +The compatibility will continue to be improved upstream, you can keep and +eye on compatibility via a check tool build by SwiftStack. See +https://github.com/swiftstack/s3compat in detail. 
+ +""" + +import json +from paste.deploy import loadwsgi +from urllib.parse import parse_qs + +from swift.common import swob +from swift.common.constraints import valid_api_version +from swift.common.middleware.listing_formats import \ + MAX_CONTAINER_LISTING_CONTENT_LENGTH +from swift.common.request_helpers import append_log_info +from swift.common.wsgi import PipelineWrapper, loadcontext, WSGIContext +from swift.common.statsd_client import get_labeled_statsd_client + +from swift.common.middleware import app_property +from swift.common.middleware.s3api.exception import NotS3Request, \ + InvalidSubresource +from swift.common.middleware.s3api import s3request +from swift.common.middleware.s3api.s3response import ErrorResponse, \ + InternalError, MethodNotAllowed, S3ResponseBase, S3NotImplemented +from swift.common.utils import get_logger, config_true_value, \ + config_positive_int_value, split_path, closing_if_possible, \ + list_from_csv, parse_header, checksum +from swift.common.middleware.s3api.utils import Config, \ + classify_checksum_header_value, make_header_label +from swift.common.middleware.s3api.acl_handlers import get_acl_handler +from swift.common.registry import register_swift_info, \ + register_sensitive_header, register_sensitive_param + + +# https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-auth-using-authorization-header.html +WELL_KNOWN_SPECIFIC_SHA256_VALUES = ( + 'UNSIGNED-PAYLOAD', + 'STREAMING-UNSIGNED-PAYLOAD-TRAILER', + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD', + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER', + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD', + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD-TRAILER' +) +# https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html +# https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html#AmazonS3-Type-Object-ChecksumAlgorithm +# https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html +# docs are unclear whether the header value is the (un-)hyphenated form + +# algorithms for x-amz-checksum-algorithm/ x-amz-sdk-checksum-algorithm +WELL_KNOWN_CHECKSUM_ALGORITHMS = ( + 'CRC64NVME', + 'CRC32', + 'CRC32C', + 'SHA1', + 'SHA256' +) +WELL_KNOWN_CHECKSUM_HEADERS = ( + 'x-amz-checksum-crc32', + 'x-amz-checksum-crc32c', + 'x-amz-checksum-sha1', + 'x-amz-checksum-sha256', + 'x-amz-checksum-crc64nvme' +) + + +class ListingEtagMiddleware(object): + def __init__(self, app): + self.app = app + + # Pass these along so get_container_info will have the configured + # odds to skip cache + _pipeline_final_app = app_property('_pipeline_final_app') + _pipeline_request_logging_app = app_property( + '_pipeline_request_logging_app') + + def __call__(self, env, start_response): + # a lot of this is cribbed from listing_formats / swob.Request + if env['REQUEST_METHOD'] != 'GET': + # Nothing to translate + return self.app(env, start_response) + + try: + v, a, c = split_path(env.get('SCRIPT_NAME', '') + + env['PATH_INFO'], 3, 3) + if not valid_api_version(v): + raise ValueError + except ValueError: + is_container_req = False + else: + is_container_req = True + if not is_container_req: + # pass through + return self.app(env, start_response) + + ctx = WSGIContext(self.app) + resp_iter = ctx._app_call(env) + + content_type = content_length = cl_index = None + for index, (header, value) in enumerate(ctx._response_headers): + header = header.lower() + if header == 'content-type': + content_type = value.split(';', 1)[0].strip() + if content_length: + break + elif header == 'content-length': + cl_index = index + try: + 
content_length = int(value) + except ValueError: + pass # ignore -- we'll bail later + if content_type: + break + + if content_type != 'application/json' or content_length is None or \ + content_length > MAX_CONTAINER_LISTING_CONTENT_LENGTH: + start_response(ctx._response_status, ctx._response_headers, + ctx._response_exc_info) + return resp_iter + + # We've done our sanity checks, slurp the response into memory + with closing_if_possible(resp_iter): + body = b''.join(resp_iter) + + try: + listing = json.loads(body) + for item in listing: + if 'subdir' in item: + continue + value, params = parse_header(item['hash']) + if 's3_etag' in params: + item['s3_etag'] = '"%s"' % params.pop('s3_etag') + item['hash'] = value + ''.join( + '; %s=%s' % kv for kv in params.items()) + except (TypeError, KeyError, ValueError): + # If anything goes wrong above, drop back to original response + start_response(ctx._response_status, ctx._response_headers, + ctx._response_exc_info) + return [body] + + body = json.dumps(listing).encode('ascii') + ctx._response_headers[cl_index] = ( + ctx._response_headers[cl_index][0], + str(len(body)), + ) + start_response(ctx._response_status, ctx._response_headers, + ctx._response_exc_info) + return [body] + + +class S3ApiMiddleware(object): + """S3Api: S3 compatibility middleware""" + def __init__(self, app, wsgi_conf, *args, **kwargs): + self.app = app + self.conf = Config() + + # Set default values if they are not configured + self.conf.allow_no_owner = config_true_value( + wsgi_conf.get('allow_no_owner', False)) + self.conf.location = wsgi_conf.get('location', 'us-east-1') + self.conf.dns_compliant_bucket_names = config_true_value( + wsgi_conf.get('dns_compliant_bucket_names', True)) + self.conf.max_bucket_listing = config_positive_int_value( + wsgi_conf.get('max_bucket_listing', 1000)) + self.conf.max_parts_listing = config_positive_int_value( + wsgi_conf.get('max_parts_listing', 1000)) + self.conf.max_multi_delete_objects = config_positive_int_value( + wsgi_conf.get('max_multi_delete_objects', 1000)) + self.conf.multi_delete_concurrency = config_positive_int_value( + wsgi_conf.get('multi_delete_concurrency', 2)) + self.conf.s3_acl = config_true_value( + wsgi_conf.get('s3_acl', False)) + self.conf.storage_domains = list_from_csv( + wsgi_conf.get('storage_domain', '')) + self.conf.auth_pipeline_check = config_true_value( + wsgi_conf.get('auth_pipeline_check', True)) + self.conf.max_upload_part_num = config_positive_int_value( + wsgi_conf.get('max_upload_part_num', 1000)) + self.conf.check_bucket_owner = config_true_value( + wsgi_conf.get('check_bucket_owner', False)) + self.conf.force_swift_request_proxy_log = config_true_value( + wsgi_conf.get('force_swift_request_proxy_log', False)) + self.conf.allow_multipart_uploads = config_true_value( + wsgi_conf.get('allow_multipart_uploads', True)) + self.conf.min_segment_size = config_positive_int_value( + wsgi_conf.get('min_segment_size', 5242880)) + self.conf.allowable_clock_skew = config_positive_int_value( + wsgi_conf.get('allowable_clock_skew', 15 * 60)) + self.conf.cors_preflight_allow_origin = list_from_csv(wsgi_conf.get( + 'cors_preflight_allow_origin', '')) + if '*' in self.conf.cors_preflight_allow_origin and \ + len(self.conf.cors_preflight_allow_origin) > 1: + raise ValueError('if cors_preflight_allow_origin should include ' + 'all domains, * must be the only entry') + self.conf.ratelimit_as_client_error = config_true_value( + wsgi_conf.get('ratelimit_as_client_error', False)) + + self.logger = get_logger( + 
wsgi_conf, log_route='s3api', statsd_tail_prefix='s3api') + self.statsd = get_labeled_statsd_client(wsgi_conf, self.logger) + + self.check_pipeline(wsgi_conf) + checksum.log_selected_implementation(self.logger) + + def is_s3_cors_preflight(self, env): + if env['REQUEST_METHOD'] != 'OPTIONS' or not env.get('HTTP_ORIGIN'): + # Not a CORS preflight + return False + acrh = env.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS', '').lower() + if 'authorization' in acrh and \ + not env['PATH_INFO'].startswith(('/v1/', '/v1.0/')): + return True + q = parse_qs(env.get('QUERY_STRING', '')) + if 'AWSAccessKeyId' in q or 'X-Amz-Credential' in q: + return True + # Not S3, apparently + return False + + def _make_req_header_labels(self, env): + req_headers = swob.HeaderEnvironProxy(env) + labels = {} + for hdr_key, hdr_val in req_headers.items(): + label_val = None + hdr_key = hdr_key.lower() + label_key = make_header_label(hdr_key) + if hdr_key == 'content-encoding': + if 'aws-chunked' in list_from_csv(hdr_val.lower()): + label_val = 'aws-chunked' + elif hdr_key == 'transfer-encoding': + if 'chunked' in list_from_csv(hdr_val.lower()): + label_val = 'chunked' + elif hdr_key == 'x-amz-decoded-content-length': + label_val = True + elif hdr_key == 'x-amz-content-sha256': + if hdr_val in WELL_KNOWN_SPECIFIC_SHA256_VALUES: + label_val = hdr_val + else: + label_val = classify_checksum_header_value(hdr_val) + elif hdr_key == 'content-md5': + label_val = classify_checksum_header_value(hdr_val) + elif hdr_key in s3request.CHECKSUMS_BY_HEADER.keys(): + label_val = classify_checksum_header_value(hdr_val) + elif hdr_key == 'x-amz-trailer': + if hdr_val.lower() in s3request.CHECKSUMS_BY_HEADER.keys(): + label_val = hdr_val.lower() + else: + label_val = 'unknown' + elif hdr_key in ('x-amz-checksum-algorithm', + 'x-amz-sdk-checksum-algorithm'): + hdr_val_normalised = hdr_val.upper().replace('-', '') + if hdr_val_normalised in WELL_KNOWN_CHECKSUM_ALGORITHMS: + label_val = hdr_val_normalised + else: + label_val = 'unknown' + + if label_val is not None: + labels[label_key] = label_val + + return labels + + def _emit_response_header_stats(self, env, resp, labels): + if not labels: + return + + labels['status'] = resp.status_int + labels['method'] = env.get('REQUEST_METHOD') + swift_path = env.get('swift.backend_path') + if swift_path: + vers, acc, con, obj = split_path(swift_path, 1, 4, True) + if obj: + labels['type'] = 'object' + labels['account'] = acc + labels['container'] = con + elif con: + labels['type'] = 'container' + labels['account'] = acc + labels['container'] = con + elif acc: + labels['account'] = acc + labels['type'] = 'account' + else: + labels['type'] = 'UNKNOWN' + else: + labels['type'] = 'UNKNOWN' + + self.statsd.increment("swift_s3_checksum_algo_request", labels=labels) + + def __call__(self, env, start_response): + # get metrics header labels before any mutation of the headers + req_header_labels = self._make_req_header_labels(env) + origin = env.get('HTTP_ORIGIN') + if self.conf.cors_preflight_allow_origin and \ + self.is_s3_cors_preflight(env): + # I guess it's likely going to be an S3 request? 
*shrug* + if self.conf.cors_preflight_allow_origin != ['*'] and \ + origin not in self.conf.cors_preflight_allow_origin: + start_response('401 Unauthorized', [ + ('Allow', 'GET, HEAD, PUT, POST, DELETE, OPTIONS'), + ]) + return [b''] + + headers = [ + ('Allow', 'GET, HEAD, PUT, POST, DELETE, OPTIONS'), + ('Access-Control-Allow-Origin', origin), + ('Access-Control-Allow-Methods', + 'GET, HEAD, PUT, POST, DELETE, OPTIONS'), + ('Vary', 'Origin, Access-Control-Request-Headers'), + ] + acrh = set(list_from_csv( + env.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS', '').lower())) + if acrh: + headers.append(( + 'Access-Control-Allow-Headers', + ', '.join(acrh))) + + start_response('200 OK', headers) + return [b''] + + try: + req_class = s3request.get_request_class(env, self.conf.s3_acl) + req = req_class(env, self.app, self.conf) + resp = self.handle_request(req) + except NotS3Request: + return self.app(env, start_response) + except InvalidSubresource as e: + self.logger.debug(e.cause) + except ErrorResponse as err_resp: + self.logger.increment(err_resp.metric_name) + append_log_info(env, 's3:err:%s' % err_resp.summary) + if isinstance(err_resp, InternalError): + self.logger.exception(err_resp) + resp = err_resp + except Exception as e: + self.logger.exception(e) + resp = InternalError(reason=str(e)) + + if isinstance(resp, S3ResponseBase) and 'swift.trans_id' in env: + resp.headers['x-amz-id-2'] = env['swift.trans_id'] + resp.headers['x-amz-request-id'] = env['swift.trans_id'] + + if 's3api.backend_path' in env and 'swift.backend_path' not in env: + env['swift.backend_path'] = env['s3api.backend_path'] + + # emit metric with header labels now path and status may be available + self._emit_response_header_stats(env, resp, req_header_labels) + + return resp(env, start_response) + + def handle_request(self, req): + self.logger.debug('Calling S3Api Middleware') + try: + controller = req.controller(self.app, self.conf, self.logger) + except S3NotImplemented: + # TODO: Probably we should distinct the error to log this warning + self.logger.warning('multipart: No SLO middleware in pipeline') + raise + + acl_handler = get_acl_handler(req.controller_name)(req, self.logger) + req.set_acl_handler(acl_handler) + + if hasattr(controller, req.method): + handler = getattr(controller, req.method) + if not getattr(handler, 'publicly_accessible', False): + raise MethodNotAllowed(req.method, + req.controller.resource_type()) + res = handler(req) + else: + raise MethodNotAllowed(req.method, + req.controller.resource_type()) + + if req.policy_index is not None: + res.headers.setdefault('X-Backend-Storage-Policy-Index', + req.policy_index) + return res + + def check_pipeline(self, wsgi_conf): + """ + Check that proxy-server.conf has an appropriate pipeline for s3api. + """ + if wsgi_conf.get('__file__', None) is None: + return + + ctx = loadcontext(loadwsgi.APP, wsgi_conf['__file__']) + pipeline = str(PipelineWrapper(ctx)).split(' ') + + # Add compatible with 3rd party middleware. 
+ self.check_filter_order(pipeline, ['s3api', 'proxy-server']) + + auth_pipeline = pipeline[pipeline.index('s3api') + 1: + pipeline.index('proxy-server')] + + # Check SLO middleware + if self.conf.allow_multipart_uploads and 'slo' not in auth_pipeline: + self.conf.allow_multipart_uploads = False + self.logger.warning('s3api middleware requires SLO middleware ' + 'to support multi-part upload, please add it ' + 'in pipeline') + + if not self.conf.auth_pipeline_check: + self.logger.debug('Skip pipeline auth check.') + return + + if 'tempauth' in auth_pipeline: + self.logger.debug('Use tempauth middleware.') + elif 'keystoneauth' in auth_pipeline: + self.check_filter_order( + auth_pipeline, + ['s3token', 'keystoneauth']) + self.logger.debug('Use keystone middleware.') + elif len(auth_pipeline): + self.logger.debug('Use third party(unknown) auth middleware.') + else: + raise ValueError('Invalid pipeline %r: expected auth between ' + 's3api and proxy-server ' % pipeline) + + def check_filter_order(self, pipeline, required_filters): + """ + Check that required filters are present in order in the pipeline. + """ + indexes = [] + missing_filters = [] + for required_filter in required_filters: + try: + indexes.append(pipeline.index(required_filter)) + except ValueError as e: + self.logger.debug(e) + missing_filters.append(required_filter) + + if missing_filters: + raise ValueError('Invalid pipeline %r: missing filters %r' % ( + pipeline, missing_filters)) + + if indexes != sorted(indexes): + raise ValueError('Invalid pipeline %r: expected filter %s' % ( + pipeline, ' before '.join(required_filters))) + + +def filter_factory(global_conf, **local_conf): + """Standard filter factory to use the middleware with paste.deploy""" + conf = global_conf.copy() + conf.update(local_conf) + + register_swift_info( + 's3api', + # TODO: make default values as variables + max_bucket_listing=int(conf.get('max_bucket_listing', 1000)), + max_parts_listing=int(conf.get('max_parts_listing', 1000)), + max_upload_part_num=int(conf.get('max_upload_part_num', 1000)), + max_multi_delete_objects=int( + conf.get('max_multi_delete_objects', 1000)), + allow_multipart_uploads=config_true_value( + conf.get('allow_multipart_uploads', True)), + min_segment_size=int(conf.get('min_segment_size', 5242880)), + s3_acl=config_true_value(conf.get('s3_acl', False)), + ) + + register_sensitive_header('authorization') + register_sensitive_param('Signature') + register_sensitive_param('X-Amz-Signature') + + def s3api_filter(app): + return S3ApiMiddleware(ListingEtagMiddleware(app), conf) + + return s3api_filter diff --git a/swift/common/middleware/s3api/s3request.py b/swift/common/middleware/s3api/s3request.py new file mode 100644 index 0000000000..9f3641f067 --- /dev/null +++ b/swift/common/middleware/s3api/s3request.py @@ -0,0 +1,2465 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
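The check_filter_order helper above only verifies that the required filter names appear in the pipeline in the given relative order. A standalone sketch of the same idea, exercised against a hypothetical sample pipeline matching the module docstring's tempauth example:

    def check_filter_order(pipeline, required_filters):
        """Raise ValueError unless required_filters appear, in order."""
        try:
            indexes = [pipeline.index(name) for name in required_filters]
        except ValueError:
            raise ValueError('missing filters in %r' % (pipeline,))
        if indexes != sorted(indexes):
            raise ValueError('%r not in expected order %r'
                             % (pipeline, required_filters))

    pipeline = ['proxy-logging', 'cache', 's3api', 'tempauth',
                'bulk', 'slo', 'proxy-logging', 'proxy-server']
    check_filter_order(pipeline, ['s3api', 'proxy-server'])  # passes
    check_filter_order(pipeline, ['s3api', 'tempauth'])      # passes
    # check_filter_order(pipeline, ['tempauth', 's3api'])    # raises ValueError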
+ +import base64 +import binascii +from collections import defaultdict, OrderedDict +import contextlib +from email.header import Header +from hashlib import sha1, sha256 +import hmac +import re +# pylint: disable-msg=import-error +from urllib.parse import quote, unquote, parse_qsl +import string + +from swift.common.utils import split_path, json, md5, streq_const_time, \ + close_if_possible, InputProxy, get_policy_index, list_from_csv, \ + strict_b64decode, base64_str, checksum +from swift.common.registry import get_swift_info +from swift.common import swob +from swift.common.http import HTTP_OK, HTTP_CREATED, HTTP_ACCEPTED, \ + HTTP_NO_CONTENT, HTTP_UNAUTHORIZED, HTTP_FORBIDDEN, HTTP_NOT_FOUND, \ + HTTP_CONFLICT, HTTP_UNPROCESSABLE_ENTITY, HTTP_REQUEST_ENTITY_TOO_LARGE, \ + HTTP_PARTIAL_CONTENT, HTTP_NOT_MODIFIED, HTTP_PRECONDITION_FAILED, \ + HTTP_REQUESTED_RANGE_NOT_SATISFIABLE, HTTP_LENGTH_REQUIRED, \ + HTTP_BAD_REQUEST, HTTP_REQUEST_TIMEOUT, HTTP_SERVICE_UNAVAILABLE, \ + HTTP_TOO_MANY_REQUESTS, HTTP_RATE_LIMITED, is_success, \ + HTTP_CLIENT_CLOSED_REQUEST + +from swift.proxy.controllers.base import get_container_info +from swift.common.request_helpers import check_path_header + +from swift.common.middleware.s3api.controllers import ServiceController, \ + ObjectController, AclController, MultiObjectDeleteController, \ + LocationController, LoggingStatusController, PartController, \ + UploadController, UploadsController, VersioningController, \ + UnsupportedController, S3AclController, BucketController, \ + TaggingController, ObjectLockController +from swift.common.middleware.s3api.s3response import AccessDenied, \ + InvalidArgument, InvalidDigest, BucketAlreadyOwnedByYou, \ + RequestTimeTooSkewed, S3Response, SignatureDoesNotMatch, \ + BucketAlreadyExists, BucketNotEmpty, EntityTooLarge, \ + InternalError, NoSuchBucket, NoSuchKey, PreconditionFailed, InvalidRange, \ + MissingContentLength, InvalidStorageClass, S3NotImplemented, InvalidURI, \ + MalformedXML, InvalidRequest, RequestTimeout, InvalidBucketName, \ + BadDigest, AuthorizationHeaderMalformed, SlowDown, \ + AuthorizationQueryParametersError, ServiceUnavailable, BrokenMPU, \ + XAmzContentSHA256Mismatch, IncompleteBody, InvalidChunkSizeError, \ + InvalidPartNumber, InvalidPartArgument, MalformedTrailerError +from swift.common.middleware.s3api.exception import NotS3Request, \ + S3InputError, S3InputSizeError, S3InputIncomplete, \ + S3InputChunkSignatureMismatch, S3InputChunkTooSmall, \ + S3InputMalformedTrailer, S3InputMissingSecret, \ + S3InputSHA256Mismatch, S3InputChecksumMismatch, \ + S3InputChecksumTrailerInvalid +from swift.common.middleware.s3api.utils import utf8encode, \ + S3Timestamp, mktime, MULTIUPLOAD_SUFFIX +from swift.common.middleware.s3api.subresource import decode_acl, encode_acl +from swift.common.middleware.s3api.utils import sysmeta_header, \ + parse_host, parse_path, Config +from swift.common.middleware.s3api.exception import \ + InvalidBucketNameParseError, InvalidURIParseError +from swift.common.middleware.s3api.acl_utils import handle_acl_header + + +# List of sub-resources that must be maintained as part of the HMAC +# signature string. 
+ALLOWED_SUB_RESOURCES = sorted([ + 'acl', 'delete', 'lifecycle', 'location', 'logging', 'notification', + 'partNumber', 'policy', 'requestPayment', 'torrent', 'uploads', 'uploadId', + 'versionId', 'versioning', 'versions', 'website', + 'response-cache-control', 'response-content-disposition', + 'response-content-encoding', 'response-content-language', + 'response-content-type', 'response-expires', 'cors', 'tagging', 'restore', + 'object-lock' +]) + + +MAX_32BIT_INT = 2147483647 +SIGV2_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S' +SIGV4_X_AMZ_DATE_FORMAT = '%Y%m%dT%H%M%SZ' +SIGV4_CHUNK_MIN_SIZE = 8192 +SERVICE = 's3' # useful for mocking out in tests + + +CHECKSUMS_BY_HEADER = { + 'x-amz-checksum-crc32': checksum.crc32, + 'x-amz-checksum-crc32c': checksum.crc32c, + 'x-amz-checksum-crc64nvme': checksum.crc64nvme, + 'x-amz-checksum-sha1': sha1, + 'x-amz-checksum-sha256': sha256, +} + + +def _get_checksum_hasher(header): + try: + return CHECKSUMS_BY_HEADER[header]() + except (KeyError, NotImplementedError): + raise S3NotImplemented('The %s algorithm is not supported.' % header) + + +def _validate_checksum_value(checksum_hasher, b64digest): + return strict_b64decode( + b64digest, + exact_size=checksum_hasher.digest_size, + ) + + +def _validate_checksum_header_cardinality(num_checksum_headers, + headers_and_trailer=False): + if num_checksum_headers > 1: + # inconsistent messaging for AWS compatibility... + msg = 'Expecting a single x-amz-checksum- header' + if not headers_and_trailer: + msg += '. Multiple checksum Types are not allowed.' + raise InvalidRequest(msg) + + +def _is_streaming(aws_sha256): + return aws_sha256 in ( + 'STREAMING-UNSIGNED-PAYLOAD-TRAILER', + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD', + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER', + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD', + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD-TRAILER', + ) + + +def _header_strip(value): + # S3 seems to strip *all* control characters + if value is None: + return None + stripped = _header_strip.re.sub('', value) + if value and not stripped: + # If there's nothing left after stripping, + # behave as though it wasn't provided + return None + return stripped + + +_header_strip.re = re.compile('^[\x00-\x20]*|[\x00-\x20]*$') + + +def _header_acl_property(resource): + """ + Set and retrieve the acl in self.headers + """ + + def getter(self): + return getattr(self, '_%s' % resource) + + def setter(self, value): + self.headers.update(encode_acl(resource, value)) + setattr(self, '_%s' % resource, value) + + def deleter(self): + self.headers[sysmeta_header(resource, 'acl')] = '' + + return property(getter, setter, deleter, + doc='Get and set the %s acl property' % resource) + + +def _parse_path(req, bucket_in_host, dns_compliant_bucket_names): + try: + return parse_path(req, bucket_in_host, dns_compliant_bucket_names) + except InvalidURIParseError as err: + raise InvalidURI(err.uri) + except InvalidBucketNameParseError as err: + raise InvalidBucketName(err.bucket_name) + + +class HashingInput(InputProxy): + """ + wsgi.input wrapper to verify the SHA256 of the input as it's read. 
+ """ + + def __init__(self, wsgi_input, content_length, expected_hex_hash): + super().__init__(wsgi_input) + self._expected_length = content_length + self._hasher = sha256() + self._expected_hash = expected_hex_hash + if content_length == 0 and \ + self._hasher.hexdigest() != self._expected_hash.lower(): + self.close() + raise XAmzContentSHA256Mismatch( + client_computed_content_s_h_a256=self._expected_hash, + s3_computed_content_s_h_a256=self._hasher.hexdigest(), + ) + + def chunk_update(self, chunk, eof, *args, **kwargs): + # Note that "chunk" is just whatever was read from the input; this + # says nothing about whether the underlying stream uses aws-chunked + self._hasher.update(chunk) + + if self.bytes_received < self._expected_length: + error = eof + elif self.bytes_received == self._expected_length: + error = self._hasher.hexdigest() != self._expected_hash.lower() + else: + error = True + + if error: + self.close() + # Since we don't return the last chunk, the PUT never completes + raise S3InputSHA256Mismatch( + self._expected_hash, + self._hasher.hexdigest()) + + return chunk + + +class ChecksummingInput(InputProxy): + """ + wsgi.input wrapper to calculate the X-Amz-Checksum-* of the input as it's + read. The calculated value is checked against an expected value that is + sent in either the request headers or trailers. To allow for the latter, + the expected value is lazy fetched once the input has been read. + + :param wsgi_input: file-like object to be wrapped. + :param content_length: the expected number of bytes to be read. + :param checksum_hasher: a hasher to calculate the checksum of read bytes. + :param checksum_key: the name of the header or trailer that will have + the expected checksum value to be checked. + :param checksum_source: a dict that will have the ``checksum_key``. + """ + + def __init__(self, wsgi_input, content_length, checksum_hasher, + checksum_key, checksum_source): + super().__init__(wsgi_input) + self._expected_length = content_length + self._checksum_hasher = checksum_hasher + self._checksum_key = checksum_key + self._checksum_source = checksum_source + + def chunk_update(self, chunk, eof, *args, **kwargs): + # Note that "chunk" is just whatever was read from the input; this + # says nothing about whether the underlying stream uses aws-chunked + self._checksum_hasher.update(chunk) + if self.bytes_received < self._expected_length: + # wrapped input is likely to have timed out before this clause is + # reached with eof==True, but just in case... + error = eof + elif self.bytes_received == self._expected_length: + # Lazy fetch checksum value because it may have come in trailers + b64digest = self._checksum_source.get(self._checksum_key) + try: + expected_raw_checksum = _validate_checksum_value( + self._checksum_hasher, b64digest) + except ValueError: + # If the checksum value came in a header then it would have + # been validated before the body was read, so if the validation + # fails here then we can infer that the checksum value came in + # a trailer. The S3InputChecksumTrailerInvalid raised here will + # propagate all the way back up the middleware stack to s3api + # where it is caught and translated to an InvalidRequest. + raise S3InputChecksumTrailerInvalid(self._checksum_key) + error = self._checksum_hasher.digest() != expected_raw_checksum + else: + # the underlying wsgi.Input stops reading at content-length so we + # don't expect to reach this clause, but just in case... 
+ error = True + + if error: + self.close() + # Since we don't return the last chunk, the PUT never completes + raise S3InputChecksumMismatch(self._checksum_hasher.name.upper()) + return chunk + + +class ChunkReader(InputProxy): + """ + wsgi.input wrapper to read a single chunk from an aws-chunked input and + validate its signature. + + :param wsgi_input: a wsgi input. + :param chunk_size: number of bytes to read. + :param validator: function to call to validate the chunk's content. + :param chunk_params: string of params from the chunk's header. + """ + def __init__(self, wsgi_input, chunk_size, validator, chunk_params): + super().__init__(wsgi_input) + self.chunk_size = chunk_size + self._validator = validator + if self._validator is None: + self._signature = None + else: + self._signature = self._parse_chunk_signature(chunk_params) + self._sha256 = sha256() + + def _parse_chunk_signature(self, chunk_params): + if not chunk_params: + raise S3InputIncomplete + start, _, chunk_sig = chunk_params.partition('=') + if start.strip() != 'chunk-signature': + # Call the validator to update the string to sign + self._validator('', '') + raise S3InputChunkSignatureMismatch + if ';' in chunk_sig: + raise S3InputIncomplete + chunk_sig = chunk_sig.strip() + if not chunk_sig: + raise S3InputIncomplete + return chunk_sig + + @property + def to_read(self): + return self.chunk_size - self.bytes_received + + def read(self, size=None, *args, **kwargs): + if size is None or size < 0 or size > self.to_read: + size = self.to_read + return super().read(size) + + def readline(self, size=None, *args, **kwargs): + if size is None or size < 0 or size > self.to_read: + size = self.to_read + return super().readline(size) + + def chunk_update(self, chunk, eof, *args, **kwargs): + # Note that "chunk" is just whatever was read from the input + self._sha256.update(chunk) + if self.bytes_received == self.chunk_size: + if self._validator and not self._validator( + self._sha256.hexdigest(), self._signature): + self.close() + raise S3InputChunkSignatureMismatch + return chunk + + +class StreamingInput: + """ + wsgi.input wrapper to read a chunked input, verifying each chunk as it's + read. Once all chunks have been read, any trailers are read. + + :param input: a wsgi input. + :param decoded_content_length: the number of payload bytes expected to be + extracted from chunks. + :param expected_trailers: the set of trailer names expected. + :param sig_checker: an instance of SigCheckerV4 that will be called to + verify each chunk's signature. + """ + def __init__(self, input, decoded_content_length, + expected_trailers, sig_checker): + self._input = input + self._decoded_content_length = decoded_content_length + self._expected_trailers = expected_trailers + self._sig_checker = sig_checker + # Length of the payload remaining; i.e., number of bytes a caller + # still expects to be able to read. Once exhausted, we should be + # exactly at the trailers (if present) + self._to_read = decoded_content_length + # Reader for the current chunk that's in progress + self._chunk_reader = None + # Track the chunk number, for error messages + self._chunk_number = 0 + # Track the size of the most recently read chunk. 
AWS enforces an 8k + # min chunk size (except the final chunk) + self._last_chunk_size = None + # When True, we've read the payload, but not necessarily the trailers + self._completed_payload = False + # When True, we've read the trailers + self._completed_trailers = False + # Any trailers present after the payload (not available until after + # caller has read full payload; i.e., until after _to_read is 0) + self.trailers = {} + + def _read_chunk_header(self): + """ + Read a chunk header, reading at most one line from the raw input. + + Parse out the next chunk size and any other params. + + :returns: a tuple of (chunk_size, chunk_params). chunk_size is an int, + chunk_params is string. + """ + self._chunk_number += 1 + chunk_header = swob.bytes_to_wsgi(self._input.readline()) + if chunk_header[-2:] != '\r\n': + raise S3InputIncomplete('invalid chunk header: %s' % chunk_header) + chunk_size, _, chunk_params = chunk_header[:-2].partition(';') + + try: + chunk_size = int(chunk_size, 16) + if chunk_size < 0: + raise ValueError + except ValueError: + raise S3InputIncomplete('invalid chunk header: %s' % chunk_header) + + if self._last_chunk_size is not None and \ + self._last_chunk_size < SIGV4_CHUNK_MIN_SIZE and \ + chunk_size != 0: + raise S3InputChunkTooSmall(self._last_chunk_size, + self._chunk_number) + self._last_chunk_size = chunk_size + + if chunk_size > self._to_read: + raise S3InputSizeError( + self._decoded_content_length, + self._decoded_content_length - self._to_read + chunk_size) + return chunk_size, chunk_params + + def _read_payload(self, size, readline=False): + bufs = [] + bytes_read = 0 + while not self._completed_payload and ( + bytes_read < size + # Make sure we read the trailing zero-byte chunk at the end + or self._to_read == 0): + if self._chunk_reader is None: + # OK, we're at the start of a new chunk + chunk_size, chunk_params = self._read_chunk_header() + self._chunk_reader = ChunkReader( + self._input, + chunk_size, + self._sig_checker and + self._sig_checker.check_chunk_signature, + chunk_params) + if readline: + buf = self._chunk_reader.readline(size - bytes_read) + else: + buf = self._chunk_reader.read(size - bytes_read) + bufs.append(buf) + if self._chunk_reader.to_read == 0: + # If it's the final chunk, we're in (possibly empty) trailers + # Otherwise, there's a CRLF chunk-separator + if self._chunk_reader.chunk_size == 0: + self._completed_payload = True + elif self._input.read(2) != b'\r\n': + raise S3InputIncomplete + self._chunk_reader = None + bytes_read += len(buf) + self._to_read -= len(buf) + if readline and buf[-1:] == b'\n': + break + return b''.join(bufs) + + def _read_trailers(self): + if self._expected_trailers: + for line in iter(self._input.readline, b''): + if not line.endswith(b'\r\n'): + raise S3InputIncomplete + if line == b'\r\n': + break + key, _, value = swob.bytes_to_wsgi(line).partition(':') + if key.lower() not in self._expected_trailers: + raise S3InputMalformedTrailer + self.trailers[key.strip()] = value.strip() + if 'x-amz-trailer-signature' in self._expected_trailers \ + and 'x-amz-trailer-signature' not in self.trailers: + raise S3InputIncomplete + if set(self.trailers.keys()) != self._expected_trailers: + raise S3InputMalformedTrailer + if 'x-amz-trailer-signature' in self._expected_trailers \ + and self._sig_checker is not None: + if not self._sig_checker.check_trailer_signature( + self.trailers): + raise S3InputChunkSignatureMismatch + if len(self.trailers) == 1: + raise S3InputIncomplete + # Now that we've read them, we expect 
no more + self._expected_trailers = set() + elif self._input.read(2) not in (b'', b'\r\n'): + raise S3InputIncomplete + + self._completed_trailers = True + + def _read(self, size, readline=False): + data = self._read_payload(size, readline) + if self._completed_payload: + if not self._completed_trailers: + # read trailers, if present + self._read_trailers() + # At this point, we should have read everything; if we haven't, + # that's an error + if self._to_read: + raise S3InputSizeError( + self._decoded_content_length, + self._decoded_content_length - self._to_read) + return data + + def read(self, size=None): + if size is None or size < 0 or size > self._to_read: + size = self._to_read + try: + return self._read(size) + except S3InputError: + self.close() + raise + + def readline(self, size=None): + if size is None or size < 0 or size > self._to_read: + size = self._to_read + try: + return self._read(size, True) + except S3InputError: + self.close() + raise + + def close(self): + close_if_possible(self._input) + + +class BaseSigChecker: + def __init__(self, req): + self.req = req + self.signature = req.signature + self.string_to_sign = self._string_to_sign() + self._secret = None + + def _string_to_sign(self): + raise NotImplementedError + + def _derive_secret(self, secret): + return utf8encode(secret) + + def _check_signature(self): + raise NotImplementedError + + def check_signature(self, secret): + self._secret = self._derive_secret(secret) + return self._check_signature() + + +class SigCheckerV2(BaseSigChecker): + def _string_to_sign(self): + """ + Create 'StringToSign' value in Amazon terminology for v2. + """ + buf = [swob.wsgi_to_bytes(wsgi_str) for wsgi_str in [ + self.req.method, + _header_strip(self.req.headers.get('Content-MD5')) or '', + _header_strip(self.req.headers.get('Content-Type')) or '']] + + if 'headers_raw' in self.req.environ: # eventlet >= 0.19.0 + # See https://github.com/eventlet/eventlet/commit/67ec999 + amz_headers = defaultdict(list) + for key, value in self.req.environ['headers_raw']: + key = key.lower() + if not key.startswith('x-amz-'): + continue + amz_headers[key.strip()].append(value.strip()) + amz_headers = dict((key, ','.join(value)) + for key, value in amz_headers.items()) + else: # mostly-functional fallback + amz_headers = dict((key.lower(), value) + for key, value in self.req.headers.items() + if key.lower().startswith('x-amz-')) + + if self.req._is_header_auth: + if 'x-amz-date' in amz_headers: + buf.append(b'') + elif 'Date' in self.req.headers: + buf.append(swob.wsgi_to_bytes(self.req.headers['Date'])) + elif self.req._is_query_auth: + buf.append(swob.wsgi_to_bytes(self.req.params['Expires'])) + else: + # Should have already raised NotS3Request in _parse_auth_info, + # but as a sanity check... + raise AccessDenied(reason='not_s3') + + for key, value in sorted(amz_headers.items()): + buf.append(swob.wsgi_to_bytes("%s:%s" % (key, value))) + + path = self.req._canonical_uri() + if self.req.query_string: + path += '?' + self.req.query_string + params = [] + if '?' 
in path: + path, args = path.split('?', 1) + for key, value in sorted(self.req.params.items()): + if key in ALLOWED_SUB_RESOURCES: + params.append('%s=%s' % (key, value) if value else key) + if params: + buf.append(swob.wsgi_to_bytes('%s?%s' % (path, '&'.join(params)))) + else: + buf.append(swob.wsgi_to_bytes(path)) + return b'\n'.join(buf) + + def _check_signature(self): + valid_signature = base64_str( + hmac.new(self._secret, self.string_to_sign, sha1).digest()) + return streq_const_time(self.signature, valid_signature) + + +class SigCheckerV4(BaseSigChecker): + def __init__(self, req): + super().__init__(req) + self._all_chunk_signatures_valid = True + + def _string_to_sign(self): + return b'\n'.join([ + b'AWS4-HMAC-SHA256', + self.req.timestamp.amz_date_format.encode('ascii'), + '/'.join(self.req.scope.values()).encode('utf8'), + sha256(self.req._canonical_request()).hexdigest().encode('ascii')]) + + def _derive_secret(self, secret): + derived_secret = b'AWS4' + super()._derive_secret(secret) + for scope_piece in self.req.scope.values(): + derived_secret = hmac.new( + derived_secret, scope_piece.encode('utf8'), sha256).digest() + return derived_secret + + def _check_signature(self): + if self._secret is None: + raise S3InputMissingSecret + valid_signature = hmac.new( + self._secret, self.string_to_sign, sha256).hexdigest() + return streq_const_time(self.signature, valid_signature) + + def _chunk_string_to_sign(self, data_sha256): + """ + Create 'ChunkStringToSign' value in Amazon terminology for v4. + """ + return b'\n'.join([ + b'AWS4-HMAC-SHA256-PAYLOAD', + self.req.timestamp.amz_date_format.encode('ascii'), + '/'.join(self.req.scope.values()).encode('utf8'), + self.signature.encode('utf8'), + sha256(b'').hexdigest().encode('utf8'), + data_sha256.encode('utf8') + ]) + + def check_chunk_signature(self, chunk_sha256, signature): + """ + Check the validity of a chunk's signature. + + This method verifies the signature of a given chunk using its SHA-256 + hash. It updates the string to sign and the current signature, then + checks if the signature is valid. If any chunk signature is invalid, + it returns False. + + :param chunk_sha256: (str) The SHA-256 hash of the chunk. + :param signature: (str) The signature to be verified. + :returns: True if all chunk signatures are valid, False otherwise. + """ + if not self._all_chunk_signatures_valid: + return False + # NB: string_to_sign is calculated using the previous signature + self.string_to_sign = self._chunk_string_to_sign(chunk_sha256) + # So we have to update the signature to compare against *after* + # the string-to-sign + self.signature = signature + self._all_chunk_signatures_valid &= self._check_signature() + return self._all_chunk_signatures_valid + + def _trailer_string_to_sign(self, trailers): + """ + Create 'TrailerChunkStringToSign' value in Amazon terminology for v4. + """ + canonical_trailers = swob.wsgi_to_bytes(''.join( + f'{key}:{value}\n' + for key, value in sorted( + trailers.items(), + key=lambda kvp: swob.wsgi_to_bytes(kvp[0]).lower(), + ) + if key != 'x-amz-trailer-signature' + )) + if not canonical_trailers: + canonical_trailers = b'\n' + return b'\n'.join([ + b'AWS4-HMAC-SHA256-TRAILER', + self.req.timestamp.amz_date_format.encode('ascii'), + '/'.join(self.req.scope.values()).encode('utf8'), + self.signature.encode('utf8'), + sha256(canonical_trailers).hexdigest().encode('utf8'), + ]) + + def check_trailer_signature(self, trailers): + """ + Check the validity of a chunk's signature. 
+ + This method verifies the trailers received after the main payload. + + :param trailers: (dict[str, str]) The trailers received. + :returns: True if x-amz-trailer-signature is valid, False otherwise. + """ + if not self._all_chunk_signatures_valid: + # if there was a breakdown earlier, this can't be right + return False + # NB: string_to_sign is calculated using the previous signature + self.string_to_sign = self._trailer_string_to_sign(trailers) + # So we have to update the signature to compare against *after* + # the string-to-sign + self.signature = trailers['x-amz-trailer-signature'] + self._all_chunk_signatures_valid &= self._check_signature() + return self._all_chunk_signatures_valid + + +def _parse_credential(credential_string): + """ + Parse an AWS credential string into its components. + + This method splits the given credential string into its constituent parts: + access key ID, date, AWS region, AWS service, and terminal identifier. + The credential string must follow the format: + ////aws4_request. + + :param credential_string: (str) The AWS credential string to be parsed. + :raises AccessDenied: If the credential string is invalid or does not + follow the required format. + :returns: A dict containing the parsed components of the credential string. + """ + parts = credential_string.split("/") + # credential must be in following format: + # ////aws4_request + if not parts[0] or len(parts) != 5: + raise AccessDenied(reason='invalid_credential') + return dict(zip(['access', 'date', 'region', 'service', 'terminal'], + parts)) + + +class SigV4Mixin(object): + """ + A request class mixin to provide S3 signature v4 functionality + """ + + @property + def _is_query_auth(self): + return 'X-Amz-Credential' in self.params + + @property + def _is_x_amz_content_sha256_required(self): + return not self._is_query_auth + + @property + def timestamp(self): + """ + Return timestamp string according to the auth type + The difference from v2 is v4 have to see 'X-Amz-Date' even though + it's query auth type. 
+ """ + if not self._timestamp: + try: + if self._is_query_auth and 'X-Amz-Date' in self.params: + # NOTE(andrey-mp): Date in Signature V4 has different + # format + timestamp = mktime( + self.params['X-Amz-Date'], SIGV4_X_AMZ_DATE_FORMAT) + else: + if self.headers.get('X-Amz-Date'): + timestamp = mktime( + self.headers.get('X-Amz-Date'), + SIGV4_X_AMZ_DATE_FORMAT) + else: + timestamp = mktime(self.headers.get('Date')) + except (ValueError, TypeError): + raise AccessDenied('AWS authentication requires a valid Date ' + 'or x-amz-date header', + reason='invalid_date') + + if timestamp < 0: + raise AccessDenied('AWS authentication requires a valid Date ' + 'or x-amz-date header', + reason='invalid_date') + + try: + self._timestamp = S3Timestamp(timestamp) + except ValueError: + # Must be far-future; blame clock skew + raise RequestTimeTooSkewed() + + return self._timestamp + + def _validate_expire_param(self): + """ + Validate X-Amz-Expires in query parameter + :raises: AccessDenied + :raises: AuthorizationQueryParametersError + :raises: AccessDenined + """ + err = None + try: + expires = int(self.params['X-Amz-Expires']) + except KeyError: + raise AccessDenied(reason='invalid_expires') + except ValueError: + err = 'X-Amz-Expires should be a number' + else: + if expires < 0: + err = 'X-Amz-Expires must be non-negative' + elif expires >= 2 ** 63: + err = 'X-Amz-Expires should be a number' + elif expires > 604800: + err = ('X-Amz-Expires must be less than a week (in seconds); ' + 'that is, the given X-Amz-Expires must be less than ' + '604800 seconds') + if err: + raise AuthorizationQueryParametersError(err) + + if int(self.timestamp) + expires < S3Timestamp.now(): + raise AccessDenied('Request has expired', reason='expired') + + def _parse_query_authentication(self): + """ + Parse v4 query authentication + - version 4: + 'X-Amz-Credential' and 'X-Amz-Signature' should be in param + :raises: AccessDenied + :raises: AuthorizationHeaderMalformed + """ + if self.params.get('X-Amz-Algorithm') != 'AWS4-HMAC-SHA256': + raise InvalidArgument('X-Amz-Algorithm', + self.params.get('X-Amz-Algorithm')) + try: + cred_param = _parse_credential( + swob.wsgi_to_str(self.params['X-Amz-Credential'])) + sig = swob.wsgi_to_str(self.params['X-Amz-Signature']) + if not sig: + raise AccessDenied(reason='invalid_query_auth') + except KeyError: + raise AccessDenied(reason='invalid_query_auth') + + try: + signed_headers = swob.wsgi_to_str( + self.params['X-Amz-SignedHeaders']) + except KeyError: + # TODO: make sure if is it malformed request? + raise AuthorizationHeaderMalformed() + + self._signed_headers = set(signed_headers.split(';')) + + invalid_messages = { + 'date': 'Invalid credential date "%s". This date is not the same ' + 'as X-Amz-Date: "%s".', + 'region': "Error parsing the X-Amz-Credential parameter; " + "the region '%s' is wrong; expecting '%s'", + 'service': 'Error parsing the X-Amz-Credential parameter; ' + 'incorrect service "%s". This endpoint belongs to "%s".', + 'terminal': 'Error parsing the X-Amz-Credential parameter; ' + 'incorrect terminal "%s". 
This endpoint uses "%s".', + } + for key in ('date', 'region', 'service', 'terminal'): + if cred_param[key] != self.scope[key]: + kwargs = {} + if key == 'region': + # Allow lowercase region name + # for AWS .NET SDK compatibility + if not self.scope[key].islower() and \ + cred_param[key] == self.scope[key].lower(): + self.location = self.location.lower() + continue + kwargs = {'region': self.scope['region']} + raise AuthorizationQueryParametersError( + invalid_messages[key] % (cred_param[key], self.scope[key]), + **kwargs) + + return cred_param['access'], sig + + def _parse_header_authentication(self): + """ + Parse v4 header authentication + - version 4: + 'X-Amz-Credential' and 'X-Amz-Signature' should be in param + :raises: AccessDenied + :raises: AuthorizationHeaderMalformed + """ + + auth_str = swob.wsgi_to_str(self.headers['Authorization']) + cred_param = _parse_credential(auth_str.partition( + "Credential=")[2].split(',')[0]) + sig = auth_str.partition("Signature=")[2].split(',')[0] + if not sig: + raise AccessDenied(reason='invalid_header_auth') + signed_headers = auth_str.partition( + "SignedHeaders=")[2].split(',', 1)[0] + if not signed_headers: + # TODO: make sure if is it Malformed? + raise AuthorizationHeaderMalformed() + + invalid_messages = { + 'date': 'Invalid credential date "%s". This date is not the same ' + 'as X-Amz-Date: "%s".', + 'region': "The authorization header is malformed; the region '%s' " + "is wrong; expecting '%s'", + 'service': 'The authorization header is malformed; incorrect ' + 'service "%s". This endpoint belongs to "%s".', + 'terminal': 'The authorization header is malformed; incorrect ' + 'terminal "%s". This endpoint uses "%s".', + } + for key in ('date', 'region', 'service', 'terminal'): + if cred_param[key] != self.scope[key]: + kwargs = {} + if key == 'region': + # Allow lowercase region name + # for AWS .NET SDK compatibility + if not self.scope[key].islower() and \ + cred_param[key] == self.scope[key].lower(): + self.location = self.location.lower() + continue + kwargs = {'region': self.scope['region']} + raise AuthorizationHeaderMalformed( + invalid_messages[key] % (cred_param[key], self.scope[key]), + **kwargs) + + self._signed_headers = set(signed_headers.split(';')) + + return cred_param['access'], sig + + def _canonical_query_string(self): + return '&'.join( + '%s=%s' % (swob.wsgi_quote(key, safe='-_.~'), + swob.wsgi_quote(value, safe='-_.~')) + for key, value in sorted(self.params.items()) + if key not in ('Signature', 'X-Amz-Signature')).encode('ascii') + + def _headers_to_sign(self): + """ + Select the headers from the request that need to be included + in the StringToSign. 
+ + :return : dict of headers to sign, the keys are all lower case + """ + if 'headers_raw' in self.environ: # eventlet >= 0.19.0 + # See https://github.com/eventlet/eventlet/commit/67ec999 + headers_lower_dict = defaultdict(list) + for key, value in self.environ['headers_raw']: + headers_lower_dict[key.lower().strip()].append( + ' '.join(_header_strip(value or '').split())) + headers_lower_dict = {k: ','.join(v) + for k, v in headers_lower_dict.items()} + else: # mostly-functional fallback + headers_lower_dict = dict( + (k.lower().strip(), ' '.join(_header_strip(v or '').split())) + for (k, v) in self.headers.items()) + + if 'host' in headers_lower_dict and re.match( + 'Boto/2.[0-9].[0-2]', + headers_lower_dict.get('user-agent', '')): + # Boto versions < 2.9.3 strip the port component of the host:port + # header, so detect the user-agent via the header and strip the + # port if we detect an old boto version. + headers_lower_dict['host'] = \ + headers_lower_dict['host'].split(':')[0] + + headers_to_sign = [ + (key, value) for key, value in sorted(headers_lower_dict.items()) + if swob.wsgi_to_str(key) in self._signed_headers] + + if len(headers_to_sign) != len(self._signed_headers): + # NOTE: if we are missing the header suggested via + # signed_header in actual header, it results in + # SignatureDoesNotMatch in actual S3 so we can raise + # the error immediately here to save redundant check + # process. + raise SignatureDoesNotMatch() + + return headers_to_sign + + def _canonical_uri(self): + """ + It won't require bucket name in canonical_uri for v4. + """ + return swob.wsgi_to_bytes(swob.wsgi_quote( + self.environ.get('PATH_INFO', self.path), safe='-_.~/')) + + def _canonical_request(self): + # prepare 'canonical_request' + # Example requests are like following: + # + # GET + # / + # Action=ListUsers&Version=2010-05-08 + # content-type:application/x-www-form-urlencoded; charset=utf-8 + # host:iam.amazonaws.com + # x-amz-date:20150830T123600Z + # + # content-type;host;x-amz-date + # e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + # + + # 1. Add verb like: GET + cr = [swob.wsgi_to_bytes(self.method)] + + # 2. Add path like: / + path = self._canonical_uri() + cr.append(path) + + # 3. Add query like: Action=ListUsers&Version=2010-05-08 + cr.append(self._canonical_query_string()) + + # 4. Add headers like: + # content-type:application/x-www-form-urlencoded; charset=utf-8 + # host:iam.amazonaws.com + # x-amz-date:20150830T123600Z + headers_to_sign = self._headers_to_sign() + cr.append(b''.join(swob.wsgi_to_bytes('%s:%s\n' % (key, value)) + for key, value in headers_to_sign)) + + # 5. Add signed headers into canonical request like + # content-type;host;x-amz-date + cr.append(b';'.join(swob.wsgi_to_bytes(k) for k, v in headers_to_sign)) + + # 6. 
Add payload string at the tail + hashed_payload = self.headers.get('X-Amz-Content-SHA256', + 'UNSIGNED-PAYLOAD') + + cr.append(swob.wsgi_to_bytes(hashed_payload)) + return b'\n'.join(cr) + + @property + def scope(self): + return OrderedDict([ + ('date', self.timestamp.amz_date_format.split('T')[0]), + ('region', self.location), + ('service', SERVICE), + ('terminal', 'aws4_request'), + ]) + + def signature_does_not_match_kwargs(self): + kwargs = super(SigV4Mixin, self).signature_does_not_match_kwargs() + cr = self._canonical_request() + kwargs.update({ + 'canonical_request': cr, + 'canonical_request_bytes': ' '.join( + format(b, '02x') for b in cr), + }) + return kwargs + + +def get_request_class(env, s3_acl): + """ + Helper function to find a request class to use from Map + """ + if s3_acl: + request_classes = (S3AclRequest, SigV4S3AclRequest) + else: + request_classes = (S3Request, SigV4Request) + + req = swob.Request(env) + if 'X-Amz-Credential' in req.params or \ + req.headers.get('Authorization', '').startswith( + 'AWS4-HMAC-SHA256 '): + # This is an Amazon SigV4 request + return request_classes[1] + else: + # The others using Amazon SigV2 class + return request_classes[0] + + +class S3Request(swob.Request): + """ + S3 request object. + """ + + bucket_acl = _header_acl_property('container') + object_acl = _header_acl_property('object') + + def __init__(self, env, app=None, conf=None): + # NOTE: app is not used by this class, need for compatibility of S3acl + swob.Request.__init__(self, env) + self.conf = conf or Config() + self.location = self.conf.location + self._timestamp = None + self.access_key, self.signature = self._parse_auth_info() + self.bucket_in_host = parse_host(self.environ, + self.conf.storage_domains) + self.container_name, self.object_name = _parse_path( + self, self.bucket_in_host, self.conf.dns_compliant_bucket_names) + self._validate_headers() + if isinstance(self, SigV4Mixin): + # this is a deliberate but only partial shift away from the + # 'inherit and override from mixin' pattern towards a 'compose + # adapters' pattern. + self.sig_checker = SigCheckerV4(self) + else: + self.sig_checker = SigCheckerV2(self) + aws_sha256 = self.headers.get('x-amz-content-sha256') + if self.method in ('PUT', 'POST'): + checksum_hasher, checksum_header, checksum_trailer = \ + self._validate_checksum_headers() + if _is_streaming(aws_sha256): + if checksum_trailer: + streaming_input = self._install_streaming_input_wrapper( + aws_sha256, checksum_trailer=checksum_trailer) + checksum_key = checksum_trailer + checksum_source = streaming_input.trailers + else: + self._install_streaming_input_wrapper(aws_sha256) + checksum_key = checksum_header + checksum_source = self.headers + elif checksum_trailer: + raise MalformedTrailerError + else: + self._install_non_streaming_input_wrapper(aws_sha256) + checksum_key = checksum_header + checksum_source = self.headers + + if self.method == 'PUT': + verify_checksum = True + elif self.method == 'POST': + if 'delete' in self.params: + verify_checksum = True + else: + # S3 doesn't check the checksum for some POSTs (e.g. 
MPU + # complete) + verify_checksum = False + else: + verify_checksum = False + + if checksum_key and verify_checksum: + self._install_checksumming_input_wrapper( + checksum_hasher, checksum_key, checksum_source) + + # Lock in string-to-sign now, before we start messing with query params + self.environ['s3api.auth_details'] = { + 'access_key': self.access_key, + 'signature': self.signature, + 'string_to_sign': self.sig_checker.string_to_sign, + 'check_signature': self.sig_checker.check_signature, + } + # Set the logging field (if not set already) + # Because auth mw to our right will only see a copy of the SwiftRequest + # environ we use a mutable value to back-propagate updates to proxy-log + access_key_value = (self.access_key[:125] + '...' + if len(self.access_key) > 128 + else self.access_key) + self.environ.setdefault('swift.access_logging', {}).setdefault( + 'user_id', access_key_value) + self.account = None + self.user_id = None + self.policy_index = None + + # Avoids that swift.swob.Response replaces Location header value + # by full URL when absolute path given. See swift.swob for more detail. + self.environ['swift.leave_relative_location'] = True + + def validate_part_number(self, parts_count=None, check_max=True): + """ + Get the partNumber param, if it exists, and check it is valid. + + To be valid, a partNumber must satisfy two criteria. First, it must be + an integer between 1 and the maximum allowed parts, inclusive. The + maximum allowed parts is the maximum of the configured + ``max_upload_part_num`` and, if given, ``parts_count``. Second, the + partNumber must be less than or equal to the ``parts_count``, if it is + given. + + :param parts_count: if given, this is the number of parts in an + existing object. + :raises InvalidPartArgument: if the partNumber param is invalid i.e. + less than 1 or greater than the maximum allowed parts. + :raises InvalidPartNumber: if the partNumber param is valid but greater + than ``num_parts``. + :return: an integer part number if the partNumber param exists, + otherwise ``None``. + """ + part_number = self.params.get('partNumber') + if part_number is None: + return None + + if self.range: + raise InvalidRequest('Cannot specify both Range header and ' + 'partNumber query parameter') + + try: + parts_count = int(parts_count) + except (TypeError, ValueError): + # an invalid/empty param is treated like parts_count=max_parts + parts_count = self.conf.max_upload_part_num + # max_parts may be raised to the number of existing parts + max_parts = max(self.conf.max_upload_part_num, parts_count) + + try: + part_number = int(part_number) + if part_number < 1: + raise ValueError + except ValueError: + raise InvalidPartArgument(max_parts, part_number) # 400 + + if check_max: + if part_number > max_parts: + raise InvalidPartArgument(max_parts, part_number) # 400 + if part_number > parts_count: + raise InvalidPartNumber() # 416 + + return part_number + + @property + def timestamp(self): + """ + S3Timestamp from Date header. If X-Amz-Date header specified, it + will be prior to Date header. + + :return : S3Timestamp instance + """ + if not self._timestamp: + try: + if self._is_query_auth and 'Timestamp' in self.params: + # If Timestamp specified in query, it should be prior + # to any Date header (is this right?) 
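                    # In other words, the precedence here is: a query
                    # 'Timestamp' parameter wins, then the 'X-Amz-Date'
                    # header, then the 'Date' header (see the else branch
                    # below).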
+ timestamp = mktime( + self.params['Timestamp'], SIGV2_TIMESTAMP_FORMAT) + else: + timestamp = mktime( + self.headers.get('X-Amz-Date', + self.headers.get('Date'))) + except ValueError: + raise AccessDenied('AWS authentication requires a valid Date ' + 'or x-amz-date header', + reason='invalid_date') + + if timestamp < 0: + raise AccessDenied('AWS authentication requires a valid Date ' + 'or x-amz-date header', + reason='invalid_date') + try: + self._timestamp = S3Timestamp(timestamp) + except ValueError: + # Must be far-future; blame clock skew + raise RequestTimeTooSkewed() + + return self._timestamp + + @property + def _is_header_auth(self): + return 'Authorization' in self.headers + + @property + def _is_query_auth(self): + return 'AWSAccessKeyId' in self.params + + @property + def _is_x_amz_content_sha256_required(self): + return False + + def _parse_query_authentication(self): + """ + Parse v2 authentication query args + TODO: make sure if 0, 1, 3 is supported? + - version 0, 1, 2, 3: + 'AWSAccessKeyId' and 'Signature' should be in param + + :return: a tuple of access_key and signature + :raises: AccessDenied + """ + try: + access = swob.wsgi_to_str(self.params['AWSAccessKeyId']) + expires = swob.wsgi_to_str(self.params['Expires']) + sig = swob.wsgi_to_str(self.params['Signature']) + except KeyError: + raise AccessDenied(reason='invalid_query_auth') + + if not all([access, sig, expires]): + raise AccessDenied(reason='invalid_query_auth') + + return access, sig + + def _parse_header_authentication(self): + """ + Parse v2 header authentication info + + :returns: a tuple of access_key and signature + :raises: AccessDenied + """ + auth_str = swob.wsgi_to_str(self.headers['Authorization']) + if not auth_str.startswith('AWS ') or ':' not in auth_str: + raise AccessDenied(reason='invalid_header_auth') + # This means signature format V2 + access, sig = auth_str.split(' ', 1)[1].rsplit(':', 1) + return access, sig + + def _parse_auth_info(self): + """Extract the access key identifier and signature. 
+ + :returns: a tuple of access_key and signature + :raises: NotS3Request + """ + if self._is_query_auth: + self._validate_expire_param() + return self._parse_query_authentication() + elif self._is_header_auth: + self._validate_dates() + return self._parse_header_authentication() + else: + # if this request is neither query auth nor header auth + # s3api regard this as not s3 request + raise NotS3Request() + + def _validate_expire_param(self): + """ + Validate Expires in query parameters + :raises: AccessDenied + """ + # Expires header is a float since epoch + try: + ex = S3Timestamp(float(self.params['Expires'])) + except (KeyError, ValueError): + raise AccessDenied(reason='invalid_expires') + + if S3Timestamp.now() > ex: + raise AccessDenied('Request has expired', reason='expired') + + if ex >= 2 ** 31: + raise AccessDenied( + 'Invalid date (should be seconds since epoch): %s' % + self.params['Expires'], reason='invalid_expires') + + def _validate_dates(self): + """ + Validate Date/X-Amz-Date headers for signature v2 + :raises: AccessDenied + :raises: RequestTimeTooSkewed + """ + date_header = self.headers.get('Date') + amz_date_header = self.headers.get('X-Amz-Date') + if not date_header and not amz_date_header: + raise AccessDenied('AWS authentication requires a valid Date ' + 'or x-amz-date header', + reason='invalid_date') + + # Anyways, request timestamp should be validated + epoch = S3Timestamp.zero() + if self.timestamp < epoch: + raise AccessDenied(reason='invalid_date') + + # If the standard date is too far ahead or behind, it is an + # error + delta = abs(int(self.timestamp) - int(S3Timestamp.now())) + if delta > self.conf.allowable_clock_skew: + raise RequestTimeTooSkewed() + + def _validate_sha256(self): + aws_sha256 = self.headers.get('x-amz-content-sha256') + if not aws_sha256: + if self._is_x_amz_content_sha256_required: + msg = 'Missing required header for this request: ' \ + 'x-amz-content-sha256' + raise InvalidRequest(msg) + else: + return + + looks_like_sha256 = ( + aws_sha256 and len(aws_sha256) == 64 and + all(c in '0123456789abcdef' for c in aws_sha256.lower())) + if aws_sha256 == 'UNSIGNED-PAYLOAD': + pass + elif _is_streaming(aws_sha256): + decoded_content_length = self.headers.get( + 'x-amz-decoded-content-length') + try: + decoded_content_length = int(decoded_content_length) + except (ValueError, TypeError): + raise MissingContentLength + if decoded_content_length < 0: + raise InvalidArgument('x-amz-decoded-content-length', + decoded_content_length) + + if not isinstance(self, SigV4Mixin) or self._is_query_auth: + if decoded_content_length < (self.content_length or 0): + raise IncompleteBody( + number_bytes_expected=decoded_content_length, + number_bytes_provided=self.content_length, + ) + body = self.body_file.read() + raise XAmzContentSHA256Mismatch( + client_computed_content_s_h_a256=aws_sha256, + s3_computed_content_s_h_a256=sha256(body).hexdigest(), + ) + elif aws_sha256 in ( + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD', + 'STREAMING-AWS4-ECDSA-P256-SHA256-PAYLOAD-TRAILER', + ): + raise S3NotImplemented( + "Don't know how to validate %s streams" + % aws_sha256) + + elif not looks_like_sha256 and self._is_x_amz_content_sha256_required: + raise InvalidArgument( + 'x-amz-content-sha256', + aws_sha256, + 'x-amz-content-sha256 must be UNSIGNED-PAYLOAD, ' + 'STREAMING-UNSIGNED-PAYLOAD-TRAILER, ' + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD, ' + 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER or ' + 'a valid sha256 value.') + + return aws_sha256 + + def 
_cleanup_content_encoding(self):
+        if 'aws-chunked' in self.headers.get('Content-Encoding', ''):
+            new_enc = ', '.join(
+                enc for enc in list_from_csv(
+                    self.headers.pop('Content-Encoding'))
+                # TODO: test what's stored w/ 'aws-chunked, aws-chunked'
+                if enc != 'aws-chunked')
+            if new_enc:
+                # used to be, AWS would store '', but not any more
+                self.headers['Content-Encoding'] = new_enc
+
+    def _install_streaming_input_wrapper(self, aws_sha256,
+                                         checksum_trailer=None):
+        """
+        Wrap the wsgi input with a reader that parses an aws-chunked body.
+
+        :param aws_sha256: the value of the 'x-amz-content-sha256' header.
+        :param checksum_trailer: the name of an 'x-amz-checksum-*' trailer
+            (if any) that is to be expected at the end of the body.
+        :return: an instance of StreamingInput.
+        """
+        self._cleanup_content_encoding()
+        self.content_length = int(self.headers.get(
+            'x-amz-decoded-content-length'))
+        expected_trailers = set()
+        if aws_sha256 == 'STREAMING-AWS4-HMAC-SHA256-PAYLOAD-TRAILER':
+            expected_trailers.add('x-amz-trailer-signature')
+        if checksum_trailer:
+            expected_trailers.add(checksum_trailer)
+        streaming_input = StreamingInput(
+            self.environ['wsgi.input'],
+            self.content_length,
+            expected_trailers,
+            None if aws_sha256 == 'STREAMING-UNSIGNED-PAYLOAD-TRAILER'
+            else self.sig_checker)
+        self.environ['wsgi.input'] = streaming_input
+        return streaming_input
+
+    def _install_non_streaming_input_wrapper(self, aws_sha256):
+        if (aws_sha256 not in (None, 'UNSIGNED-PAYLOAD') and
+                self.content_length is not None):
+            self.environ['wsgi.input'] = HashingInput(
+                self.environ['wsgi.input'],
+                self.content_length,
+                aws_sha256)
+        # If no content-length, either client's trying to do a HTTP chunked
+        # transfer, or a HTTP/1.0-style transfer (in which case swift will
+        # reject with length-required and we'll translate back to
+        # MissingContentLength)
+
+    def _validate_x_amz_checksum_headers(self):
+        """
+        Validate and return a header that specifies a checksum value. A valid
+        header must be named x-amz-checksum-<algorithm> where <algorithm> is
+        one of the supported checksum algorithms.
+
+        :raises: InvalidRequest if more than one checksum header is found or if
+            an invalid algorithm is specified.
+        :return: a dict containing at most a single checksum header name:value
+            pair.
+        """
+        checksum_headers = {
+            h.lower(): v
+            for h, v in self.headers.items()
+            if (h.lower().startswith('x-amz-checksum-')
+                and h.lower() not in ('x-amz-checksum-algorithm',
+                                      'x-amz-checksum-type'))
+        }
+        if any(h not in CHECKSUMS_BY_HEADER
+               for h in checksum_headers):
+            raise InvalidRequest('The algorithm type you specified in '
+                                 'x-amz-checksum- header is invalid.')
+        _validate_checksum_header_cardinality(len(checksum_headers))
+        return checksum_headers
+
+    def _validate_x_amz_trailer_header(self):
+        """
+        Validate and return the name of a checksum trailer that is declared by
+        an ``x-amz-trailer`` header. A valid trailer must be named
+        x-amz-checksum-<algorithm> where <algorithm> is one of the supported
+        checksum algorithms.
+
+        :raises: InvalidRequest if more than one checksum trailer is declared
+            by the ``x-amz-trailer`` header, or if an invalid algorithm is
+            specified.
+        :return: a list containing at most a single checksum header name.
+        """
+        header = self.headers.get('x-amz-trailer', '').strip()
+        checksum_headers = [
+            v.strip() for v in header.rstrip(',').split(',')
+        ] if header else []
+        if any(h not in CHECKSUMS_BY_HEADER
+               for h in checksum_headers):
+            raise InvalidRequest('The value specified in the x-amz-trailer '
+                                 'header is not supported')
+        _validate_checksum_header_cardinality(len(checksum_headers))
+        return checksum_headers
+
+    def _validate_checksum_headers(self):
+        """
+        A checksum for the request is specified by a checksum header of the
+        form:
+
+            x-amz-checksum-<algorithm>: <value>
+
+        where <algorithm> is one of the supported checksum algorithms and
+        <value> is the value to be checked. A checksum header may be sent in
+        either the headers or the trailers. An ``x-amz-trailer`` header is used
+        to declare that a checksum header is to be expected in the trailers.
+
+        At most one checksum header is allowed in the headers or trailers. If
+        this condition is met, this method returns the name of the checksum
+        header or trailer and a hasher for the checksum algorithm that it
+        declares.
+
+        :raises InvalidRequest: if any of the following conditions occur: more
+            than one checksum header is declared; the checksum header specifies
+            an invalid algorithm; the algorithm does not match the value of any
+            ``x-amz-sdk-checksum-algorithm`` header that is also present; the
+            checksum value is invalid.
+        :raises S3NotImplemented: if the declared algorithm is valid but not
+            supported.
+        :return: a tuple of
+            (hasher, checksum header name, checksum trailer name) where at
+            least one of (checksum header name, checksum trailer name) will be
+            None.
+        """
+        checksum_headers = self._validate_x_amz_checksum_headers()
+        checksum_trailer_headers = self._validate_x_amz_trailer_header()
+        _validate_checksum_header_cardinality(
+            len(checksum_headers) + len(checksum_trailer_headers),
+            headers_and_trailer=True
+        )
+
+        if checksum_headers:
+            checksum_trailer = None
+            checksum_header, b64digest = list(checksum_headers.items())[0]
+            checksum_hasher = _get_checksum_hasher(checksum_header)
+            try:
+                # early check on the value...
+                _validate_checksum_value(checksum_hasher, b64digest)
+            except ValueError:
+                raise InvalidRequest(
+                    'Value for %s header is invalid.'
% checksum_header) + elif checksum_trailer_headers: + checksum_header = None + checksum_trailer = checksum_trailer_headers[0] + checksum_hasher = _get_checksum_hasher(checksum_trailer) + # checksum should appear at end of request in trailers + else: + checksum_hasher = checksum_header = checksum_trailer = None + + checksum_algo = self.headers.get('x-amz-sdk-checksum-algorithm') + if checksum_algo: + if not checksum_hasher: + raise InvalidRequest( + 'x-amz-sdk-checksum-algorithm specified, but no ' + 'corresponding x-amz-checksum-* or x-amz-trailer ' + 'headers were found.') + if checksum_algo.lower() != checksum_hasher.name: + raise InvalidRequest('Value for x-amz-sdk-checksum-algorithm ' + 'header is invalid.') + + return checksum_hasher, checksum_header, checksum_trailer + + def _install_checksumming_input_wrapper( + self, checksum_hasher, checksum_key, checksum_source): + self.environ['wsgi.input'] = ChecksummingInput( + self.environ['wsgi.input'], + self.content_length, + checksum_hasher, + checksum_key, + checksum_source + ) + + def _validate_headers(self): + if 'CONTENT_LENGTH' in self.environ: + try: + if self.content_length < 0: + raise InvalidArgument('Content-Length', + self.content_length) + except (ValueError, TypeError): + raise InvalidArgument('Content-Length', + self.environ['CONTENT_LENGTH']) + + if self.method == 'PUT' and ( + any(h in self.headers for h in ( + 'If-Match', 'If-Modified-Since', 'If-Unmodified-Since')) + or self.headers.get('If-None-Match', '*') != '*'): + raise S3NotImplemented( + 'Conditional object PUTs are not supported.') + + if 'X-Amz-Copy-Source' in self.headers: + try: + check_path_header(self, 'X-Amz-Copy-Source', 2, '') + except swob.HTTPException: + msg = 'Copy Source must mention the source bucket and key: ' \ + 'sourcebucket/sourcekey' + raise InvalidArgument('x-amz-copy-source', + self.headers['X-Amz-Copy-Source'], + msg) + if 'x-amz-metadata-directive' in self.headers: + value = self.headers['x-amz-metadata-directive'] + if value not in ('COPY', 'REPLACE'): + err_msg = 'Unknown metadata directive.' + raise InvalidArgument('x-amz-metadata-directive', value, + err_msg) + + if 'x-amz-storage-class' in self.headers: + # Only STANDARD is supported now. + if self.headers['x-amz-storage-class'] != 'STANDARD': + raise InvalidStorageClass() + + if 'x-amz-mfa' in self.headers: + raise S3NotImplemented('MFA Delete is not supported.') + + sse_value = self.headers.get('x-amz-server-side-encryption') + if sse_value is not None: + if sse_value not in ('aws:kms', 'AES256'): + raise InvalidArgument( + 'x-amz-server-side-encryption', sse_value, + 'The encryption method specified is not supported') + encryption_enabled = get_swift_info(admin=True)['admin'].get( + 'encryption', {}).get('enabled') + if not encryption_enabled or sse_value != 'AES256': + raise S3NotImplemented( + 'Server-side encryption is not supported.') + + if 'x-amz-website-redirect-location' in self.headers: + raise S3NotImplemented('Website redirection is not supported.') + + self._validate_sha256() + + value = _header_strip(self.headers.get('Content-MD5')) + if value is not None: + if not re.match('^[A-Za-z0-9+/]+={0,2}$', value): + # Non-base64-alphabet characters in value. 
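+                # For illustration: a well-formed value is the base64 of the
+                # raw 16-byte digest, e.g. 'XrY7u+Ae7tCTyyK7j1rNww==' for the
+                # body b'hello world'.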
+ raise InvalidDigest(content_md5=value) + try: + self.headers['ETag'] = binascii.b2a_hex( + binascii.a2b_base64(value)) + except binascii.Error: + # incorrect padding, most likely + raise InvalidDigest(content_md5=value) + + if len(self.headers['ETag']) != 32: + raise InvalidDigest(content_md5=value) + + if 'x-amz-tagging' in self.headers: + raise S3NotImplemented('Object tagging is not supported.') + + @property + def body(self): + """ + swob.Request.body is not secure against malicious input. It consumes + too much memory without any check when the request body is excessively + large. Use xml() instead. + """ + raise AttributeError("No attribute 'body'") + + def xml(self, max_length): + """ + Similar to swob.Request.body, but it checks the content length before + creating a body string. + """ + te = self.headers.get('transfer-encoding', '') + te = [x.strip() for x in te.split(',') if x.strip()] + if te and (len(te) > 1 or te[-1] != 'chunked'): + raise S3NotImplemented('A header you provided implies ' + 'functionality that is not implemented', + header='Transfer-Encoding') + + ml = self.message_length() + if ml and ml > max_length: + raise MalformedXML() + + if te or ml: + # Limit the read similar to how SLO handles manifests + with self.translate_read_errors(): + body = self.body_file.read(max_length) + else: + # No (or zero) Content-Length provided, and not chunked transfer; + # no body. Assume zero-length, and enforce a required body below. + return None + + return body + + def check_md5(self, body): + """ + Check the md5 of the request body against the content-md5 header if the + header is present. + + :raise BadDigest: if the header is present but does not match the + calculated body md5. + :return: True if the header is present, False otherwise. 
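+
+        For illustration, a request whose Content-MD5 header is
+        '1B2M2Y8AsgTpgAmY7PhCfg==' (the base64 MD5 of zero bytes) passes this
+        check only when ``body`` is empty.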
+ """ + content_md5 = self.environ.get('HTTP_CONTENT_MD5') + if not content_md5: + return False + digest = base64_str(md5(body, usedforsecurity=False).digest()) + if content_md5 != digest: + raise BadDigest(expected_digest=content_md5) + return True + + def require_md5(self, body): + allowed_checksum_env_keys = [ + 'HTTP_' + hdr.upper().replace('-', '_') + for hdr in CHECKSUMS_BY_HEADER.keys() + ] + allowed_checksum_env_keys.append('HTTP_CONTENT_MD5') + if not any(k in self.environ for k in allowed_checksum_env_keys): + raise InvalidRequest('Missing required header for this request: ' + 'Content-MD5 OR x-amz-checksum-*') + self.check_md5(body) + + def _copy_source_headers(self): + env = {} + for key, value in self.environ.items(): + if key.startswith('HTTP_X_AMZ_COPY_SOURCE_'): + env[key.replace('X_AMZ_COPY_SOURCE_', '')] = value + + return swob.HeaderEnvironProxy(env) + + def check_copy_source(self, app): + """ + check_copy_source checks the copy source existence and if copying an + object to itself, for illegal request parameters + + :returns: the source HEAD response + """ + try: + src_path = self.headers['X-Amz-Copy-Source'] + except KeyError: + return None + + src_path, qs = src_path.partition('?')[::2] + parsed = parse_qsl(qs, True) + if not parsed: + query = {} + elif len(parsed) == 1 and parsed[0][0] == 'versionId': + query = {'version-id': parsed[0][1]} + else: + raise InvalidArgument('X-Amz-Copy-Source', + self.headers['X-Amz-Copy-Source'], + 'Unsupported copy source parameter.') + + src_path = unquote(src_path) + src_path = src_path if src_path.startswith('/') else ('/' + src_path) + src_bucket, src_obj = split_path(src_path, 0, 2, True) + + headers = swob.HeaderKeyDict() + headers.update(self._copy_source_headers()) + + src_resp = self.get_response(app, 'HEAD', src_bucket, + swob.str_to_wsgi(src_obj), + headers=headers, query=query) + if src_resp.status_int == 304: # pylint: disable-msg=E1101 + raise PreconditionFailed() + + if (self.container_name == src_bucket and + self.object_name == src_obj and + self.headers.get('x-amz-metadata-directive', + 'COPY') == 'COPY' and + not query): + raise InvalidRequest("This copy request is illegal " + "because it is trying to copy an " + "object to itself without " + "changing the object's metadata, " + "storage class, website redirect " + "location or encryption " + "attributes.") + # We've done some normalizing; write back so it's ready for + # to_swift_req + self.headers['X-Amz-Copy-Source'] = quote(src_path) + if query: + self.headers['X-Amz-Copy-Source'] += \ + '?versionId=' + query['version-id'] + return src_resp + + def _canonical_uri(self): + """ + Require bucket name in canonical_uri for v2 in virtual hosted-style. 
+ """ + raw_path_info = self.environ.get('RAW_PATH_INFO', self.path) + if self.bucket_in_host: + raw_path_info = '/' + self.bucket_in_host + raw_path_info + return raw_path_info + + def signature_does_not_match_kwargs(self): + return { + 'a_w_s_access_key_id': self.access_key, + 'string_to_sign': self.sig_checker.string_to_sign, + 'signature_provided': self.signature, + 'string_to_sign_bytes': ' '.join( + format(b, '02x') for b in self.sig_checker.string_to_sign), + } + + @property + def controller_name(self): + return self.controller.__name__[:-len('Controller')] + + @property + def controller(self): + if self.is_service_request: + return ServiceController + + if not self.conf.allow_multipart_uploads: + multi_part = ['partNumber', 'uploadId', 'uploads'] + if len([p for p in multi_part if p in self.params]): + raise S3NotImplemented("Multi-part feature isn't support") + + if 'acl' in self.params: + return AclController + if 'delete' in self.params: + return MultiObjectDeleteController + if 'location' in self.params: + return LocationController + if 'logging' in self.params: + return LoggingStatusController + if 'partNumber' in self.params: + if self.method == 'PUT': + return PartController + else: + return ObjectController + if 'uploadId' in self.params: + return UploadController + if 'uploads' in self.params: + return UploadsController + if 'versioning' in self.params: + return VersioningController + if 'tagging' in self.params: + return TaggingController + if 'object-lock' in self.params: + return ObjectLockController + + unsupported = ('notification', 'policy', 'requestPayment', 'torrent', + 'website', 'cors', 'restore') + if set(unsupported) & set(self.params): + return UnsupportedController + + if self.is_object_request: + return ObjectController + return BucketController + + @property + def is_service_request(self): + return not self.container_name + + @property + def is_bucket_request(self): + return self.container_name and not self.object_name + + @property + def is_object_request(self): + return self.container_name and self.object_name + + @property + def is_authenticated(self): + return self.account is not None + + def to_swift_req(self, method, container, obj, query=None, + body=None, headers=None): + """ + Create a Swift request based on this request's environment. + """ + if self.account is None: + account = swob.str_to_wsgi(self.access_key) + else: + account = self.account + + env = self.environ.copy() + env['swift.infocache'] = self.environ.setdefault('swift.infocache', {}) + + def sanitize(value): + if set(value).issubset(string.printable): + return value + + value = Header(value, 'UTF-8').encode() + if value.startswith('=?utf-8?q?'): + return '=?UTF-8?Q?' + value[10:] + elif value.startswith('=?utf-8?b?'): + return '=?UTF-8?B?' + value[10:] + else: + return value + + if 'headers_raw' in env: # eventlet >= 0.19.0 + # See https://github.com/eventlet/eventlet/commit/67ec999 + for key, value in env['headers_raw']: + if not key.lower().startswith('x-amz-meta-'): + continue + # AWS ignores user-defined headers with these characters + if any(c in key for c in ' "),/;<=>?@[\\]{}'): + # NB: apparently, '(' *is* allowed + continue + # Note that this may have already been deleted, e.g. if the + # client sent multiple headers with the same name, or both + # x-amz-meta-foo-bar and x-amz-meta-foo_bar + env.pop('HTTP_' + key.replace('-', '_').upper(), None) + # Need to preserve underscores. Since we know '=' can't be + # present, quoted-printable seems appropriate. 
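+                    # For example, an incoming 'x-amz-meta-foo_bar' header
+                    # becomes the WSGI key 'HTTP_X_OBJECT_META_FOO=5FBAR'
+                    # here, and is mapped back to 'x-amz-meta-foo_bar' by
+                    # translate_swift_to_s3() on the way out.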
+ key = key.replace('_', '=5F').replace('-', '_').upper() + key = 'HTTP_X_OBJECT_META_' + key[11:] + if key in env: + env[key] += ',' + sanitize(value) + else: + env[key] = sanitize(value) + else: # mostly-functional fallback + for key in self.environ: + if not key.startswith('HTTP_X_AMZ_META_'): + continue + # AWS ignores user-defined headers with these characters + if any(c in key for c in ' "),/;<=>?@[\\]{}'): + # NB: apparently, '(' *is* allowed + continue + env['HTTP_X_OBJECT_META_' + key[16:]] = sanitize(env[key]) + del env[key] + + copy_from_version_id = '' + if 'HTTP_X_AMZ_COPY_SOURCE' in env and env['REQUEST_METHOD'] == 'PUT': + env['HTTP_X_COPY_FROM'], copy_from_version_id = env[ + 'HTTP_X_AMZ_COPY_SOURCE'].partition('?versionId=')[::2] + del env['HTTP_X_AMZ_COPY_SOURCE'] + env['CONTENT_LENGTH'] = '0' + if env.pop('HTTP_X_AMZ_METADATA_DIRECTIVE', None) == 'REPLACE': + env['HTTP_X_FRESH_METADATA'] = 'True' + else: + copy_exclude_headers = ('HTTP_CONTENT_DISPOSITION', + 'HTTP_CONTENT_ENCODING', + 'HTTP_CONTENT_LANGUAGE', + 'CONTENT_TYPE', + 'HTTP_EXPIRES', + 'HTTP_CACHE_CONTROL', + 'HTTP_X_ROBOTS_TAG') + for key in copy_exclude_headers: + env.pop(key, None) + for key in list(env.keys()): + if key.startswith('HTTP_X_OBJECT_META_'): + del env[key] + + if self.conf.force_swift_request_proxy_log: + env['swift.proxy_access_log_made'] = False + env['swift.source'] = 'S3' + if method is not None: + env['REQUEST_METHOD'] = method + + if obj: + path = '/v1/%s/%s/%s' % (account, container, obj) + elif container: + path = '/v1/%s/%s' % (account, container) + else: + path = '/v1/%s' % (account) + env['PATH_INFO'] = path + + params = [] + if query is not None: + for key, value in sorted(query.items()): + if value is not None: + params.append('%s=%s' % (key, quote(str(value)))) + else: + params.append(key) + if copy_from_version_id and not (query and query.get('version-id')): + params.append('version-id=' + copy_from_version_id) + env['QUERY_STRING'] = '&'.join(params) + + return swob.Request.blank(quote(path), environ=env, body=body, + headers=headers) + + def _swift_success_codes(self, method, container, obj): + """ + Returns a list of expected success codes from Swift. + """ + if not container: + # Swift account access. + code_map = { + 'GET': [ + HTTP_OK, + ], + } + elif not obj: + # Swift container access. + code_map = { + 'HEAD': [ + HTTP_NO_CONTENT, + ], + 'GET': [ + HTTP_OK, + HTTP_NO_CONTENT, + ], + 'PUT': [ + HTTP_CREATED, + ], + 'POST': [ + HTTP_NO_CONTENT, + ], + 'DELETE': [ + HTTP_NO_CONTENT, + ], + } + else: + # Swift object access. 
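+            # e.g. an object PUT is only considered successful on
+            # 201 Created or 202 Accepted (SLO heartbeating); any other
+            # status falls through to the error handling in _get_response().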
+ code_map = { + 'HEAD': [ + HTTP_OK, + HTTP_PARTIAL_CONTENT, + HTTP_NOT_MODIFIED, + ], + 'GET': [ + HTTP_OK, + HTTP_PARTIAL_CONTENT, + HTTP_NOT_MODIFIED, + ], + 'PUT': [ + HTTP_CREATED, + HTTP_ACCEPTED, # For SLO with heartbeating + ], + 'POST': [ + HTTP_ACCEPTED, + ], + 'DELETE': [ + HTTP_OK, + HTTP_NO_CONTENT, + ], + } + + return code_map[method] + + def _bucket_put_accepted_error(self, container, app): + sw_req = self.to_swift_req('HEAD', container, None) + info = get_container_info(sw_req.environ, app, swift_source='S3') + sysmeta = info.get('sysmeta', {}) + try: + acl = json.loads(sysmeta.get('s3api-acl', + sysmeta.get('swift3-acl', '{}'))) + owner = acl.get('Owner') + except (ValueError, TypeError, KeyError): + owner = None + if owner is None or owner == self.user_id: + raise BucketAlreadyOwnedByYou(container) + raise BucketAlreadyExists(container) + + def _swift_error_codes(self, method, container, obj, env, app): + """ + Returns a dict from expected Swift error codes to the corresponding S3 + error responses. + """ + if not container: + # Swift account access. + code_map = { + 'GET': { + }, + } + elif not obj: + # Swift container access. + code_map = { + 'HEAD': { + HTTP_NOT_FOUND: (NoSuchBucket, container), + }, + 'GET': { + HTTP_NOT_FOUND: (NoSuchBucket, container), + }, + 'PUT': { + HTTP_ACCEPTED: (self._bucket_put_accepted_error, container, + app), + }, + 'POST': { + HTTP_NOT_FOUND: (NoSuchBucket, container), + }, + 'DELETE': { + HTTP_NOT_FOUND: (NoSuchBucket, container), + HTTP_CONFLICT: BucketNotEmpty, + }, + } + else: + # Swift object access. + + # 404s differ depending upon whether the bucket exists + # Note that base-container-existence checks happen elsewhere for + # multi-part uploads, and get_container_info should be pulling + # from the env cache + def not_found_handler(): + if container.endswith(MULTIUPLOAD_SUFFIX) or \ + is_success(get_container_info( + env, app, swift_source='S3').get('status')): + return NoSuchKey(obj) + return NoSuchBucket(container) + + # Since BadDigest ought to plumb in some client-provided values, + # defer evaluation until we know they're provided + def bad_digest_handler(): + etag = binascii.hexlify(base64.b64decode( + env['HTTP_CONTENT_MD5'])) + return BadDigest( + expected_digest=etag, # yes, really hex + # TODO: plumb in calculated_digest, as b64 + ) + + code_map = { + 'HEAD': { + HTTP_NOT_FOUND: not_found_handler, + HTTP_PRECONDITION_FAILED: PreconditionFailed, + }, + 'GET': { + HTTP_NOT_FOUND: not_found_handler, + HTTP_PRECONDITION_FAILED: PreconditionFailed, + }, + 'PUT': { + HTTP_NOT_FOUND: (NoSuchBucket, container), + HTTP_UNPROCESSABLE_ENTITY: bad_digest_handler, + HTTP_REQUEST_ENTITY_TOO_LARGE: EntityTooLarge, + HTTP_LENGTH_REQUIRED: MissingContentLength, + HTTP_REQUEST_TIMEOUT: RequestTimeout, + HTTP_PRECONDITION_FAILED: PreconditionFailed, + HTTP_CLIENT_CLOSED_REQUEST: RequestTimeout, + }, + 'POST': { + HTTP_NOT_FOUND: not_found_handler, + HTTP_PRECONDITION_FAILED: PreconditionFailed, + }, + 'DELETE': { + HTTP_NOT_FOUND: (NoSuchKey, obj), + }, + } + + return code_map[method] + + @contextlib.contextmanager + def translate_read_errors(self): + try: + yield + except S3InputIncomplete: + raise IncompleteBody('The request body terminated unexpectedly') + except S3InputSHA256Mismatch as err: + # hopefully by now any modifications to the path (e.g. 
tenant to + # account translation) will have been made by auth middleware + raise XAmzContentSHA256Mismatch( + client_computed_content_s_h_a256=err.expected, + s3_computed_content_s_h_a256=err.computed, + ) + except S3InputChecksumMismatch as e: + raise BadDigest( + 'The %s you specified did not ' + 'match the calculated checksum.' % e.args[0]) + except S3InputChecksumTrailerInvalid as e: + raise InvalidRequest( + 'Value for %s trailing header is invalid.' % e.trailer) + except S3InputChunkSignatureMismatch: + raise SignatureDoesNotMatch( + **self.signature_does_not_match_kwargs()) + except S3InputSizeError as e: + raise IncompleteBody( + number_bytes_expected=e.expected, + number_bytes_provided=e.provided, + ) + except S3InputChunkTooSmall as e: + raise InvalidChunkSizeError( + chunk=e.chunk_number, + bad_chunk_size=e.bad_chunk_size, + ) + except S3InputMalformedTrailer: + raise MalformedTrailerError + except S3InputMissingSecret: + # XXX: We should really log something here. The poor user can't do + # anything about this; we need to notify the operator to notify the + # auth middleware developer + raise S3NotImplemented('Transferring payloads in multiple chunks ' + 'using aws-chunked is not supported.') + except S3InputError: + # All cases should be covered above, but belt & braces + # NB: general exception handler in s3api.py will log traceback + raise InternalError + + def _get_response(self, app, method, container, obj, + headers=None, body=None, query=None): + """ + Calls the application with this request's environment. Returns a + S3Response object that wraps up the application's result. + """ + + method = method or self.environ['REQUEST_METHOD'] + + if container is None: + container = self.container_name + if obj is None: + obj = self.object_name + + sw_req = self.to_swift_req(method, container, obj, headers=headers, + body=body, query=query) + + try: + with self.translate_read_errors(): + sw_resp = sw_req.get_response(app) + finally: + # reuse account + _, self.account, _ = split_path(sw_req.environ['PATH_INFO'], + 2, 3, True) + self.environ['s3api.backend_path'] = sw_req.environ['PATH_INFO'] + + # keep a record of the backend policy index so that the s3api can add + # it to the headers of whatever response it returns, which may not + # necessarily be this resp. 
+ self.policy_index = get_policy_index(sw_req.headers, sw_resp.headers) + resp = S3Response.from_swift_resp(sw_resp) + status = resp.status_int # pylint: disable-msg=E1101 + + if not self.user_id: + if 'HTTP_X_USER_NAME' in sw_resp.environ: + # keystone + self.user_id = "%s:%s" % ( + sw_resp.environ['HTTP_X_TENANT_NAME'], + sw_resp.environ['HTTP_X_USER_NAME']) + else: + # tempauth + self.user_id = self.access_key + + success_codes = self._swift_success_codes(method, container, obj) + error_codes = self._swift_error_codes(method, container, obj, + sw_req.environ, app) + + if status in success_codes: + return resp + + err_msg = resp.body + + if status in error_codes: + err_resp = \ + error_codes[sw_resp.status_int] # pylint: disable-msg=E1101 + if isinstance(err_resp, tuple): + raise err_resp[0](*err_resp[1:]) + elif b'quota' in err_msg: + raise err_resp(err_msg) + else: + raise err_resp() + + if status == HTTP_BAD_REQUEST: + err_str = err_msg.decode('utf8') + if 'X-Delete-At' in err_str: + raise InvalidArgument('X-Delete-At', + self.headers['X-Delete-At'], + err_str) + if 'X-Delete-After' in err_str: + raise InvalidArgument('X-Delete-After', + self.headers['X-Delete-After'], + err_str) + else: + raise InvalidRequest(msg=err_str) + if status == HTTP_UNAUTHORIZED: + raise SignatureDoesNotMatch( + **self.signature_does_not_match_kwargs()) + if status == HTTP_FORBIDDEN: + raise AccessDenied(reason='forbidden') + if status == HTTP_REQUESTED_RANGE_NOT_SATISFIABLE: + self.validate_part_number( + parts_count=resp.headers.get('x-amz-mp-parts-count')) + raise InvalidRange() + if status == HTTP_SERVICE_UNAVAILABLE: + raise ServiceUnavailable() + if status in (HTTP_RATE_LIMITED, HTTP_TOO_MANY_REQUESTS): + if self.conf.ratelimit_as_client_error: + raise SlowDown(status='429 Slow Down') + raise SlowDown() + if resp.status_int == HTTP_CONFLICT: + if self.method == 'GET': + raise BrokenMPU() + else: + raise ServiceUnavailable() + + raise InternalError('unexpected status code %d' % status) + + def get_response(self, app, method=None, container=None, obj=None, + headers=None, body=None, query=None): + """ + get_response is an entry point to be extended for child classes. + If additional tasks needed at that time of getting swift response, + we can override this method. + swift.common.middleware.s3api.s3request.S3Request need to just call + _get_response to get pure swift response. + """ + + if 'HTTP_X_AMZ_ACL' in self.environ: + handle_acl_header(self) + + return self._get_response(app, method, container, obj, + headers, body, query) + + def get_validated_param(self, param, default, limit=MAX_32BIT_INT): + value = default + if param in self.params: + try: + value = int(self.params[param]) + if value < 0: + err_msg = 'Argument %s must be an integer between 0 and' \ + ' %d' % (param, MAX_32BIT_INT) + raise InvalidArgument(param, self.params[param], err_msg) + + if value > MAX_32BIT_INT: + # check the value because int() could build either a long + # instance or a 64bit integer. + raise ValueError() + + if limit < value: + value = limit + + except ValueError: + err_msg = 'Provided %s not an integer or within ' \ + 'integer range' % param + raise InvalidArgument(param, self.params[param], err_msg) + + return value + + def get_container_info(self, app): + """ + get_container_info will return a result dict of get_container_info + from the backend Swift. 
+ + :returns: a dictionary of container info from + swift.controllers.base.get_container_info + :raises: NoSuchBucket when the container doesn't exist + :raises: InternalError when the request failed without 404 + """ + if not self.is_authenticated: + sw_req = self.to_swift_req('TEST', None, None, body='') + # don't show log message of this request + sw_req.environ['swift.proxy_access_log_made'] = True + + sw_resp = sw_req.get_response(app) + + if not sw_req.remote_user: + raise SignatureDoesNotMatch( + **self.signature_does_not_match_kwargs()) + + _, self.account, _ = split_path(sw_resp.environ['PATH_INFO'], + 2, 3, True) + sw_req = self.to_swift_req('TEST', self.container_name, None) + info = get_container_info(sw_req.environ, app, swift_source='S3') + if is_success(info['status']): + return info + elif info['status'] == HTTP_NOT_FOUND: + raise NoSuchBucket(self.container_name) + elif info['status'] == HTTP_SERVICE_UNAVAILABLE: + raise ServiceUnavailable() + else: + raise InternalError( + 'unexpected status code %d' % info['status']) + + def gen_multipart_manifest_delete_query(self, app, obj=None, version=None): + if not self.conf.allow_multipart_uploads: + return {} + if not obj: + obj = self.object_name + query = {'symlink': 'get'} + if version is not None: + query['version-id'] = version + resp = self.get_response(app, 'HEAD', obj=obj, query=query) + if not resp.is_slo: + return {} + elif resp.sysmeta_headers.get(sysmeta_header('object', 'etag')): + # Even if allow_async_delete is turned off, SLO will just handle + # the delete synchronously, so we don't need to check before + # setting async=on + return {'multipart-manifest': 'delete', 'async': 'on'} + else: + return {'multipart-manifest': 'delete'} + + def set_acl_handler(self, handler): + pass + + +class S3AclRequest(S3Request): + """ + S3Acl request object. + """ + + def __init__(self, env, app=None, conf=None): + super(S3AclRequest, self).__init__(env, app, conf) + self.authenticate(app) + self.acl_handler = None + + @property + def controller(self): + if 'acl' in self.params and not self.is_service_request: + return S3AclController + return super(S3AclRequest, self).controller + + def authenticate(self, app): + """ + authenticate method will run pre-authenticate request and retrieve + account information. + Note that it currently supports only keystone and tempauth. 
+ (no support for the third party authentication middleware) + """ + sw_req = self.to_swift_req('TEST', None, None, body='') + # don't show log message of this request + sw_req.environ['swift.proxy_access_log_made'] = True + + sw_resp = sw_req.get_response(app) + + if not sw_req.remote_user: + raise SignatureDoesNotMatch( + **self.signature_does_not_match_kwargs()) + + _, self.account, _ = split_path(sw_resp.environ['PATH_INFO'], + 2, 3, True) + + if 'HTTP_X_USER_NAME' in sw_resp.environ: + # keystone + self.user_id = "%s:%s" % (sw_resp.environ['HTTP_X_TENANT_NAME'], + sw_resp.environ['HTTP_X_USER_NAME']) + else: + # tempauth + self.user_id = self.access_key + + sw_req.environ.get('swift.authorize', lambda req: None)(sw_req) + self.environ['swift_owner'] = sw_req.environ.get('swift_owner', False) + if 'REMOTE_USER' in sw_req.environ: + self.environ['REMOTE_USER'] = sw_req.environ['REMOTE_USER'] + + # Need to skip S3 authorization on subsequent requests to prevent + # overwriting the account in PATH_INFO + del self.environ['s3api.auth_details'] + + def to_swift_req(self, method, container, obj, query=None, + body=None, headers=None): + sw_req = super(S3AclRequest, self).to_swift_req( + method, container, obj, query, body, headers) + if self.account: + sw_req.environ['swift_owner'] = True # needed to set ACL + sw_req.environ['swift.authorize_override'] = True + sw_req.environ['swift.authorize'] = lambda req: None + return sw_req + + def get_acl_response(self, app, method=None, container=None, obj=None, + headers=None, body=None, query=None): + """ + Wrapper method of _get_response to add s3 acl information + from response sysmeta headers. + """ + + resp = self._get_response( + app, method, container, obj, headers, body, query) + resp.bucket_acl = decode_acl( + 'container', resp.sysmeta_headers, self.conf.allow_no_owner) + resp.object_acl = decode_acl( + 'object', resp.sysmeta_headers, self.conf.allow_no_owner) + + return resp + + def get_response(self, app, method=None, container=None, obj=None, + headers=None, body=None, query=None): + """ + Wrap up get_response call to hook with acl handling method. + """ + if not self.acl_handler: + # we should set acl_handler all time before calling get_response + raise Exception('get_response called before set_acl_handler') + resp = self.acl_handler.handle_acl( + app, method, container, obj, headers) + + # possible to skip recalling get_response_acl if resp is not + # None (e.g. HEAD) + if resp: + return resp + return self.get_acl_response(app, method, container, obj, + headers, body, query) + + def set_acl_handler(self, acl_handler): + self.acl_handler = acl_handler + + +class SigV4Request(SigV4Mixin, S3Request): + pass + + +class SigV4S3AclRequest(SigV4Mixin, S3AclRequest): + pass diff --git a/swift/common/middleware/s3api/s3response.py b/swift/common/middleware/s3api/s3response.py new file mode 100644 index 0000000000..7a84411d28 --- /dev/null +++ b/swift/common/middleware/s3api/s3response.py @@ -0,0 +1,818 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from collections.abc import MutableMapping +from functools import partial + +from swift.common import header_key_dict +from swift.common import swob +from swift.common.utils import config_true_value +from swift.common.request_helpers import is_sys_meta + +from swift.common.middleware.s3api.utils import snake_to_camel, \ + sysmeta_prefix, sysmeta_header +from swift.common.middleware.s3api.etree import Element, SubElement, tostring +from swift.common.middleware.versioned_writes.object_versioning import \ + DELETE_MARKER_CONTENT_TYPE + + +class HeaderKeyDict(header_key_dict.HeaderKeyDict): + """ + Similar to the Swift's normal HeaderKeyDict class, but its key name is + normalized as S3 clients expect. + """ + @staticmethod + def _title(s): + s = header_key_dict.HeaderKeyDict._title(s) + if s.lower() == 'etag': + # AWS Java SDK expects only 'ETag'. + return 'ETag' + if s.lower().startswith('x-amz-'): + # AWS headers returned by S3 are lowercase. + return swob.bytes_to_wsgi(swob.wsgi_to_bytes(s).lower()) + return s + + +def translate_swift_to_s3(key, val): + _key = swob.bytes_to_wsgi(swob.wsgi_to_bytes(key).lower()) + + def translate_meta_key(_key): + if not _key.startswith('x-object-meta-'): + return _key + # Note that AWS allows user-defined metadata with underscores in the + # header, while WSGI (and other protocols derived from CGI) does not + # differentiate between an underscore and a dash. Fortunately, + # eventlet exposes the raw headers from the client, so we could + # translate '_' to '=5F' on the way in. Now, we translate back. + return 'x-amz-meta-' + _key[14:].replace('=5f', '_') + + if _key.startswith('x-object-meta-'): + return translate_meta_key(_key), val + elif _key in ('accept-ranges', 'content-length', 'content-type', + 'content-range', 'content-encoding', + 'content-disposition', 'content-language', + 'etag', 'last-modified', 'x-robots-tag', + 'cache-control', 'expires'): + return key, val + elif _key == 'x-object-version-id': + return 'x-amz-version-id', val + elif _key == 'x-parts-count': + return 'x-amz-mp-parts-count', val + elif _key == 'x-copied-from-version-id': + return 'x-amz-copy-source-version-id', val + elif _key == 'x-backend-content-type' and \ + val == DELETE_MARKER_CONTENT_TYPE: + return 'x-amz-delete-marker', 'true' + elif _key == 'access-control-expose-headers': + exposed_headers = val.split(', ') + exposed_headers.extend([ + 'x-amz-request-id', + 'x-amz-id-2', + ]) + return 'access-control-expose-headers', ', '.join( + translate_meta_key(h) for h in exposed_headers) + elif _key == 'access-control-allow-methods': + methods = val.split(', ') + try: + methods.remove('COPY') # that's not a thing in S3 + except ValueError: + pass # not there? don't worry about it + return key, ', '.join(methods) + elif _key.startswith('access-control-'): + return key, val + # else, drop the header + return None + + +class S3ResponseBase(object): + """ + Base class for swift3 responses. + """ + pass + + +class S3Response(S3ResponseBase, swob.Response): + """ + Similar to the Response class in Swift, but uses our HeaderKeyDict for + headers instead of Swift's HeaderKeyDict. This also translates Swift + specific headers to S3 headers. 
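+    For example, 'x-object-version-id' is returned as 'x-amz-version-id',
+    'x-parts-count' as 'x-amz-mp-parts-count', and 'x-object-meta-*' as
+    'x-amz-meta-*' (see translate_swift_to_s3 above).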
+ """ + + def __init__(self, *args, **kwargs): + swob.Response.__init__(self, *args, **kwargs) + + s3_sysmeta_headers = swob.HeaderKeyDict() + sw_headers = swob.HeaderKeyDict() + headers = HeaderKeyDict() + self.is_slo = False + + def is_swift3_sysmeta(sysmeta_key, server_type): + swift3_sysmeta_prefix = ( + 'x-%s-sysmeta-swift3' % server_type).lower() + return sysmeta_key.lower().startswith(swift3_sysmeta_prefix) + + def is_s3api_sysmeta(sysmeta_key, server_type): + s3api_sysmeta_prefix = sysmeta_prefix(_server_type).lower() + return sysmeta_key.lower().startswith(s3api_sysmeta_prefix) + + for key, val in self.headers.items(): + if is_sys_meta('object', key) or is_sys_meta('container', key): + _server_type = key.split('-')[1] + if is_swift3_sysmeta(key, _server_type): + # To be compatible with older swift3, translate swift3 + # sysmeta to s3api sysmeta here + key = sysmeta_prefix(_server_type) + \ + key[len('x-%s-sysmeta-swift3-' % _server_type):] + + if key not in s3_sysmeta_headers: + # To avoid overwrite s3api sysmeta by older swift3 + # sysmeta set the key only when the key does not exist + s3_sysmeta_headers[key] = val + elif is_s3api_sysmeta(key, _server_type): + s3_sysmeta_headers[key] = val + else: + sw_headers[key] = val + else: + sw_headers[key] = val + + # Handle swift headers + for key, val in sw_headers.items(): + s3_pair = translate_swift_to_s3(key, val) + if s3_pair is None: + continue + headers[s3_pair[0]] = s3_pair[1] + + self.is_slo = config_true_value(sw_headers.get( + 'x-static-large-object')) + + # Check whether we stored the AWS-style etag on upload + override_etag = s3_sysmeta_headers.get( + sysmeta_header('object', 'etag')) + if override_etag not in (None, ''): + # Multipart uploads in AWS have ETags like + # - + headers['etag'] = override_etag + elif self.is_slo and 'etag' in headers: + # Many AWS clients use the presence of a '-' to decide whether + # to attempt client-side download validation, so even if we + # didn't store the AWS-style header, tack on a '-N'. (Use 'N' + # because we don't actually know how many parts there are.) + headers['etag'] += '-N' + + self.headers = headers + + if self.etag: + # add double quotes to the etag header + self.etag = self.etag + + # Used for pure swift header handling at the request layer + self.sw_headers = sw_headers + self.sysmeta_headers = s3_sysmeta_headers + + @classmethod + def from_swift_resp(cls, sw_resp): + """ + Create a new S3 response object based on the given Swift response. + """ + if sw_resp.app_iter: + body = None + app_iter = sw_resp.app_iter + else: + body = sw_resp.body + app_iter = None + + resp = cls(status=sw_resp.status, headers=sw_resp.headers, + request=sw_resp.request, body=body, app_iter=app_iter, + conditional_response=sw_resp.conditional_response) + resp.environ.update(sw_resp.environ) + + return resp + + def append_copy_resp_body(self, controller_name, last_modified): + elem = Element('Copy%sResult' % controller_name) + SubElement(elem, 'LastModified').text = last_modified + SubElement(elem, 'ETag').text = '"%s"' % self.etag + self.headers['Content-Type'] = 'application/xml' + self.body = tostring(elem) + self.etag = None + + +HTTPOk = partial(S3Response, status=200) +HTTPCreated = partial(S3Response, status=201) +HTTPAccepted = partial(S3Response, status=202) +HTTPNoContent = partial(S3Response, status=204) +HTTPPartialContent = partial(S3Response, status=206) + + +class ErrorResponse(S3ResponseBase, swob.HTTPException): + """ + S3 error object. 
+ + Reference information about S3 errors is available at: + http://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html + """ + _status = '' + _msg = '' + _code = '' + xml_declaration = True + + def __init__(self, msg=None, reason=None, *args, **kwargs): + if msg: + self._msg = msg + if not self._code: + self._code = self.__class__.__name__ + self.reason = reason + + self.info = kwargs.copy() + for reserved_key in ('headers', 'body'): + if self.info.get(reserved_key): + del (self.info[reserved_key]) + + swob.HTTPException.__init__( + self, status=kwargs.pop('status', self._status), + # we use an app_iter, so that we can add our trans_id to the resp + # xml *after* we've been called - technically any non-None app_iter + # would do, we override swob.Response._response_iter anyway. + app_iter=self._body_iter(), + content_type='application/xml', *args, + **kwargs) + self.headers = HeaderKeyDict(self.headers) + + @property + def summary(self): + """Provide a summary of the error code and reason.""" + if self.reason: + summary = '.'.join([self._code, self.reason]) + else: + summary = self._code + return summary.replace(' ', '_') + + @property + def metric_name(self): + return '.'.join([str(self.status_int), self.summary]) + + def _body_iter(self): + error_elem = Element('Error') + SubElement(error_elem, 'Code').text = self._code + SubElement(error_elem, 'Message').text = self._msg + # N.B. swob.Response objects don't normally have an environ attribute + # when they're created, but swob always gives this to us when we're + # __call__'d + if 'swift.trans_id' in self.environ: + request_id = self.environ['swift.trans_id'] + SubElement(error_elem, 'RequestId').text = request_id + + self._dict_to_etree(error_elem, self.info) + + yield tostring(error_elem, use_s3ns=False, + xml_declaration=self.xml_declaration) + + def _response_iter(self, app_iter, body): + # we don't actually want our _response_iter to be a generator, a list + # of strings is much better for eventlet.wsgi.server connection + # handling and request pipelining and ErrorResponses are small. FWIW + # we now have self.environ, app_iter=self._body_iter() and body is None + return super()._response_iter(list(app_iter), body) + + def _dict_to_etree(self, parent, d): + for key, value in d.items(): + tag = re.sub(r'\W', '', snake_to_camel(key)) + elem = SubElement(parent, tag) + + if isinstance(value, (dict, MutableMapping)): + self._dict_to_etree(elem, value) + else: + if isinstance(value, (int, float, bool)): + value = str(value) + try: + elem.text = value + except ValueError: + # We set an invalid string for XML. + elem.text = '(invalid string)' + + +class AccessDenied(ErrorResponse): + _status = '403 Forbidden' + _msg = 'Access Denied.' + + +class AccountProblem(ErrorResponse): + _status = '403 Forbidden' + _msg = 'There is a problem with your AWS account that prevents the ' \ + 'operation from completing successfully.' + + +class AmbiguousGrantByEmailAddress(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The e-mail address you provided is associated with more than ' \ + 'one account.' + + +class AuthorizationHeaderMalformed(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The authorization header is malformed; the authorization ' \ + 'header requires three components: Credential, SignedHeaders, ' \ + 'and Signature.' 
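# A minimal usage sketch of the summary/metric_name helpers defined above,
# assuming an interactive session with this module importable; the expected
# values are derived by hand from the code, not captured output:
#
#   >>> err = AccessDenied(reason='invalid date')
#   >>> err.summary
#   'AccessDenied.invalid_date'
#   >>> err.metric_name
#   '403.AccessDenied.invalid_date'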
+ + +class AuthorizationQueryParametersError(ErrorResponse): + _status = '400 Bad Request' + + +class BadDigest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The Content-MD5 you specified did not match what we received.' + + +class XAmzContentSHA256Mismatch(ErrorResponse): + _status = '400 Bad Request' + _msg = "The provided 'x-amz-content-sha256' header does not match what " \ + "was computed." + + +class BucketAlreadyExists(ErrorResponse): + _status = '409 Conflict' + _msg = 'The requested bucket name is not available. The bucket ' \ + 'namespace is shared by all users of the system. Please select a ' \ + 'different name and try again.' + + def __init__(self, bucket, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, bucket_name=bucket, *args, **kwargs) + + +class BucketAlreadyOwnedByYou(ErrorResponse): + _status = '409 Conflict' + _msg = 'Your previous request to create the named bucket succeeded and ' \ + 'you already own it.' + + def __init__(self, bucket, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, bucket_name=bucket, *args, **kwargs) + + +class BucketNotEmpty(ErrorResponse): + _status = '409 Conflict' + _msg = 'The bucket you tried to delete is not empty' + + +class VersionedBucketNotEmpty(BucketNotEmpty): + _msg = 'The bucket you tried to delete is not empty. ' \ + 'You must delete all versions in the bucket.' + _code = 'BucketNotEmpty' + + +class CredentialsNotSupported(ErrorResponse): + _status = '400 Bad Request' + _msg = 'This request does not support credentials.' + + +class CrossLocationLoggingProhibited(ErrorResponse): + _status = '403 Forbidden' + _msg = 'Cross location logging not allowed. Buckets in one geographic ' \ + 'location cannot log information to a bucket in another location.' + + +class EntityTooSmall(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your proposed upload is smaller than the minimum allowed object ' \ + 'size.' + + +class EntityTooLarge(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your proposed upload exceeds the maximum allowed object size.' + + +class ExpiredToken(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The provided token has expired.' + + +class IllegalVersioningConfigurationException(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The Versioning configuration specified in the request is invalid.' + + +class IncompleteBody(ErrorResponse): + _status = '400 Bad Request' + _msg = 'You did not provide the number of bytes specified by the ' \ + 'Content-Length HTTP header' + + +class IncorrectNumberOfFilesInPostRequest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'POST requires exactly one file upload per request.' + + +class InlineDataTooLarge(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Inline data exceeds the maximum allowed size.' + + +class InternalError(ErrorResponse): + _status = '500 Internal Server Error' + _msg = 'We encountered an internal error. Please try again.' + + def __str__(self): + return '%s: %s (%s)' % ( + self.__class__.__name__, self.status, self._msg) + + +class InvalidAccessKeyId(ErrorResponse): + _status = '403 Forbidden' + _msg = 'The AWS Access Key Id you provided does not exist in our records.' + + +class InvalidArgument(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Invalid Argument.' 
+ + def __init__(self, name, value, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, argument_name=name, + argument_value=value, *args, **kwargs) + + +class InvalidBucketName(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The specified bucket is not valid.' + + def __init__(self, bucket, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, bucket_name=bucket, *args, **kwargs) + + +class InvalidBucketState(ErrorResponse): + _status = '409 Conflict' + _msg = 'The request is not valid with the current state of the bucket.' + + +class InvalidChunkSizeError(ErrorResponse): + _status = '403 Forbidden' + _msg = 'Only the last chunk is allowed to have a size less than 8192 bytes' + + +class InvalidDigest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The Content-MD5 you specified was invalid.' + + +class InvalidLocationConstraint(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The specified location constraint is not valid.' + + +class InvalidObjectState(ErrorResponse): + _status = '403 Forbidden' + _msg = 'The operation is not valid for the current state of the object.' + + +class InvalidPartArgument(InvalidArgument): + _code = 'InvalidArgument' + + def __init__(self, max_parts, value): + err_msg = ('Part number must be an integer between ' + '1 and %s, inclusive' % max_parts) + super(InvalidArgument, self).__init__(err_msg, + argument_name='partNumber', + argument_value=value) + + +class InvalidPart(ErrorResponse): + _status = '400 Bad Request' + _msg = 'One or more of the specified parts could not be found. The ' \ + 'part may not have been uploaded, or the specified entity tag ' \ + 'may not match the part\'s entity tag.' + + +class InvalidPartOrder(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The list of parts was not in ascending order.Parts list must ' \ + 'specified in order by part number.' + + +class InvalidPayer(ErrorResponse): + _status = '403 Forbidden' + _msg = 'All access to this object has been disabled.' + + +class InvalidPolicyDocument(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The content of the form does not meet the conditions specified ' \ + 'in the policy document.' + + +class InvalidRange(ErrorResponse): + _status = '416 Requested Range Not Satisfiable' + _msg = 'The requested range cannot be satisfied.' + + +class InvalidPartNumber(ErrorResponse): + _status = '416 Requested Range Not Satisfiable' + _msg = 'The requested partnumber is not satisfiable' + + +class InvalidRequest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Invalid Request.' + + +class InvalidSecurity(ErrorResponse): + _status = '403 Forbidden' + _msg = 'The provided security credentials are not valid.' + + +class InvalidSOAPRequest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The SOAP request body is invalid.' + + +class InvalidStorageClass(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The storage class you specified is not valid.' + + +class InvalidTargetBucketForLogging(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The target bucket for logging does not exist, is not owned by ' \ + 'you, or does not have the appropriate grants for the ' \ + 'log-delivery group.' + + def __init__(self, bucket, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, target_bucket=bucket, *args, + **kwargs) + + +class InvalidToken(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The provided token is malformed or otherwise invalid.' 
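# A hand-written sketch of how the constructor arguments on these error
# classes surface in the response body: extra keyword arguments are kept in
# self.info and rendered by _dict_to_etree(), which camel-cases snake_case
# keys into element tags. Element order and whitespace below are
# illustrative only:
#
#   InvalidArgument('partNumber', 'abc') is expected to serialize roughly as
#
#   <Error>
#     <Code>InvalidArgument</Code>
#     <Message>Invalid Argument.</Message>
#     <ArgumentName>partNumber</ArgumentName>
#     <ArgumentValue>abc</ArgumentValue>
#   </Error>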
+ + +class InvalidURI(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Couldn\'t parse the specified URI.' + + def __init__(self, uri, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, uri=uri, *args, **kwargs) + + +class KeyTooLongError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your key is too long.' + + +class MalformedACLError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The XML you provided was not well-formed or did not validate ' \ + 'against our published schema.' + + +class MalformedPOSTRequest(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The body of your POST request is not well-formed ' \ + 'multipart/form-data.' + + +class MalformedTrailerError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The request contained trailing data that was not well-formed ' \ + 'or did not conform to our published schema.' + + +class MalformedXML(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The XML you provided was not well-formed or did not validate ' \ + 'against our published schema' + + +class MaxMessageLengthExceeded(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your request was too big.' + + +class MaxPostPreDataLengthExceededError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your POST request fields preceding the upload file were too large.' + + +class MetadataTooLarge(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your metadata headers exceed the maximum allowed metadata size.' + + +class MethodNotAllowed(ErrorResponse): + _status = '405 Method Not Allowed' + _msg = 'The specified method is not allowed against this resource.' + + def __init__(self, method, resource_type, msg=None, *args, **kwargs): + ErrorResponse.__init__(self, msg, method=method, + resource_type=resource_type, *args, **kwargs) + + +class MissingContentLength(ErrorResponse): + _status = '411 Length Required' + _msg = 'You must provide the Content-Length HTTP header.' + + +class MissingRequestBodyError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Request body is empty.' + + +class MissingSecurityElement(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The SOAP 1.1 request is missing a security element.' + + +class MissingSecurityHeader(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your request was missing a required header.' + + +class NoLoggingStatusForKey(ErrorResponse): + _status = '400 Bad Request' + _msg = 'There is no such thing as a logging status sub-resource for a key.' + + +class NoSuchBucket(ErrorResponse): + _status = '404 Not Found' + _msg = 'The specified bucket does not exist.' + + def __init__(self, bucket, msg=None, *args, **kwargs): + if not bucket: + raise InternalError() + ErrorResponse.__init__(self, msg, bucket_name=bucket, *args, **kwargs) + + +class NoSuchKey(ErrorResponse): + _status = '404 Not Found' + _msg = 'The specified key does not exist.' + + def __init__(self, key, msg=None, *args, **kwargs): + if not key: + raise InternalError() + ErrorResponse.__init__(self, msg, key=key, *args, **kwargs) + + +class ObjectLockConfigurationNotFoundError(ErrorResponse): + _status = '404 Not found' + _msg = 'Object Lock configuration does not exist for this bucket' + + def __init__(self, bucket, msg=None, *args, **kwargs): + if not bucket: + raise InternalError() + ErrorResponse.__init__(self, msg, bucket_name=bucket, *args, **kwargs) + + +class NoSuchLifecycleConfiguration(ErrorResponse): + _status = '404 Not Found' + _msg = 'The lifecycle configuration does not exist. .' 
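# Note on the 404 classes above: NoSuchBucket, NoSuchKey and
# ObjectLockConfigurationNotFoundError all require a non-empty resource
# name; a falsy value is treated as a programming error and raises
# InternalError rather than building a client-facing 404. For example:
#
#   >>> NoSuchKey('missing-object').status
#   '404 Not Found'
#   >>> NoSuchKey(None)   # no key name: InternalError is raised instead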
+ + +class NoSuchUpload(ErrorResponse): + _status = '404 Not Found' + _msg = 'The specified multipart upload does not exist. The upload ID ' \ + 'might be invalid, or the multipart upload might have been ' \ + 'aborted or completed.' + + +class NoSuchVersion(ErrorResponse): + _status = '404 Not Found' + _msg = 'The specified version does not exist.' + + def __init__(self, key, version_id, msg=None, *args, **kwargs): + if not key: + raise InternalError() + ErrorResponse.__init__(self, msg, key=key, version_id=version_id, + *args, **kwargs) + + +# NotImplemented is a python built-in constant. Use S3NotImplemented instead. +class S3NotImplemented(ErrorResponse): + _status = '501 Not Implemented' + _msg = 'Not implemented.' + _code = 'NotImplemented' + + +class NotSignedUp(ErrorResponse): + _status = '403 Forbidden' + _msg = 'Your account is not signed up for the Amazon S3 service.' + + +class NotSuchBucketPolicy(ErrorResponse): + _status = '404 Not Found' + _msg = 'The specified bucket does not have a bucket policy.' + + +class OperationAborted(ErrorResponse): + _status = '409 Conflict' + _msg = 'A conflicting conditional operation is currently in progress ' \ + 'against this resource. Please try again.' + + +class PermanentRedirect(ErrorResponse): + _status = '301 Moved Permanently' + _msg = 'The bucket you are attempting to access must be addressed using ' \ + 'the specified endpoint. Please send all future requests to this ' \ + 'endpoint.' + + +class PreconditionFailed(ErrorResponse): + _status = '412 Precondition Failed' + _msg = 'At least one of the preconditions you specified did not hold.' + + +class Redirect(ErrorResponse): + _status = '307 Moved Temporarily' + _msg = 'Temporary redirect.' + + +class RestoreAlreadyInProgress(ErrorResponse): + _status = '409 Conflict' + _msg = 'Object restore is already in progress.' + + +class RequestIsNotMultiPartContent(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Bucket POST must be of the enclosure-type multipart/form-data.' + + +class RequestTimeout(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Your socket connection to the server was not read from or ' \ + 'written to within the timeout period.' + + +class RequestTimeTooSkewed(ErrorResponse): + _status = '403 Forbidden' + _msg = 'The difference between the request time and the current time ' \ + 'is too large.' + + +class RequestTorrentOfBucketError(ErrorResponse): + _status = '400 Bad Request' + _msg = 'Requesting the torrent file of a bucket is not permitted.' + + +class SignatureDoesNotMatch(ErrorResponse): + _status = '403 Forbidden' + _msg = 'The request signature we calculated does not match the ' \ + 'signature you provided. Check your key and signing method.' + + +class ServiceUnavailable(ErrorResponse): + _status = '503 Service Unavailable' + _msg = 'Please reduce your request rate.' + + +class SlowDown(ErrorResponse): + _status = '503 Slow Down' + _msg = 'Please reduce your request rate.' + + +class TemporaryRedirect(ErrorResponse): + _status = '307 Moved Temporarily' + _msg = 'You are being redirected to the bucket while DNS updates.' + + +class TokenRefreshRequired(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The provided token must be refreshed.' + + +class TooManyBuckets(ErrorResponse): + _status = '400 Bad Request' + _msg = 'You have attempted to create more buckets than allowed.' + + +class UnexpectedContent(ErrorResponse): + _status = '400 Bad Request' + _msg = 'This request does not support content.' 
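# The wire-level <Code> defaults to the Python class name (see
# ErrorResponse.__init__), so classes whose name cannot match the S3 error
# code override _code explicitly. A small sanity check, derived by hand:
#
#   >>> S3NotImplemented()._code
#   'NotImplemented'
#   >>> VersionedBucketNotEmpty()._code
#   'BucketNotEmpty'
#   >>> TooManyBuckets()._code
#   'TooManyBuckets'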
+ + +class UnresolvableGrantByEmailAddress(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The e-mail address you provided does not match any account on ' \ + 'record.' + + +class UserKeyMustBeSpecified(ErrorResponse): + _status = '400 Bad Request' + _msg = 'The bucket POST must contain the specified field name. If it is ' \ + 'specified, please check the order of the fields.' + + +class BrokenMPU(ErrorResponse): + # This is very much a Swift-ism, and we wish we didn't need it + _status = '409 Conflict' + _msg = 'Multipart upload has broken segment data.' diff --git a/swift/common/middleware/s3api/s3token.py b/swift/common/middleware/s3api/s3token.py new file mode 100644 index 0000000000..41f87cb375 --- /dev/null +++ b/swift/common/middleware/s3api/s3token.py @@ -0,0 +1,438 @@ +# Copyright 2012 OpenStack Foundation +# Copyright 2010 United States Government as represented by the +# Administrator of the National Aeronautics and Space Administration. +# Copyright 2011,2012 Akira YOSHIYAMA +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# This source code is based ./auth_token.py and ./ec2_token.py. +# See them for their copyright. + +""" +------------------- +S3 Token Middleware +------------------- +s3token middleware is for authentication with s3api + keystone. +This middleware: + +* Gets a request from the s3api middleware with an S3 Authorization + access key. +* Validates s3 token with Keystone. +* Transforms the account name to AUTH_%(tenant_name). +* Optionally can retrieve and cache secret from keystone + to validate signature locally + +.. note:: + If upgrading from swift3, the ``auth_version`` config option has been + removed, and the ``auth_uri`` option now includes the Keystone API + version. If you previously had a configuration like + + .. code-block:: ini + + [filter:s3token] + use = egg:swift3#s3token + auth_uri = https://keystonehost:35357 + auth_version = 3 + + you should now use + + .. 
code-block:: ini + + [filter:s3token] + use = egg:swift#s3token + auth_uri = https://keystonehost:35357/v3 +""" + +import base64 +import json + +from keystoneclient.v3 import client as keystone_client +from keystoneauth1 import session as keystone_session +from keystoneauth1 import loading as keystone_loading +import requests +import urllib + +from swift.common.swob import Request, HTTPBadRequest, HTTPUnauthorized, \ + HTTPException, str_to_wsgi +from swift.common.utils import config_true_value, split_path, get_logger, \ + cache_from_env, append_underscore +from swift.common.wsgi import ConfigFileError + + +PROTOCOL_NAME = 'S3 Token Authentication' + +# Headers to purge if they came from (or may have come from) the client +KEYSTONE_AUTH_HEADERS = ( + 'X-Identity-Status', 'X-Service-Identity-Status', + 'X-Domain-Id', 'X-Service-Domain-Id', + 'X-Domain-Name', 'X-Service-Domain-Name', + 'X-Project-Id', 'X-Service-Project-Id', + 'X-Project-Name', 'X-Service-Project-Name', + 'X-Project-Domain-Id', 'X-Service-Project-Domain-Id', + 'X-Project-Domain-Name', 'X-Service-Project-Domain-Name', + 'X-User-Id', 'X-Service-User-Id', + 'X-User-Name', 'X-Service-User-Name', + 'X-User-Domain-Id', 'X-Service-User-Domain-Id', + 'X-User-Domain-Name', 'X-Service-User-Domain-Name', + 'X-Roles', 'X-Service-Roles', + 'X-Is-Admin-Project', + 'X-Service-Catalog', + # Deprecated headers, too... + 'X-Tenant-Id', + 'X-Tenant-Name', + 'X-Tenant', + 'X-User', + 'X-Role', +) + + +def parse_v2_response(token): + access_info = token['access'] + headers = { + 'X-Identity-Status': 'Confirmed', + 'X-Roles': ','.join(r['name'] + for r in access_info['user']['roles']), + 'X-User-Id': access_info['user']['id'], + 'X-User-Name': access_info['user']['name'], + 'X-Tenant-Id': access_info['token']['tenant']['id'], + 'X-Tenant-Name': access_info['token']['tenant']['name'], + 'X-Project-Id': access_info['token']['tenant']['id'], + 'X-Project-Name': access_info['token']['tenant']['name'], + } + return headers, access_info['token']['tenant'] + + +def parse_v3_response(token): + token = token['token'] + headers = { + 'X-Identity-Status': 'Confirmed', + 'X-Roles': ','.join(r['name'] + for r in token['roles']), + 'X-User-Id': token['user']['id'], + 'X-User-Name': token['user']['name'], + 'X-User-Domain-Id': token['user']['domain']['id'], + 'X-User-Domain-Name': token['user']['domain']['name'], + 'X-Tenant-Id': token['project']['id'], + 'X-Tenant-Name': token['project']['name'], + 'X-Project-Id': token['project']['id'], + 'X-Project-Name': token['project']['name'], + 'X-Project-Domain-Id': token['project']['domain']['id'], + 'X-Project-Domain-Name': token['project']['domain']['name'], + } + return headers, token['project'] + + +class S3Token(object): + """Middleware that handles S3 authentication.""" + + def __init__(self, app, conf): + """Common initialization code.""" + self._app = app + self._logger = get_logger( + conf, log_route=conf.get('log_name', 's3token')) + self._logger.debug('Starting the %s component', PROTOCOL_NAME) + self._timeout = float(conf.get('http_timeout', '10.0')) + if not (0 < self._timeout <= 60): + raise ValueError('http_timeout must be between 0 and 60 seconds') + self._reseller_prefix = append_underscore( + conf.get('reseller_prefix', 'AUTH')) + self._delay_auth_decision = config_true_value( + conf.get('delay_auth_decision')) + + # where to find the auth service (we use this to validate tokens) + self._request_uri = conf.get('auth_uri', '').rstrip('/') + '/s3tokens' + parsed = 
urllib.parse.urlsplit(self._request_uri) + if not parsed.scheme or not parsed.hostname: + raise ConfigFileError( + 'Invalid auth_uri; must include scheme and host') + if parsed.scheme not in ('http', 'https'): + raise ConfigFileError( + 'Invalid auth_uri; scheme must be http or https') + if parsed.query or parsed.fragment or '@' in parsed.netloc: + raise ConfigFileError('Invalid auth_uri; must not include ' + 'username, query, or fragment') + + # SSL + insecure = config_true_value(conf.get('insecure')) + cert_file = conf.get('certfile') + key_file = conf.get('keyfile') + + if insecure: + self._verify = False + elif cert_file and key_file: + self._verify = (cert_file, key_file) + elif cert_file: + self._verify = cert_file + else: + self._verify = None + + self._secret_cache_duration = int( + conf.get('secret_cache_duration', 60)) + if self._secret_cache_duration < 0: + raise ValueError('secret_cache_duration must be non-negative') + + # Service authentication for s3tokens API calls + self.keystoneclient = None + try: + auth_plugin = keystone_loading.get_plugin_loader( + conf.get('auth_type', 'password')) + available_auth_options = auth_plugin.get_options() + auth_options = {} + for option in available_auth_options: + name = option.name.replace('-', '_') + value = conf.get(name) + if value: + auth_options[name] = value + + if not auth_options: + self._logger.warning( + "No service auth configuration. " + "s3tokens API calls will be unauthenticated. " + "New versions of keystone require service auth.") + else: + auth = auth_plugin.load_from_options(**auth_options) + session = keystone_session.Session(auth=auth) + self.keystoneclient = keystone_client.Client( + session=session, + region_name=conf.get('region_name')) + self._logger.info( + "Service authentication configured for s3tokens API") + except Exception: + self._logger.warning( + "Unable to load service auth configuration. " + "s3tokens API calls will be unauthenticated " + "and secret caching will be unavailable.", + exc_info=True) + + if self._secret_cache_duration and self.keystoneclient: + self._logger.info("Caching s3tokens for %s seconds", + self._secret_cache_duration) + else: + self._secret_cache_duration = 0 + + def _deny_request(self, code): + error_cls, message = { + 'AccessDenied': (HTTPUnauthorized, 'Access denied'), + 'InvalidURI': (HTTPBadRequest, + 'Could not parse the specified URI'), + }[code] + resp = error_cls(content_type='text/xml') + error_msg = ('\r\n' + '\r\n %s\r\n ' + '%s\r\n\r\n' % + (code, message)).encode() + resp.body = error_msg + return resp + + def _json_request(self, creds_json): + headers = {'Content-Type': 'application/json'} + + # Add service authentication headers if configured + if self.keystoneclient: + try: + headers.update( + self.keystoneclient.session.get_auth_headers()) + except Exception: + self._logger.warning("Failed to get service token", + exc_info=True) + + try: + response = requests.post(self._request_uri, + headers=headers, data=creds_json, + verify=self._verify, + timeout=self._timeout) + except requests.exceptions.RequestException as e: + self._logger.info('HTTP connection exception: %s', e) + raise self._deny_request('InvalidURI') + + if response.status_code < 200 or response.status_code >= 300: + self._logger.debug('Keystone reply error: status=%s reason=%s', + response.status_code, response.reason) + raise self._deny_request('AccessDenied') + + return response + + def __call__(self, environ, start_response): + """Handle incoming request. 
authenticate and send downstream.""" + req = Request(environ) + self._logger.debug('Calling S3Token middleware.') + + # Always drop auth headers if we're first in the pipeline + if 'keystone.token_info' not in req.environ: + req.headers.update({h: None for h in KEYSTONE_AUTH_HEADERS}) + + try: + parts = split_path(urllib.parse.unquote(req.path), 1, 4, True) + version, account, container, obj = parts + except ValueError: + msg = 'Not a path query: %s, skipping.' % req.path + self._logger.debug(msg) + return self._app(environ, start_response) + + # Read request signature and access id. + s3_auth_details = req.environ.get('s3api.auth_details') + if not s3_auth_details: + msg = 'No authorization details from s3api. skipping.' + self._logger.debug(msg) + return self._app(environ, start_response) + + access = s3_auth_details['access_key'] + if isinstance(access, bytes): + access = access.decode('utf-8') + + signature = s3_auth_details['signature'] + if isinstance(signature, bytes): + signature = signature.decode('utf-8') + + string_to_sign = s3_auth_details['string_to_sign'] + if isinstance(string_to_sign, str): + string_to_sign = string_to_sign.encode('utf-8') + token = base64.urlsafe_b64encode(string_to_sign) + if isinstance(token, bytes): + token = token.decode('ascii') + + # NOTE(chmou): This is to handle the special case with nova + # when we have the option s3_affix_tenant. We will force it to + # connect to another account than the one + # authenticated. Before people start getting worried about + # security, I should point that we are connecting with + # username/token specified by the user but instead of + # connecting to its own account we will force it to go to an + # another account. In a normal scenario if that user don't + # have the reseller right it will just fail but since the + # reseller account can connect to every account it is allowed + # by the swift_auth middleware. + force_tenant = None + if ':' in access: + access, force_tenant = access.split(':') + + # Authenticate request. + creds = {'credentials': {'access': access, + 'token': token, + 'signature': signature}} + + memcache_client = None + memcache_token_key = 's3secret/%s' % access + if self._secret_cache_duration > 0: + memcache_client = cache_from_env(environ) + cached_auth_data = None + + if memcache_client: + cached_auth_data = memcache_client.get(memcache_token_key) + if cached_auth_data: + if len(cached_auth_data) == 4: + # Old versions of swift may have cached token, too, + # but we don't need it + headers, _token, tenant, secret = cached_auth_data + else: + headers, tenant, secret = cached_auth_data + + if s3_auth_details['check_signature'](secret): + self._logger.debug("Cached creds valid") + else: + self._logger.debug("Cached creds invalid") + cached_auth_data = None + + if not cached_auth_data: + creds_json = json.dumps(creds) + self._logger.debug('Connecting to Keystone sending this JSON: %s', + creds_json) + # NOTE(vish): We could save a call to keystone by having + # keystone return token, tenant, user, and roles + # from this call. + # + # NOTE(chmou): We still have the same problem we would need to + # change token_auth to detect if we already + # identified and not doing a second query and just + # pass it through to swiftauth in this case. 
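        # For illustration only, the credentials document POSTed to
        # Keystone's /s3tokens endpoint below has roughly this shape
        # (all values invented):
        #
        #   {"credentials": {"access": "052d5160a5...",
        #                    "token": "<base64 of the string to sign>",
        #                    "signature": "eEm9cuFG2..."}}
        #
        # Keystone is expected to recompute the signature from the stored
        # EC2 secret and, on a match, return the token and project data
        # that parse_v2_response()/parse_v3_response() handle.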
+ try: + # NB: requests.Response, not swob.Response + resp = self._json_request(creds_json) + except HTTPException as e_resp: + if self._delay_auth_decision: + msg = ('Received error, deferring rejection based on ' + 'error: %s') + self._logger.debug(msg, e_resp.status) + return self._app(environ, start_response) + else: + msg = 'Received error, rejecting request with error: %s' + self._logger.debug(msg, e_resp.status) + # NB: swob.Response, not requests.Response + return e_resp(environ, start_response) + + self._logger.debug('Keystone Reply: Status: %d, Output: %s', + resp.status_code, resp.content) + + try: + token = resp.json() + if 'access' in token: + headers, tenant = parse_v2_response(token) + elif 'token' in token: + headers, tenant = parse_v3_response(token) + else: + raise ValueError + if memcache_client: + user_id = headers.get('X-User-Id') + if not user_id: + raise ValueError + try: + cred_ref = self.keystoneclient.ec2.get( + user_id=user_id, + access=access) + memcache_client.set( + memcache_token_key, + (headers, tenant, cred_ref.secret), + time=self._secret_cache_duration) + self._logger.debug("Cached keystone credentials") + except Exception: + self._logger.warning("Unable to cache secret", + exc_info=True) + + # Populate the environment similar to auth_token, + # so we don't have to contact Keystone again. + # + # Note that although the strings are unicode following json + # deserialization, Swift's HeaderEnvironProxy handles ensuring + # they're stored as native strings + req.environ['keystone.token_info'] = token + except (ValueError, KeyError, TypeError): + if self._delay_auth_decision: + error = ('Error on keystone reply: %d %s - ' + 'deferring rejection downstream') + self._logger.debug(error, resp.status_code, resp.content) + return self._app(environ, start_response) + else: + error = ('Error on keystone reply: %d %s - ' + 'rejecting request') + self._logger.debug(error, resp.status_code, resp.content) + return self._deny_request('InvalidURI')( + environ, start_response) + + req.headers.update(headers) + tenant_to_connect = force_tenant or tenant['id'] + self._logger.debug('Connecting with tenant: %s', tenant_to_connect) + new_tenant_name = '%s%s' % (self._reseller_prefix, tenant_to_connect) + environ['PATH_INFO'] = environ['PATH_INFO'].replace( + str_to_wsgi(account), str_to_wsgi(new_tenant_name), 1) + return self._app(environ, start_response) + + +def filter_factory(global_conf, **local_conf): + """Returns a WSGI filter app for use with paste.deploy.""" + conf = global_conf.copy() + conf.update(local_conf) + + def auth_filter(app): + return S3Token(app, conf) + return auth_filter diff --git a/swift/common/middleware/s3api/schema/access_control_policy.rng b/swift/common/middleware/s3api/schema/access_control_policy.rng new file mode 100644 index 0000000000..5308a12f32 --- /dev/null +++ b/swift/common/middleware/s3api/schema/access_control_policy.rng @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/bucket_logging_status.rng b/swift/common/middleware/s3api/schema/bucket_logging_status.rng new file mode 100644 index 0000000000..27ea1e1dd0 --- /dev/null +++ b/swift/common/middleware/s3api/schema/bucket_logging_status.rng @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/common.rng b/swift/common/middleware/s3api/schema/common.rng new file mode 100644 index 0000000000..22319c0ebd --- /dev/null +++ b/swift/common/middleware/s3api/schema/common.rng @@ -0,0 
+1,66 @@ + + + + + + + + + + + + + + + + + STANDARD + REDUCED_REDUNDANCY + GLACIER + UNKNOWN + + + + + + + + + + + AmazonCustomerByEmail + + + + + + + + CanonicalUser + + + + + + Group + + + + + + + + + + READ + WRITE + READ_ACP + WRITE_ACP + FULL_CONTROL + + + + + + + diff --git a/swift/common/middleware/s3api/schema/complete_multipart_upload.rng b/swift/common/middleware/s3api/schema/complete_multipart_upload.rng new file mode 100644 index 0000000000..55aefa464e --- /dev/null +++ b/swift/common/middleware/s3api/schema/complete_multipart_upload.rng @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/complete_multipart_upload_result.rng b/swift/common/middleware/s3api/schema/complete_multipart_upload_result.rng new file mode 100644 index 0000000000..47406e1c5e --- /dev/null +++ b/swift/common/middleware/s3api/schema/complete_multipart_upload_result.rng @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/copy_object_result.rng b/swift/common/middleware/s3api/schema/copy_object_result.rng new file mode 100644 index 0000000000..ec0ac95f2c --- /dev/null +++ b/swift/common/middleware/s3api/schema/copy_object_result.rng @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/copy_part_result.rng b/swift/common/middleware/s3api/schema/copy_part_result.rng new file mode 100644 index 0000000000..0370daad6b --- /dev/null +++ b/swift/common/middleware/s3api/schema/copy_part_result.rng @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/create_bucket_configuration.rng b/swift/common/middleware/s3api/schema/create_bucket_configuration.rng new file mode 100644 index 0000000000..882edc465f --- /dev/null +++ b/swift/common/middleware/s3api/schema/create_bucket_configuration.rng @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/delete.rng b/swift/common/middleware/s3api/schema/delete.rng new file mode 100644 index 0000000000..f0659fc1c0 --- /dev/null +++ b/swift/common/middleware/s3api/schema/delete.rng @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/delete_result.rng b/swift/common/middleware/s3api/schema/delete_result.rng new file mode 100644 index 0000000000..1e28b3ceb8 --- /dev/null +++ b/swift/common/middleware/s3api/schema/delete_result.rng @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/error.rng b/swift/common/middleware/s3api/schema/error.rng new file mode 100644 index 0000000000..a0d61d4853 --- /dev/null +++ b/swift/common/middleware/s3api/schema/error.rng @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/initiate_multipart_upload_result.rng b/swift/common/middleware/s3api/schema/initiate_multipart_upload_result.rng new file mode 100644 index 0000000000..67d03016da --- /dev/null +++ b/swift/common/middleware/s3api/schema/initiate_multipart_upload_result.rng @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/lifecycle_configuration.rng b/swift/common/middleware/s3api/schema/lifecycle_configuration.rng new file mode 100644 index 0000000000..dd0816e2f5 --- /dev/null +++ 
b/swift/common/middleware/s3api/schema/lifecycle_configuration.rng @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + Enabled + Disabled + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/list_all_my_buckets_result.rng b/swift/common/middleware/s3api/schema/list_all_my_buckets_result.rng new file mode 100644 index 0000000000..76959d7b81 --- /dev/null +++ b/swift/common/middleware/s3api/schema/list_all_my_buckets_result.rng @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/list_bucket_result.rng b/swift/common/middleware/s3api/schema/list_bucket_result.rng new file mode 100644 index 0000000000..b3181238e6 --- /dev/null +++ b/swift/common/middleware/s3api/schema/list_bucket_result.rng @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/list_multipart_uploads_result.rng b/swift/common/middleware/s3api/schema/list_multipart_uploads_result.rng new file mode 100644 index 0000000000..2e20c840ec --- /dev/null +++ b/swift/common/middleware/s3api/schema/list_multipart_uploads_result.rng @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/list_parts_result.rng b/swift/common/middleware/s3api/schema/list_parts_result.rng new file mode 100644 index 0000000000..4cf5a0ce7d --- /dev/null +++ b/swift/common/middleware/s3api/schema/list_parts_result.rng @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/list_versions_result.rng b/swift/common/middleware/s3api/schema/list_versions_result.rng new file mode 100644 index 0000000000..464cfbcc48 --- /dev/null +++ b/swift/common/middleware/s3api/schema/list_versions_result.rng @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/location_constraint.rng b/swift/common/middleware/s3api/schema/location_constraint.rng new file mode 100644 index 0000000000..2f3a143b27 --- /dev/null +++ b/swift/common/middleware/s3api/schema/location_constraint.rng @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/swift/common/middleware/s3api/schema/versioning_configuration.rng b/swift/common/middleware/s3api/schema/versioning_configuration.rng new file mode 100644 index 0000000000..3d6d3d1238 --- /dev/null +++ b/swift/common/middleware/s3api/schema/versioning_configuration.rng @@ -0,0 +1,25 @@ + + + + + + + + + Enabled + Suspended + + + + + + + Enabled + Disabled + + + + + + + diff --git a/swift/common/middleware/s3api/subresource.py b/swift/common/middleware/s3api/subresource.py new file mode 100644 index 0000000000..42af24bd05 --- /dev/null +++ b/swift/common/middleware/s3api/subresource.py @@ -0,0 +1,574 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +--------------------------- +s3api's ACLs implementation +--------------------------- +s3api uses a different implementation approach to achieve S3 ACLs. + +First, we should understand what we have to design to achieve real S3 ACLs. +Current s3api(real S3)'s ACLs Model is as follows:: + + AccessControlPolicy: + Owner: + AccessControlList: + Grant[n]: + (Grantee, Permission) + +Each bucket or object has its own acl consisting of Owner and +AcessControlList. AccessControlList can contain some Grants. +By default, AccessControlList has only one Grant to allow FULL +CONTROLL to owner. Each Grant includes single pair with Grantee, +Permission. Grantee is the user (or user group) allowed the given permission. + +This module defines the groups and the relation tree. + +If you wanna get more information about S3's ACLs model in detail, +please see official documentation here, + +http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html + +""" +from functools import partial + +from swift.common.utils import json + +from swift.common.middleware.s3api.s3response import InvalidArgument, \ + MalformedACLError, S3NotImplemented, InvalidRequest, AccessDenied +from swift.common.middleware.s3api.etree import Element, SubElement, tostring +from swift.common.middleware.s3api.utils import sysmeta_header +from swift.common.middleware.s3api.exception import InvalidSubresource + +XMLNS_XSI = 'http://www.w3.org/2001/XMLSchema-instance' +PERMISSIONS = ['FULL_CONTROL', 'READ', 'WRITE', 'READ_ACP', 'WRITE_ACP'] +LOG_DELIVERY_USER = '.log_delivery' + + +def encode_acl(resource, acl): + """ + Encode an ACL instance to Swift metadata. + + Given a resource type and an ACL instance, this method returns HTTP + headers, which can be used for Swift metadata. + """ + header_value = {"Owner": acl.owner.id} + grants = [] + for grant in acl.grants: + grant = {"Permission": grant.permission, + "Grantee": str(grant.grantee)} + grants.append(grant) + header_value.update({"Grant": grants}) + headers = {} + key = sysmeta_header(resource, 'acl') + headers[key] = json.dumps(header_value, separators=(',', ':')) + + return headers + + +def decode_acl(resource, headers, allow_no_owner): + """ + Decode Swift metadata to an ACL instance. + + Given a resource type and HTTP headers, this method returns an ACL + instance. + """ + value = '' + + key = sysmeta_header(resource, 'acl') + if key in headers: + value = headers[key] + + if value == '': + # Fix me: In the case of value is empty or not dict instance, + # I want an instance of Owner as None. + # However, in the above process would occur error in reference + # to an instance variable of Owner. 
+ return ACL(Owner(None, None), [], True, allow_no_owner) + + try: + encode_value = json.loads(value) + if not isinstance(encode_value, dict): + return ACL(Owner(None, None), [], True, allow_no_owner) + + id = None + name = None + grants = [] + if 'Owner' in encode_value: + id = encode_value['Owner'] + name = encode_value['Owner'] + if 'Grant' in encode_value: + for grant in encode_value['Grant']: + grantee = None + # pylint: disable-msg=E1101 + for group in Group.__subclasses__(): + if group.__name__ == grant['Grantee']: + grantee = group() + if not grantee: + grantee = User(grant['Grantee']) + permission = grant['Permission'] + grants.append(Grant(grantee, permission)) + return ACL(Owner(id, name), grants, True, allow_no_owner) + except Exception as e: + raise InvalidSubresource((resource, 'acl', value), e) + + +class Grantee(object): + """ + Base class for grantee. + + Methods: + + * init: create a Grantee instance + * elem: create an ElementTree from itself + + Static Methods: + + * from_header: convert a grantee string in the HTTP header + to an Grantee instance. + * from_elem: convert a ElementTree to an Grantee instance. + + """ + # Needs confirmation whether we really need these methods or not. + # * encode (method): create a JSON which includes whole own elements + # * encode_from_elem (static method): convert from an ElementTree to a JSON + # * elem_from_json (static method): convert from a JSON to an ElementTree + # * from_json (static method): convert a Json string to an Grantee + # instance. + + def __contains__(self, key): + """ + The key argument is a S3 user id. This method checks that the user id + belongs to this class. + """ + raise S3NotImplemented() + + def elem(self): + """ + Get an etree element of this instance. + """ + raise S3NotImplemented() + + @staticmethod + def from_elem(elem): + type = elem.get('{%s}type' % XMLNS_XSI) + if type == 'CanonicalUser': + value = elem.find('./ID').text + return User(value) + elif type == 'Group': + value = elem.find('./URI').text + subclass = get_group_subclass_from_uri(value) + return subclass() + elif type == 'AmazonCustomerByEmail': + raise S3NotImplemented() + else: + raise MalformedACLError() + + @staticmethod + def from_header(grantee): + """ + Convert a grantee string in the HTTP header to an Grantee instance. + """ + grantee_type, value = grantee.split('=', 1) + grantee_type = grantee_type.lower() + value = value.strip('"\'') + if grantee_type == 'id': + return User(value) + elif grantee_type == 'emailaddress': + raise S3NotImplemented() + elif grantee_type == 'uri': + # return a subclass instance of Group class + subclass = get_group_subclass_from_uri(value) + return subclass() + else: + raise InvalidArgument(grantee_type, value, + 'Argument format not recognized') + + +class User(Grantee): + """ + Canonical user class for S3 accounts. 
+ """ + type = 'CanonicalUser' + + def __init__(self, name): + self.id = name + self.display_name = name + + def __contains__(self, key): + return key == self.id + + def elem(self): + elem = Element('Grantee', nsmap={'xsi': XMLNS_XSI}) + elem.set('{%s}type' % XMLNS_XSI, self.type) + SubElement(elem, 'ID').text = self.id + SubElement(elem, 'DisplayName').text = self.display_name + return elem + + def __str__(self): + return self.display_name + + def __lt__(self, other): + if not isinstance(other, User): + return NotImplemented + return self.id < other.id + + +class Owner(object): + """ + Owner class for S3 accounts + """ + def __init__(self, id, name): + self.id = id + if not (name is None or isinstance(name, str)): + raise TypeError('name must be a string or None') + self.name = name + + +def get_group_subclass_from_uri(uri): + """ + Convert a URI to one of the predefined groups. + """ + for group in Group.__subclasses__(): # pylint: disable-msg=E1101 + if group.uri == uri: + return group + raise InvalidArgument('uri', uri, 'Invalid group uri') + + +class Group(Grantee): + """ + Base class for Amazon S3 Predefined Groups + """ + type = 'Group' + uri = '' + + def __init__(self): + # Initialize method to clarify this has nothing to do + pass + + def elem(self): + elem = Element('Grantee', nsmap={'xsi': XMLNS_XSI}) + elem.set('{%s}type' % XMLNS_XSI, self.type) + SubElement(elem, 'URI').text = self.uri + + return elem + + def __str__(self): + return self.__class__.__name__ + + +def canned_acl_grantees(bucket_owner, object_owner=None): + """ + A set of predefined grants supported by AWS S3. + """ + owner = object_owner or bucket_owner + + return { + 'private': [ + ('FULL_CONTROL', User(owner.name)), + ], + 'public-read': [ + ('READ', AllUsers()), + ('FULL_CONTROL', User(owner.name)), + ], + 'public-read-write': [ + ('READ', AllUsers()), + ('WRITE', AllUsers()), + ('FULL_CONTROL', User(owner.name)), + ], + 'authenticated-read': [ + ('READ', AuthenticatedUsers()), + ('FULL_CONTROL', User(owner.name)), + ], + 'bucket-owner-read': [ + ('READ', User(bucket_owner.name)), + ('FULL_CONTROL', User(owner.name)), + ], + 'bucket-owner-full-control': [ + ('FULL_CONTROL', User(owner.name)), + ('FULL_CONTROL', User(bucket_owner.name)), + ], + 'log-delivery-write': [ + ('WRITE', LogDelivery()), + ('READ_ACP', LogDelivery()), + ('FULL_CONTROL', User(owner.name)), + ], + } + + +class AuthenticatedUsers(Group): + """ + This group represents all AWS accounts. Access permission to this group + allows any AWS account to access the resource. However, all requests must + be signed (authenticated). + """ + uri = 'http://acs.amazonaws.com/groups/global/AuthenticatedUsers' + + def __contains__(self, key): + # s3api handles only signed requests. + return True + + +class AllUsers(Group): + """ + Access permission to this group allows anyone to access the resource. The + requests can be signed (authenticated) or unsigned (anonymous). Unsigned + requests omit the Authentication header in the request. + + Note: s3api regards unsigned requests as Swift API accesses, and bypasses + them to Swift. As a result, AllUsers behaves completely same as + AuthenticatedUsers. + """ + uri = 'http://acs.amazonaws.com/groups/global/AllUsers' + + def __contains__(self, key): + return True + + +class LogDelivery(Group): + """ + WRITE and READ_ACP permissions on a bucket enables this group to write + server access logs to the bucket. 
+ """ + uri = 'http://acs.amazonaws.com/groups/s3/LogDelivery' + + def __contains__(self, key): + if ':' in key: + tenant, user = key.split(':', 1) + else: + user = key + return user == LOG_DELIVERY_USER + + +class Grant(object): + """ + Grant Class which includes both Grantee and Permission + """ + + def __init__(self, grantee, permission): + """ + :param grantee: a grantee class or its subclass + :param permission: string + """ + if permission.upper() not in PERMISSIONS: + raise S3NotImplemented() + if not isinstance(grantee, Grantee): + raise ValueError() + self.grantee = grantee + self.permission = permission + + @classmethod + def from_elem(cls, elem): + """ + Convert an ElementTree to an ACL instance + """ + grantee = Grantee.from_elem(elem.find('./Grantee')) + permission = elem.find('./Permission').text + return cls(grantee, permission) + + def elem(self): + """ + Create an etree element. + """ + elem = Element('Grant') + elem.append(self.grantee.elem()) + SubElement(elem, 'Permission').text = self.permission + + return elem + + def allow(self, grantee, permission): + return permission == self.permission and grantee in self.grantee + + +class ACL(object): + """ + S3 ACL class. + + Refs (S3 API - acl-overview: + http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html): + + The sample ACL includes an Owner element identifying the owner via the + AWS account's canonical user ID. The Grant element identifies the grantee + (either an AWS account or a predefined group), and the permission granted. + This default ACL has one Grant element for the owner. You grant permissions + by adding Grant elements, each grant identifying the grantee and the + permission. + """ + metadata_name = 'acl' + root_tag = 'AccessControlPolicy' + max_xml_length = 200 * 1024 + + def __init__(self, owner, grants=None, s3_acl=False, allow_no_owner=False): + """ + :param owner: Owner instance for ACL instance + :param grants: a list of Grant instances + :param s3_acl: boolean indicates whether this class is used under + s3_acl is True or False (from s3api middleware configuration) + :param allow_no_owner: boolean indicates this ACL instance can be + handled when no owner information found + """ + self.owner = owner + self.grants = grants or [] + self.s3_acl = s3_acl + self.allow_no_owner = allow_no_owner + + def __bytes__(self): + return tostring(self.elem()) + + def __repr__(self): + return self.__bytes__().decode('utf8') + + @classmethod + def from_elem(cls, elem, s3_acl=False, allow_no_owner=False): + """ + Convert an ElementTree to an ACL instance + """ + id = elem.find('./Owner/ID').text + try: + name = elem.find('./Owner/DisplayName').text + except AttributeError: + name = id + + grants = [Grant.from_elem(e) + for e in elem.findall('./AccessControlList/Grant')] + return cls(Owner(id, name), grants, s3_acl, allow_no_owner) + + def elem(self): + """ + Decode the value to an ACL instance. + """ + elem = Element(self.root_tag) + + owner = SubElement(elem, 'Owner') + SubElement(owner, 'ID').text = self.owner.id + SubElement(owner, 'DisplayName').text = self.owner.name + + SubElement(elem, 'AccessControlList').extend( + g.elem() for g in self.grants + ) + + return elem + + def check_owner(self, user_id): + """ + Check that the user is an owner. + """ + if not self.s3_acl: + # Ignore S3api ACL. + return + + if not self.owner.id: + if self.allow_no_owner: + # No owner means public. 
+ return + raise AccessDenied() + + if user_id != self.owner.id: + raise AccessDenied() + + def check_permission(self, user_id, permission): + """ + Check that the user has a permission. + """ + if not self.s3_acl: + # Ignore S3api ACL. + return + + try: + # owners have full control permission + self.check_owner(user_id) + return + except AccessDenied: + pass + + if permission in PERMISSIONS: + for g in self.grants: + if g.allow(user_id, 'FULL_CONTROL') or \ + g.allow(user_id, permission): + return + + raise AccessDenied() + + @classmethod + def from_headers(cls, headers, bucket_owner, object_owner=None, + as_private=True): + """ + Convert HTTP headers to an ACL instance. + """ + grants = [] + try: + for key, value in headers.items(): + if key.lower().startswith('x-amz-grant-'): + permission = key[len('x-amz-grant-'):] + permission = permission.upper().replace('-', '_') + if permission not in PERMISSIONS: + continue + for grantee in value.split(','): + grants.append( + Grant(Grantee.from_header(grantee), permission)) + + if 'x-amz-acl' in headers: + try: + acl = headers['x-amz-acl'] + if len(grants) > 0: + err_msg = 'Specifying both Canned ACLs and Header ' \ + 'Grants is not allowed' + raise InvalidRequest(err_msg) + grantees = canned_acl_grantees( + bucket_owner, object_owner)[acl] + for permission, grantee in grantees: + grants.append(Grant(grantee, permission)) + except KeyError: + # expects canned_acl_grantees()[] raises KeyError + raise InvalidArgument('x-amz-acl', headers['x-amz-acl']) + except (KeyError, ValueError): + # TODO: think about we really catch this except sequence + raise InvalidRequest() + + if len(grants) == 0: + # No ACL headers + if as_private: + return ACLPrivate(bucket_owner, object_owner) + else: + return None + + return cls(object_owner or bucket_owner, grants) + + +class CannedACL(object): + """ + A dict-like object that returns canned ACL. + """ + def __getitem__(self, key): + def acl(key, bucket_owner, object_owner=None, + s3_acl=False, allow_no_owner=False): + grants = [] + grantees = canned_acl_grantees(bucket_owner, object_owner)[key] + for permission, grantee in grantees: + grants.append(Grant(grantee, permission)) + return ACL(object_owner or bucket_owner, + grants, s3_acl, allow_no_owner) + + return partial(acl, key) + + +canned_acl = CannedACL() + +ACLPrivate = canned_acl['private'] +ACLPublicRead = canned_acl['public-read'] +ACLPublicReadWrite = canned_acl['public-read-write'] +ACLAuthenticatedRead = canned_acl['authenticated-read'] +ACLBucketOwnerRead = canned_acl['bucket-owner-read'] +ACLBucketOwnerFullControl = canned_acl['bucket-owner-full-control'] +ACLLogDeliveryWrite = canned_acl['log-delivery-write'] diff --git a/swift/common/middleware/s3api/utils.py b/swift/common/middleware/s3api/utils.py new file mode 100644 index 0000000000..dc956a189b --- /dev/null +++ b/swift/common/middleware/s3api/utils.py @@ -0,0 +1,372 @@ +# Copyright (c) 2014 OpenStack Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
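# Looking back at subresource.py above, a sketch of how a canned ACL is
# persisted as Swift system metadata via encode_acl(); the account name
# 'test:tester' is invented for illustration and the output is wrapped
# here for readability:
#
#   >>> acl = ACLPrivate(Owner('test:tester', 'test:tester'))
#   >>> encode_acl('container', acl)
#   {'x-container-sysmeta-s3api-acl':
#    '{"Owner":"test:tester","Grant":[{"Permission":"FULL_CONTROL",
#      "Grantee":"test:tester"}]}'}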
+ +import base64 +import calendar +import datetime +import email.utils +import re +import time +import uuid + +from swift.common import utils +from swift.common.constraints import check_utf8 +from swift.common.swob import wsgi_to_str +from swift.common.middleware.s3api.exception import \ + InvalidBucketNameParseError, InvalidURIParseError + +MULTIUPLOAD_SUFFIX = '+segments' + + +def sysmeta_prefix(resource): + """ + Returns the system metadata prefix for given resource type. + """ + if resource.lower() == 'object': + return 'x-object-sysmeta-s3api-' + else: + return 'x-container-sysmeta-s3api-' + + +def sysmeta_header(resource, name): + """ + Returns the system metadata header for given resource type and name. + """ + return sysmeta_prefix(resource) + name + + +def camel_to_snake(camel): + return re.sub('(.)([A-Z])', r'\1_\2', camel).lower() + + +def snake_to_camel(snake): + return snake.title().replace('_', '') + + +def make_header_label(header): + return 'header_' + header.lower().replace('-', '_') + + +def unique_id(): + result = base64.urlsafe_b64encode(str(uuid.uuid4()).encode('ascii')) + return result.decode('ascii') + + +def utf8encode(s): + if s is None or isinstance(s, bytes): + return s + return s.encode('utf8') + + +def utf8decode(s): + if isinstance(s, bytes): + s = s.decode('utf8') + return s + + +def is_valid_base64(s): + try: + base64.b64decode(s) + return True + except Exception: + return False + + +def is_valid_hash(hash_string): + try: + int(hash_string, 16) + except ValueError: + return False + return True + + +def classify_checksum_header_value(value): + if is_valid_hash(value): + if len(value) in (8, 16, 20, 32, 64, 128, 256, 512): + return 'hash_%d' % len(value) + elif is_valid_base64(value): + # crc32 -> b64_8 + # crc64 -> b64_12 + # md5 -> b64_24 + # sha1 -> b64_28 + # sha256 -> b64_44 + if len(value) in (8, 12, 24, 28, 44): + return 'b64_%d' % len(value) + return 'unknown' + + +def validate_bucket_name(name, dns_compliant_bucket_names): + """ + Validates the name of the bucket against S3 criteria, + http://docs.amazonwebservices.com/AmazonS3/latest/BucketRestrictions.html + True is valid, False is invalid. + """ + valid_chars = '-.a-z0-9' + if not dns_compliant_bucket_names: + valid_chars += 'A-Z_' + max_len = 63 if dns_compliant_bucket_names else 255 + + if len(name) < 3 or len(name) > max_len or not name[0].isalnum(): + # Bucket names should be between 3 and 63 (or 255) characters long + # Bucket names must start with a letter or a number + return False + elif dns_compliant_bucket_names and ( + '.-' in name or '-.' in name or '..' in name or + not name[-1].isalnum()): + # Bucket names cannot contain dashes next to periods + # Bucket names cannot contain two adjacent periods + # Bucket names must end with a letter or a number + return False + elif name.endswith('.'): + # Bucket names must not end with dot + return False + elif re.match(r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)" + r"{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$", + name): + # Bucket names cannot be formatted as an IP Address + return False + elif not re.match("^[%s]*$" % valid_chars, name): + # Bucket names can contain lowercase letters, numbers, and hyphens. + return False + else: + return True + + +def get_s3_access_key_id(req): + """ + Return the S3 access_key_id user for the request, + or None if it does not look like an S3 request. 
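+
+    The key is taken from a v2 ``Authorization: AWS <key>:<signature>``
+    header, a v4 ``Authorization: AWS4-HMAC-SHA256 Credential=<key>/...``
+    header, or the equivalent ``AWSAccessKeyId`` / ``X-Amz-Credential``
+    query parameters, in that order.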
+ + :param req: a swob.Request instance + + :returns: access_key_id if available, else None + """ + + authorization = req.headers.get('Authorization', '') + if authorization.startswith('AWS '): + # v2 + return authorization[4:].rsplit(':', 1)[0] + if authorization.startswith('AWS4-HMAC-SHA256 '): + # v4 + return authorization.partition('Credential=')[2].split('/', 1)[0] + params = req.params + if 'AWSAccessKeyId' in params: + # v2 + return params['AWSAccessKeyId'] + if 'X-Amz-Credential' in params: + # v4 + return params['X-Amz-Credential'].split('/', 1)[0] + + return None + + +def is_s3_req(req): + """ + Check whether a request looks like it ought to be an S3 request. + + :param req: a swob.Request instance + + :returns: True if access_key_id is available, False if not + """ + return bool(get_s3_access_key_id(req)) + + +def parse_host(environ, storage_domains): + """ + A bucket-in-host request has the bucket name as the first part of a + ``.``-separated host. If the host ends with any of + the given storage_domains then the bucket name is returned. + Otherwise ``None`` is returned. + + :param environ: an environment dict + :param storage_domains: a list of storage domains for which bucket-in-host + is supported. + :returns: bucket name or None + """ + + if 'HTTP_HOST' in environ: + given_domain = environ['HTTP_HOST'] + elif 'SERVER_NAME' in environ: + given_domain = environ['SERVER_NAME'] + else: + return None + if ':' in given_domain: + given_domain = given_domain.rsplit(':', 1)[0] + + for storage_domain in storage_domains: + if not storage_domain.startswith('.'): + storage_domain = '.' + storage_domain + + if given_domain.endswith(storage_domain): + return given_domain[:-len(storage_domain)] + + return None + + +def parse_path(req, bucket_in_host, dns_compliant_bucket_names): + """ + :params req: a swob.Request instance + :params bucket_in_host: A bucket-in-host request has the bucket name as + the first part of a ``.``-separated host. + :params dns_compliant_bucket_names: whether to validate that the bucket + name must be dns compliant + + :returns: WSGI string + """ + if not check_utf8(wsgi_to_str(req.environ['PATH_INFO'])): + raise InvalidURIParseError(req.path) + + if bucket_in_host: + obj = req.environ['PATH_INFO'][1:] or None + return bucket_in_host, obj + + bucket, obj = req.split_path(0, 2, True) + + if bucket and not validate_bucket_name( + bucket, dns_compliant_bucket_names): + # Ignore GET service case + raise InvalidBucketNameParseError(bucket) + return bucket, obj + + +def extract_bucket_and_key(req, storage_domains, + dns_compliant_bucket_names): + """ + Extract the bucket and object key from the request's PATH_INFO. Support + bucket-in-host if storage_domains and HTTP_HOST or SERVER_NAME are + specified. Otherwise the bucket is parsed from PATH_INFO. + + :param req: a swob.Request instance + :param storage_domains: a list of storage domains for which bucket-in-host + is supported. + :param dns_compliant_bucket_names: whether to validate that the bucket + name must be dns compliant + + :returns: a tuple of (bucket, key). If the request path is invalid + the tuple (None, None) is returned. 
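+
+    For example, a path-style request for ``/bucket/key`` and a
+    virtual-hosted request to ``bucket.<storage_domain>`` with path ``/key``
+    both yield ``('bucket', 'key')``.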
+ """ + try: + bucket_in_host = parse_host(req.environ, storage_domains) + bucket, key = parse_path( + req, bucket_in_host, dns_compliant_bucket_names) + except (InvalidBucketNameParseError, InvalidURIParseError): + bucket, key = None, None + return bucket, key + + +class S3Timestamp(utils.Timestamp): + S3_XML_FORMAT = "%Y-%m-%dT%H:%M:%S.000Z" + + @property + def s3xmlformat(self): + dt = datetime.datetime.fromtimestamp( + self.ceil(), datetime.timezone.utc) + return dt.strftime(self.S3_XML_FORMAT) + + @classmethod + def from_s3xmlformat(cls, date_string): + dt = datetime.datetime.strptime(date_string, cls.S3_XML_FORMAT) + dt = dt.replace(tzinfo=datetime.timezone.utc) + seconds = calendar.timegm(dt.timetuple()) + return cls(seconds) + + @property + def amz_date_format(self): + """ + this format should be like 'YYYYMMDDThhmmssZ' + """ + return self.isoformat.replace( + '-', '').replace(':', '')[:-7] + 'Z' + + +def mktime(timestamp_str, time_format='%Y-%m-%dT%H:%M:%S'): + """ + mktime creates a float instance in epoch time really like as time.mktime + + the difference from time.mktime is allowing to 2 formats string for the + argument for the S3 testing usage. + TODO: support + + :param timestamp_str: a string of timestamp formatted as + (a) RFC2822 (e.g. date header) + (b) %Y-%m-%dT%H:%M:%S (e.g. copy result) + :param time_format: a string of format to parse in (b) process + :returns: a float instance in epoch time + """ + # time_tuple is the *remote* local time + time_tuple = email.utils.parsedate_tz(timestamp_str) + if time_tuple is None: + time_tuple = time.strptime(timestamp_str, time_format) + # add timezone info as utc (no time difference) + time_tuple += (0, ) + + # We prefer calendar.gmtime and a manual adjustment over + # email.utils.mktime_tz because older versions of Python (<2.7.4) may + # double-adjust for timezone in some situations (such when swift changes + # os.environ['TZ'] without calling time.tzset()). 
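+    # time_tuple[9] is the UTC offset in seconds parsed from the timestamp
+    # (forced to 0 by the strptime branch above), so subtracting it turns
+    # the remote local time into a UTC epoch value.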
+ epoch_time = calendar.timegm(time_tuple) - time_tuple[9] + + return epoch_time + + +class Config(dict): + DEFAULTS = { + 'storage_domains': [], + 'location': 'us-east-1', + 'force_swift_request_proxy_log': False, + 'dns_compliant_bucket_names': True, + 'allow_multipart_uploads': True, + 'allow_no_owner': False, + 'allowable_clock_skew': 900, + 'ratelimit_as_client_error': False, + 'max_upload_part_num': 1000, + } + + def __init__(self, base=None): + self.update(self.DEFAULTS) + if base is not None: + self.update(base) + + def __getattr__(self, name): + if name not in self: + raise AttributeError("No attribute '%s'" % name) + + return self[name] + + def __setattr__(self, name, value): + self[name] = value + + def __delattr__(self, name): + del self[name] + + def update(self, other): + if hasattr(other, 'keys'): + for key in other.keys(): + self[key] = other[key] + else: + for key, value in other: + self[key] = value + + def __setitem__(self, key, value): + if isinstance(self.get(key), bool): + dict.__setitem__(self, key, utils.config_true_value(value)) + elif isinstance(self.get(key), int): + try: + dict.__setitem__(self, key, int(value)) + except ValueError: + if value: # No need to raise the error if value is '' + raise + else: + dict.__setitem__(self, key, value) diff --git a/swift/common/middleware/slo.py b/swift/common/middleware/slo.py new file mode 100644 index 0000000000..6a62166ce6 --- /dev/null +++ b/swift/common/middleware/slo.py @@ -0,0 +1,1919 @@ +# Copyright (c) 2018 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Middleware that will provide Static Large Object (SLO) support. + +This feature is very similar to Dynamic Large Object (DLO) support in that +it allows the user to upload many objects concurrently and afterwards +download them as a single object. It is different in that it does not rely +on eventually consistent container listings to do so. Instead, a user +defined manifest of the object segments is used. + +---------------------- +Uploading the Manifest +---------------------- + +After the user has uploaded the objects to be concatenated, a manifest is +uploaded. The request must be a ``PUT`` with the query parameter:: + + ?multipart-manifest=put + +The body of this request will be an ordered list of segment descriptions in +JSON format. The data to be supplied for each segment is either: + +=========== ======================================================== +Key Description +=========== ======================================================== +path the path to the segment object (not including account) + /container/object_name +etag (optional) the ETag given back when the segment object + was PUT +size_bytes (optional) the size of the complete segment object in + bytes +range (optional) the (inclusive) range within the object to + use as a segment. 
If omitted, the entire object is used +=========== ======================================================== + +Or: + +=========== ======================================================== +Key Description +=========== ======================================================== +data base64-encoded data to be returned +=========== ======================================================== + +.. note:: + At least one object-backed segment must be included. If you'd like + to create a manifest consisting purely of data segments, consider + uploading a normal object instead. + +The format of the list will be:: + + [{"path": "/cont/object", + "etag": "etagoftheobjectsegment", + "size_bytes": 10485760, + "range": "1048576-2097151"}, + {"data": base64.b64encode("interstitial data")}, + {"path": "/cont/another-object", ...}, + ...] + +The number of object-backed segments is limited to ``max_manifest_segments`` +(configurable in proxy-server.conf, default 1000). Each segment must be at +least 1 byte. On upload, the middleware will head every object-backed segment +passed in to verify: + +1. the segment exists (i.e. the ``HEAD`` was successful); +2. the segment meets minimum size requirements; +3. if the user provided a non-null ``etag``, the etag matches; +4. if the user provided a non-null ``size_bytes``, the size_bytes matches; and +5. if the user provided a ``range``, it is a singular, syntactically correct + range that is satisfiable given the size of the object referenced. + +For inlined data segments, the middleware verifies each is valid, non-empty +base64-encoded binary data. Note that data segments *do not* count against +``max_manifest_segments``. + +Note that the ``etag`` and ``size_bytes`` keys are optional; if omitted, the +verification is not performed. If any of the objects fail to verify (not +found, size/etag mismatch, below minimum size, invalid range) then the user +will receive a 4xx error response. If everything does match, the user will +receive a 2xx response and the SLO object is ready for downloading. + +Note that large manifests may take a long time to verify; historically, +clients would need to use a long read timeout for the connection to give +Swift enough time to send a final ``201 Created`` or ``400 Bad Request`` +response. Now, clients should use the query parameters:: + + ?multipart-manifest=put&heartbeat=on + +to request that Swift send an immediate ``202 Accepted`` response and periodic +whitespace to keep the connection alive. A final response code will appear in +the body. The format of the response body defaults to text/plain but can be +either json or xml depending on the ``Accept`` header. An example body is as +follows:: + + Response Status: 201 Created + Response Body: + Etag: "8f481cede6d2ddc07cb36aa084d9a64d" + Last Modified: Wed, 25 Oct 2017 17:08:55 GMT + Errors: + +Or, as a json response:: + + {"Response Status": "201 Created", + "Response Body": "", + "Etag": "\"8f481cede6d2ddc07cb36aa084d9a64d\"", + "Last Modified": "Wed, 25 Oct 2017 17:08:55 GMT", + "Errors": []} + +Behind the scenes, on success, a JSON manifest generated from the user input is +sent to object servers with an extra ``X-Static-Large-Object: True`` header +and a modified ``Content-Type``. The items in this manifest will include the +``etag`` and ``size_bytes`` for each segment, regardless of whether the client +specified them for verification. 
The parameter ``swift_bytes=$total_size`` will +be appended to the existing ``Content-Type``, where ``$total_size`` is the sum +of all the included segments' ``size_bytes``. This extra parameter will be +hidden from the user. + +Manifest files can reference objects in separate containers, which will improve +concurrent upload speed. Objects can be referenced by multiple manifests. The +segments of a SLO manifest can even be other SLO manifests. Treat them as any +other object i.e., use the ``Etag`` and ``Content-Length`` given on the ``PUT`` +of the sub-SLO in the manifest to the parent SLO. + +While uploading a manifest, a user can send ``Etag`` for verification. It needs +to be md5 of the segments' etags, if there is no range specified. For example, +if the manifest to be uploaded looks like this:: + + [{"path": "/cont/object1", + "etag": "etagoftheobjectsegment1", + "size_bytes": 10485760}, + {"path": "/cont/object2", + "etag": "etagoftheobjectsegment2", + "size_bytes": 10485760}] + +The Etag of the above manifest would be md5 of ``etagoftheobjectsegment1`` and +``etagoftheobjectsegment2``. This could be computed in the following way:: + + echo -n 'etagoftheobjectsegment1etagoftheobjectsegment2' | md5sum + +If a manifest to be uploaded with a segment range looks like this:: + + [{"path": "/cont/object1", + "etag": "etagoftheobjectsegmentone", + "size_bytes": 10485760, + "range": "1-2"}, + {"path": "/cont/object2", + "etag": "etagoftheobjectsegmenttwo", + "size_bytes": 10485760, + "range": "3-4"}] + +While computing the Etag of the above manifest, internally each segment's etag +will be taken in the form of ``etagvalue:rangevalue;``. Hence the Etag of the +above manifest would be:: + + echo -n 'etagoftheobjectsegmentone:1-2;etagoftheobjectsegmenttwo:3-4;' \ + | md5sum + +For the purposes of Etag computations, inlined data segments are considered to +have an etag of the md5 of the raw data (i.e., *not* base64-encoded). + + +------------------- +Range Specification +------------------- + +Users now have the ability to specify ranges for SLO segments. +Users can include an optional ``range`` field in segment descriptions +to specify which bytes from the underlying object should be used for the +segment data. Only one range may be specified per segment. + +.. note:: + + The ``etag`` and ``size_bytes`` fields still describe the backing object + as a whole. + +If a user uploads this manifest:: + + [{"path": "/con/obj_seg_1", "size_bytes": 2097152, "range": "0-1048576"}, + {"path": "/con/obj_seg_2", "size_bytes": 2097152, + "range": "512-1550000"}, + {"path": "/con/obj_seg_1", "size_bytes": 2097152, "range": "-2048"}] + +The segment will consist of the first 1048576 bytes of /con/obj_seg_1, +followed by bytes 513 through 1550000 (inclusive) of /con/obj_seg_2, and +finally bytes 2095104 through 2097152 (i.e., the last 2048 bytes) of +/con/obj_seg_1. + +.. note:: + + The minimum sized range is 1 byte. This is the same as the minimum + segment size. + + +------------------------- +Inline Data Specification +------------------------- + +When uploading a manifest, users can include 'data' segments that should +be included along with objects. The data in these segments must be +base64-encoded binary data and will be included in the etag of the +resulting large object exactly as if that data had been uploaded and +referenced as separate objects. + +.. 
note:: + + This feature is primarily aimed at reducing the need for storing + many tiny objects, and as such any supplied data must fit within + the maximum manifest size (default is 8MiB). This maximum size + can be configured via ``max_manifest_size`` in proxy-server.conf. + + +------------------------- +Retrieving a Large Object +------------------------- + +A ``GET`` request to the manifest object will return the concatenation of the +objects from the manifest much like DLO. If any of the segments from the +manifest are not found or their ``Etag``/``Content-Length`` have changed since +upload, the connection will drop. In this case a ``409 Conflict`` will be +logged in the proxy logs and the user will receive incomplete results. Note +that this will be enforced regardless of whether the user performed per-segment +validation during upload. + +The headers from this ``GET`` or ``HEAD`` request will return the metadata +attached to the manifest object itself with some exceptions: + +===================== ================================================== +Header Value +===================== ================================================== +Content-Length the total size of the SLO (the sum of the sizes of + the segments in the manifest) +X-Static-Large-Object the string "True" +Etag the etag of the SLO (generated the same way as DLO) +===================== ================================================== + +A ``GET`` request with the query parameter:: + + ?multipart-manifest=get + +will return a transformed version of the original manifest, containing +additional fields and different key names. For example, the first manifest in +the example above would look like this:: + + [{"name": "/cont/object", + "hash": "etagoftheobjectsegment", + "bytes": 10485760, + "range": "1048576-2097151"}, ...] + +As you can see, some of the fields are renamed compared to the put request: +*path* is *name*, *etag* is *hash*, *size_bytes* is *bytes*. The *range* field +remains the same (if present). + +A GET request with the query parameters:: + + ?multipart-manifest=get&format=raw + +will return the contents of the original manifest as it was sent by the client. +The main purpose for both calls is solely debugging. + +A GET request to a manifest object with the query parameter:: + + ?part-number= + +will return the contents of the ``nth`` segment. Segments are indexed from 1, +so ``n`` must be an integer between 1 and the total number of segments in the +manifest. The response status will be ``206 Partial Content`` and its headers +will include: an ``X-Parts-Count`` header equal to the total number of +segments; a ``Content-Length`` header equal to the length of the specified +segment; a ``Content-Range`` header describing the byte range of the specified +part within the SLO. A HEAD request with a ``part-number`` parameter will also +return a response with status ``206 Partial Content`` and the same headers. + +.. note:: + + When the manifest object is uploaded you are more or less guaranteed that + every segment in the manifest exists and matched the specifications. + However, there is nothing that prevents the user from breaking the SLO + download by deleting/replacing a segment referenced in the manifest. It is + left to the user to use caution in handling the segments. + + +----------------------- +Deleting a Large Object +----------------------- + +A ``DELETE`` request will just delete the manifest object itself. The segment +data referenced by the manifest will remain unchanged. 
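+
+For illustration only (hypothetical storage URL, account and token), such a
+plain delete of the manifest could be issued as::
+
+    # illustrative values; substitute your own storage URL and token
+    curl -X DELETE -H "X-Auth-Token: $TOKEN" \
+        https://swift.example.com/v1/AUTH_test/cont/manifest-object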
+ +A ``DELETE`` with a query parameter:: + + ?multipart-manifest=delete + +will delete all the segments referenced in the manifest and then the manifest +itself. The failure response will be similar to the bulk delete middleware. + +A ``DELETE`` with the query parameters:: + + ?multipart-manifest=delete&async=yes + +will schedule all the segments referenced in the manifest to be deleted +asynchronously and then delete the manifest itself. Note that segments will +continue to appear in listings and be counted for quotas until they are +cleaned up by the object-expirer. This option is only available when all +segments are in the same container and none of them are nested SLOs. + +------------------------ +Modifying a Large Object +------------------------ + +``PUT`` and ``POST`` requests will work as expected; ``PUT``\s will just +overwrite the manifest object for example. + +------------------ +Container Listings +------------------ + +In a container listing the size listed for SLO manifest objects will be the +``total_size`` of the concatenated segments in the manifest. The overall +``X-Container-Bytes-Used`` for the container (and subsequently for the account) +will not reflect ``total_size`` of the manifest but the actual size of the JSON +data stored. The reason for this somewhat confusing discrepancy is we want the +container listing to reflect the size of the manifest object when it is +downloaded. We do not, however, want to count the bytes-used twice (for both +the manifest and the segments it's referring to) in the container and account +metadata which can be used for stats and billing purposes. +""" + +import base64 +from collections import defaultdict +from datetime import datetime +import json +import mimetypes +import re +import time + +from swift.cli.container_deleter import make_delete_jobs +from swift.common.header_key_dict import HeaderKeyDict +from swift.common.exceptions import ListingIterError, SegmentError +from swift.common.middleware.listing_formats import \ + MAX_CONTAINER_LISTING_CONTENT_LENGTH +from swift.common.swob import Request, HTTPBadRequest, HTTPServerError, \ + HTTPMethodNotAllowed, HTTPRequestEntityTooLarge, HTTPLengthRequired, \ + HTTPOk, HTTPPreconditionFailed, HTTPException, HTTPNotFound, \ + HTTPUnauthorized, HTTPConflict, HTTPUnprocessableEntity, \ + HTTPServiceUnavailable, Response, Range, normalize_etag, \ + RESPONSE_REASONS, str_to_wsgi, bytes_to_wsgi, wsgi_to_str, wsgi_quote +from swift.common.utils import get_logger, config_true_value, \ + override_bytes_from_content_type, split_path, \ + RateLimitedIterator, quote, closing_if_possible, \ + LRUCache, StreamingPile, strict_b64decode, Timestamp, friendly_close, \ + md5, parse_header +from swift.common.registry import register_swift_info +from swift.common.request_helpers import SegmentedIterable, \ + get_sys_meta_prefix, update_etag_is_at_header, resolve_etag_is_at_header, \ + get_container_update_override_key, update_ignore_range_header, \ + get_param, get_valid_part_num, get_heartbeat_response_body +from swift.common.constraints import check_utf8 +from swift.common.http import HTTP_NOT_FOUND, HTTP_UNAUTHORIZED +from swift.common.wsgi import WSGIContext, make_subrequest, make_env, \ + make_pre_authed_request +from swift.common.middleware.bulk import ACCEPTABLE_FORMATS, Bulk +from swift.obj import expirer +from swift.proxy.controllers.base import get_container_info + + +DEFAULT_RATE_LIMIT_UNDER_SIZE = 1024 ** 2 # 1 MiB +DEFAULT_MAX_MANIFEST_SEGMENTS = 1000 +DEFAULT_MAX_MANIFEST_SIZE = 8 * (1024 ** 
2) # 8 MiB +DEFAULT_YIELD_FREQUENCY = 10 + + +SLO_KEYS = { + # required: optional + 'data': set(), + 'path': {'range', 'etag', 'size_bytes'}, +} + +SYSMETA_SLO_ETAG = get_sys_meta_prefix('object') + 'slo-etag' +SYSMETA_SLO_SIZE = get_sys_meta_prefix('object') + 'slo-size' + + +def parse_and_validate_input(req_body, req_path): + """ + Given a request body, parses it and returns a list of dictionaries. + + The output structure is nearly the same as the input structure, but it + is not an exact copy. Given a valid object-backed input dictionary + ``d_in``, its corresponding output dictionary ``d_out`` will be as follows: + + * d_out['etag'] == d_in['etag'] + + * d_out['path'] == d_in['path'] + + * d_in['size_bytes'] can be a string ("12") or an integer (12), but + d_out['size_bytes'] is an integer. + + * (optional) d_in['range'] is a string of the form "M-N", "M-", or + "-N", where M and N are non-negative integers. d_out['range'] is the + corresponding swob.Range object. If d_in does not have a key + 'range', neither will d_out. + + Inlined data dictionaries will have any extraneous padding stripped. + + :raises: HTTPException on parse errors or semantic errors (e.g. bogus + JSON structure, syntactically invalid ranges) + + :returns: a list of dictionaries on success + """ + try: + parsed_data = json.loads(req_body) + except ValueError: + raise HTTPBadRequest("Manifest must be valid JSON.\n") + + if not isinstance(parsed_data, list): + raise HTTPBadRequest("Manifest must be a list.\n") + + # If we got here, req_path refers to an object, so this won't ever raise + # ValueError. + vrs, account, _junk = split_path(req_path, 3, 3, True) + + errors = [] + for seg_index, seg_dict in enumerate(parsed_data): + if not isinstance(seg_dict, dict): + errors.append(b"Index %d: not a JSON object" % seg_index) + continue + + for required in SLO_KEYS: + if required in seg_dict: + segment_type = required + break + else: + errors.append( + b"Index %d: expected keys to include one of %s" + % (seg_index, + b" or ".join(repr(required) for required in SLO_KEYS))) + continue + + allowed_keys = SLO_KEYS[segment_type].union([segment_type]) + extraneous_keys = [k for k in seg_dict if k not in allowed_keys] + if extraneous_keys: + errors.append( + b"Index %d: extraneous keys %s" + % (seg_index, + b", ".join(json.dumps(ek).encode('ascii') + for ek in sorted(extraneous_keys)))) + continue + + if segment_type == 'path': + if not isinstance(seg_dict['path'], str): + errors.append(b"Index %d: \"path\" must be a string" % + seg_index) + continue + if not (seg_dict.get('etag') is None or + isinstance(seg_dict['etag'], str)): + errors.append(b'Index %d: "etag" must be a string or null ' + b'(if provided)' % seg_index) + continue + + if '/' not in seg_dict['path'].strip('/'): + errors.append( + b"Index %d: path does not refer to an object. Path must " + b"be of the form /container/object." % seg_index) + continue + + seg_size = seg_dict.get('size_bytes') + if seg_size is not None: + try: + seg_size = int(seg_size) + seg_dict['size_bytes'] = seg_size + except (TypeError, ValueError): + errors.append(b"Index %d: invalid size_bytes" % seg_index) + continue + if seg_size < 1 and seg_index != (len(parsed_data) - 1): + errors.append(b"Index %d: too small; each segment must be " + b"at least 1 byte." 
+ % (seg_index,)) + continue + + obj_path = '/'.join(['', vrs, account, + quote(seg_dict['path'].lstrip('/'))]) + if req_path == obj_path: + errors.append( + b"Index %d: manifest must not include itself as a segment" + % seg_index) + continue + + if seg_dict.get('range'): + try: + seg_dict['range'] = Range('bytes=%s' % seg_dict['range']) + except ValueError: + errors.append(b"Index %d: invalid range" % seg_index) + continue + + if len(seg_dict['range'].ranges) > 1: + errors.append(b"Index %d: multiple ranges " + b"(only one allowed)" % seg_index) + continue + + # If the user *told* us the object's size, we can check range + # satisfiability right now. If they lied about the size, we'll + # fail that validation later. + if (seg_size is not None and 1 != len( + seg_dict['range'].ranges_for_length(seg_size))): + errors.append(b"Index %d: unsatisfiable range" % seg_index) + continue + + elif segment_type == 'data': + # Validate that the supplied data is non-empty and base64-encoded + try: + data = strict_b64decode(seg_dict['data']) + except ValueError: + errors.append( + b"Index %d: data must be valid base64" % seg_index) + continue + if len(data) < 1: + errors.append(b"Index %d: too small; each segment must be " + b"at least 1 byte." + % (seg_index,)) + continue + # re-encode to normalize padding + seg_dict['data'] = base64.b64encode(data).decode('ascii') + + if parsed_data and all('data' in d for d in parsed_data): + errors.append(b"Inline data segments require at least one " + b"object-backed segment.") + + if errors: + error_message = b"".join(e + b"\n" for e in errors) + raise HTTPBadRequest(error_message, + headers={"Content-Type": "text/plain"}) + + return parsed_data + + +def _annotate_segments(segments, logger=None): + """ + Decode any inlined data and update sub_slo segments bytes from content-type + when available; then annotate segment dicts in segments list with + 'segment_length'. + + N.B. raw_data segments don't have a bytes key and range-segments need to + calculate their length from their range key but afterwards all segments + dicts will have 'segment_length' representing the length of the segment. + """ + for seg_dict in segments: + if 'data' in seg_dict: + seg_dict['raw_data'] = base64.b64decode(seg_dict.pop('data')) + segment_length = len(seg_dict['raw_data']) + else: + if config_true_value(seg_dict.get('sub_slo')): + override_bytes_from_content_type( + seg_dict, logger=logger) + seg_range = seg_dict.get('range') + if seg_range is not None: + # The range is of the form N-M, where N and M are both + # positive decimal integers. We know this because this + # middleware is the only thing that creates the SLO + # manifests stored in the cluster. + range_start, range_end = [ + int(x) for x in seg_range.split('-')] + segment_length = (range_end - range_start) + 1 + else: + segment_length = int(seg_dict['bytes']) + seg_dict['segment_length'] = segment_length + + +def calculate_byterange_for_part_num(req, segments, part_num): + """ + Helper function to calculate the byterange for a part_num response. + + N.B. as a side-effect of calculating the single tuple representing the + byterange required for a part_num response this function will also mutate + the request's Range header so that swob knows to return 206. 
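+
+    For example, with three 10-byte segments, ``part_num=2`` sets
+    ``Range: bytes=10-19`` on the request and returns ``(10, 19)``.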
+ + :param req: the request object + :param segments: the list of seg_dicts + :param part_num: the part number of the object to return + + :returns: a tuple representing the byterange + """ + start = 0 + for seg in segments[:part_num - 1]: + start += seg['segment_length'] + last = start + segments[part_num - 1]['segment_length'] + # We need to mutate the request's Range header so that swob knows to + # handle these partial content requests correctly. + req.range = "bytes=%d-%d" % (start, last - 1) + return start, last - 1 + + +def calculate_byteranges(req, segments, resp_attrs, part_num): + """ + Calculate the byteranges based on the request, segments, and part number. + + N.B. as a side-effect of calculating the single tuple representing the + byterange required for a part_num response this function will also mutate + the request's Range header so that swob knows to return 206. + + :param req: the request object + :param segments: the list of seg_dicts + :param resp_attrs: the slo response attributes + :param part_num: the part number of the object to return + + :returns: a list of tuples representing byteranges + """ + if req.range: + byteranges = [ + # For some reason, swob.Range.ranges_for_length adds 1 to the + # last byte's position. + (start, end - 1) for start, end + in req.range.ranges_for_length(resp_attrs.slo_size)] + elif part_num: + byteranges = [ + calculate_byterange_for_part_num(req, segments, part_num)] + else: + byteranges = [(0, resp_attrs.slo_size - 1)] + + return byteranges + + +class RespAttrs(object): + """ + Encapsulate properties of a GET or HEAD response that are pertinent to + handling a potential SLO response. + + Instances of this class are typically constructed using the + ``from_headers`` method. + + :param is_slo: True if the response appears to be an SLO manifest, False + otherwise. + :param timestamp: a value from which an instance of + :class:`~swift.common.utils.Timestamp` can be constructed. + :param manifest_etag: the Etag of the manifest object, or None if + ``is_slo`` is False. + :param slo_etag: the Etag of the SLO. + :param slo_size: the size of the SLO. + """ + def __init__(self, is_slo, timestamp, manifest_etag, slo_etag, slo_size): + self.is_slo = bool(is_slo) + self.timestamp = Timestamp(timestamp or Timestamp.zero()) + # manifest_etag is unambiguous, but json_md5 is even more explicit + self.json_md5 = manifest_etag or '' + self.slo_etag = slo_etag or '' + try: + # even though it's from sysmeta, we have to worry about empty + # values - see test_get_invalid_sysmeta_passthrough + self.slo_size = int(slo_size) + except (ValueError, TypeError): + self.slo_size = -1 + self.is_legacy = not self._has_size_and_etag() + + def _has_size_and_etag(self): + return self.slo_size >= 0 and self.slo_etag + + @classmethod + def from_headers(cls, response_headers): + """ + Inspect response headers and extract any resp_attrs we can find. 
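+
+        Only ``X-Static-Large-Object``, ``X-Backend-Timestamp``, ``Etag`` and
+        the SLO etag/size sysmeta headers are inspected; all other response
+        headers are ignored.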
+ + :param response_headers: list of tuples from a object response + :returns: an instance of RespAttrs to represent the response headers + """ + is_slo = False + timestamp = None + found_etag = None + slo_etag = None + slo_size = None + for header, value in response_headers: + header = header.lower() + if header == 'x-static-large-object': + is_slo = config_true_value(value) + elif header == 'x-backend-timestamp': + timestamp = value + elif header == 'etag': + found_etag = value + elif header == SYSMETA_SLO_ETAG: + slo_etag = value + elif header == SYSMETA_SLO_SIZE: + slo_size = value + manifest_etag = found_etag if is_slo else None + return cls(is_slo, timestamp, manifest_etag, slo_etag, slo_size) + + def update_from_segments(self, segments): + """ + Always called if SLO has fetched the manifest response body, for + legacy manifests we'll calculate size/etag values we wouldn't have + gotten from sys-meta headers. + """ + # we only have to set size/etag once; it doesn't matter if we got the + # values from sysmeta headers or segments + if self._has_size_and_etag(): + return + + calculated_size = 0 + calculated_etag = md5(usedforsecurity=False) + + for seg_dict in segments: + calculated_size += seg_dict['segment_length'] + + if 'raw_data' in seg_dict: + r = md5(seg_dict['raw_data'], + usedforsecurity=False).hexdigest() + elif seg_dict.get('range'): + r = '%s:%s;' % (seg_dict['hash'], seg_dict['range']) + else: + r = seg_dict['hash'] + calculated_etag.update(r.encode('ascii')) + + self.slo_size = calculated_size + self.slo_etag = calculated_etag.hexdigest() + + +class SloGetContext(WSGIContext): + + max_slo_recursion_depth = 10 + + def __init__(self, slo): + self.slo = slo + super(SloGetContext, self).__init__(slo.app) + # we'll know more after we look at the response metadata + self.segment_listing_needed = False + + def _fetch_sub_slo_segments(self, req, version, acc, con, obj): + """ + Fetch the submanifest, parse it, and return it. + Raise exception on failures. + + :param req: the upstream request + :param version: whatever + :param acc: native + :param con: native + :param obj: native + """ + sub_req = make_subrequest( + req.environ, + path=wsgi_quote('/'.join([ + '', str_to_wsgi(version), + str_to_wsgi(acc), str_to_wsgi(con), str_to_wsgi(obj)])), + method='GET', + headers={'x-auth-token': req.headers.get('x-auth-token')}, + agent='%(orig)s SLO MultipartGET', swift_source='SLO') + params_copy = dict(req.params) + params_copy.pop('part-number', None) + sub_req.params = params_copy + sub_resp = sub_req.get_response(self.slo.app) + + if not sub_resp.is_success: + # Error message should be short + body = sub_resp.body.decode('utf-8') + msg = ('while fetching %s, GET of submanifest %s ' + 'failed with status %d (%s)') + raise ListingIterError(msg % ( + req.path, sub_req.path, sub_resp.status_int, + body if len(body) <= 60 else body[:57] + '...')) + + try: + return self._parse_segments(sub_resp.app_iter) + except HTTPException as err: + raise ListingIterError( + 'while fetching %s, JSON-decoding of submanifest %s ' + 'failed with %s' % (req.path, sub_req.path, err)) + + def _segment_path(self, version, account, seg_dict): + return "/{ver}/{acc}/{conobj}".format( + ver=version, acc=account, + conobj=seg_dict['name'].lstrip('/') + ) + + def _segment_listing_iterator(self, req, version, account, segments, + byteranges): + # We handle the range stuff here so that we can be smart about + # skipping unused submanifests. 
For example, if our first segment is a + # submanifest referencing 50 MiB total, but start_byte falls in + # the 51st MiB, then we can avoid fetching the first submanifest. + # + # If we were to make SegmentedIterable handle all the range + # calculations, we would be unable to make this optimization. + + # Cache segments from sub-SLOs in case more than one byterange + # includes data from a particular sub-SLO. We only cache a few sets + # of segments so that a malicious user cannot build a giant SLO tree + # and then GET it to run the proxy out of memory. + # + # LRUCache is a little awkward to use this way, but it beats doing + # things manually. + # + # 20 is sort of an arbitrary choice; it's twice our max recursion + # depth, so we know this won't expand memory requirements by too + # much. + cached_fetch_sub_slo_segments = \ + LRUCache(maxsize=20)(self._fetch_sub_slo_segments) + + for first_byte, last_byte in byteranges: + byterange_listing_iter = self._byterange_listing_iterator( + req, version, account, segments, first_byte, last_byte, + cached_fetch_sub_slo_segments) + for seg_info in byterange_listing_iter: + yield seg_info + + def _byterange_listing_iterator(self, req, version, account, segments, + first_byte, last_byte, + cached_fetch_sub_slo_segments, + recursion_depth=1): + """ + Iterable that generates a filtered and annotated stream of segment + dicts describing the sub-segment ranges that would be used by the + SegmentedIterable to construct the bytes for a ranged response. + + :param req: original request object + :param version: version + :param account: account + :param segments: segments dictionary + :param first_byte: offset into the large object for the first byte + that is returned to the client + :param last_byte: offset into the large object for the last byte + that is returned to the client + :param cached_fetch_sub_slo_segments: LRU cache used for fetching + sub-segments + :param recursion_depth: max number of recursive sub_slo calls + """ + last_sub_path = None + for seg_dict in segments: + seg_length = seg_dict['segment_length'] + if first_byte >= seg_length: + # don't need any bytes from this segment + first_byte -= seg_length + last_byte -= seg_length + continue + + if last_byte < 0: + # no bytes are needed from this or any future segment + return + + if 'raw_data' in seg_dict: + yield dict(seg_dict, + first_byte=max(0, first_byte), + last_byte=min(seg_length - 1, last_byte)) + first_byte -= seg_length + last_byte -= seg_length + continue + + seg_range = seg_dict.get('range') + if seg_range is None: + range_start, range_end = 0, seg_length - 1 + else: + # This simple parsing of the range is valid because we already + # validated and supplied concrete values for the range + # during SLO manifest creation + range_start, range_end = map(int, seg_range.split('-')) + + if config_true_value(seg_dict.get('sub_slo')): + # Do this check here so that we can avoid fetching this last + # manifest before raising the exception + if recursion_depth >= self.max_slo_recursion_depth: + raise ListingIterError( + "While processing manifest %r, " + "max recursion depth was exceeded" % req.path) + + sub_path = seg_dict['name'] + sub_cont, sub_obj = split_path(sub_path, 2, 2, True) + if last_sub_path != sub_path: + sub_segments = cached_fetch_sub_slo_segments( + req, version, account, sub_cont, sub_obj) + last_sub_path = sub_path + + # Use the existing machinery to slice into the sub-SLO. 
+ for sub_seg_dict in self._byterange_listing_iterator( + req, version, account, sub_segments, + # This adjusts first_byte and last_byte to be + # relative to the sub-SLO. + range_start + max(0, first_byte), + min(range_end, range_start + last_byte), + + cached_fetch_sub_slo_segments, + recursion_depth=recursion_depth + 1): + yield sub_seg_dict + else: + yield dict(seg_dict, + first_byte=max(0, first_byte) + range_start, + last_byte=min(range_end, range_start + last_byte)) + + first_byte -= seg_length + last_byte -= seg_length + + def _is_body_complete(self): + content_range = '' + for header, value in self._response_headers: + if header.lower() == 'content-range': + content_range = value + break + # e.g. Content-Range: bytes 0-14289/14290 + match = re.match(r'bytes (\d+)-(\d+)/(\d+)$', content_range) + if not match: + # Malformed or missing, so we don't know what we got. + return False + first_byte, last_byte, length = [int(x) for x in match.groups()] + # If and only if we actually got back the full manifest body, then + # we can avoid re-fetching the object. + return first_byte == 0 and last_byte == length - 1 + + def _need_to_refetch_manifest(self, req, resp_attrs, is_part_num_request): + """ + Check if the segments will be needed to service the request and update + the segment_listing_needed attribute. + + :return: boolean indicating if we need to refetch, only if the segments + ARE needed we MAY need to refetch them! + """ + if req.method == 'HEAD': + # There may be some cases in the future where a HEAD resp on even a + # modern manifest should refetch, e.g. lp bug #2029174 + self.segment_listing_needed = (resp_attrs.is_legacy or + is_part_num_request) + # it will always be the case that a HEAD must re-fetch iff + # segment_listing_needed + return self.segment_listing_needed + + last_resp_status_int = self._get_status_int() + # These are based on etag (or last-modified), but the SLO's etag is + # almost certainly not the manifest object's etag. Still, it's highly + # likely that the submitted If-None-Match won't match the manifest + # object's etag, so we can avoid re-fetching the manifest if we got a + # successful response. + if last_resp_status_int in (412, 304): + # a conditional response from a modern manifest would have an + # accurate SLO etag, AND comparison with the etag-is-at header, but + # for legacy manifests responses (who always need to calculate the + # correct etag, even for if-[un]modified-since errors) we can't say + # what the etag is or if it matches unless we calculate it from + # segments - so we always need them + self.segment_listing_needed = resp_attrs.is_legacy + # if we need them; we can't get them from the error + return self.segment_listing_needed + + # This is GET request for an SLO object, if we're going to return a + # successful response we're going to need the segments, but this + # resp_iter may not contain the entire SLO manifest. + self.segment_listing_needed = True + + # modern swift object-servers should ignore Range headers on manifests, + # but during upgrade if we get a range response we'll probably have to + # refetch + if last_resp_status_int == 416: + # if the range wasn't satisfiable we need to refetch + return True + elif last_resp_status_int == 206: + # a partial response might included the whole content-range?! 
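+            # _is_body_complete() compares the Content-Range header against
+            # the total length; only a 206 that actually covered bytes
+            # 0..length-1 lets us reuse this body without a refetch.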
+ return not self._is_body_complete() + else: + # a good number of error responses would have returned earlier for + # lacking is_slo sys-meta, at this point we've filtered all the + # other response codes, so this is a prefectly normal 200 response, + # no need to refetch + return False + + def _refetch_manifest(self, req, resp_iter, orig_resp_attrs): + req.environ['swift.non_client_disconnect'] = True + friendly_close(resp_iter) + del req.environ['swift.non_client_disconnect'] + + headers_subset = ['x-auth-token', 'x-open-expired'] + get_req = make_subrequest( + req.environ, method='GET', + headers={k: req.headers.get(k) + for k in headers_subset if k in req.headers}, + agent='%(orig)s SLO MultipartGET', swift_source='SLO') + resp_iter = self._app_call(get_req.environ) + new_resp_attrs = RespAttrs.from_headers(self._response_headers) + if new_resp_attrs.timestamp < orig_resp_attrs.timestamp and \ + not new_resp_attrs.is_slo: + # Our *orig_resp_attrs* saw *newer* data that indicated it was an + # SLO, but on refetch it's an older object or error; 503 seems + # reasonable? + friendly_close(resp_iter) + raise HTTPServiceUnavailable(request=req) + # else, the caller will know how to return this response + return new_resp_attrs, resp_iter + + def _parse_segments(self, resp_iter): + """ + Read the manifest body and parse segments. + + :returns: segments + :raises: HTTPServerError + """ + segments = self._get_manifest_read(resp_iter) + _annotate_segments(segments, logger=self.slo.logger) + return segments + + def _return_manifest_response(self, req, start_response, resp_iter, + is_format_raw): + if is_format_raw: + json_data = self.convert_segment_listing(resp_iter) + # we've created a new response body + resp_iter = [json_data] + replace_headers = { + # Note that we have to return the large object's content-type + # (not application/json) so it's like what the client sent on + # PUT. Otherwise, server-side copy won't work. 
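+                # Only Content-Length and Etag are replaced to match the
+                # converted body; the stored Content-Type passes through
+                # from the manifest response untouched.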
+ 'Content-Length': len(json_data), + 'Etag': md5(json_data, usedforsecurity=False).hexdigest(), + } + else: + # we're going to return the manifest resp_iter as-is + replace_headers = { + 'Content-Type': 'application/json; charset=utf-8', + } + return self._return_response(req, start_response, resp_iter, + replace_headers) + + def _return_slo_response(self, req, start_response, resp_iter, resp_attrs): + headers = { + 'Etag': '"%s"' % resp_attrs.slo_etag, + 'X-Manifest-Etag': resp_attrs.json_md5, + # swob will fix this for a GET with Range + 'Content-Length': str(resp_attrs.slo_size), + # ignore bogus content-range, make swob figure it out + 'Content-Range': None, + } + if self.segment_listing_needed: + # consume existing resp_iter; we'll create a new one + segments = self._parse_segments(resp_iter) + resp_attrs.update_from_segments(segments) + headers['Etag'] = '"%s"' % resp_attrs.slo_etag + headers['Content-Length'] = str(resp_attrs.slo_size) + part_num = get_valid_part_num(req) + if part_num: + headers['X-Parts-Count'] = len(segments) + + if part_num and part_num > len(segments): + if req.method == 'HEAD': + resp_iter = [] + headers['Content-Length'] = '0' + else: + body = b'The requested part number is not satisfiable' + resp_iter = [body] + headers['Content-Length'] = len(body) + headers['Content-Range'] = 'bytes */%d' % resp_attrs.slo_size + self._response_status = '416 Requested Range Not Satisfiable' + elif part_num and req.method == 'HEAD': + resp_iter = [] + headers['Content-Length'] = \ + segments[part_num - 1].get('segment_length') + start, end = calculate_byterange_for_part_num( + req, segments, part_num) + headers['Content-Range'] = \ + 'bytes {}-{}/{}'.format(start, end, + resp_attrs.slo_size) + # The RFC specifies 206 in the context of Range requests, and + # Range headers MUST be ignored for HEADs [1], so a HEAD will + # not normally return a 206. However, a part-number HEAD + # returns Content-Length equal to the part size, rather than + # the whole object size, so in this case we do return 206. + # [1] https://www.rfc-editor.org/rfc/rfc9110#name-range + self._response_status = '206 Partial Content' + elif req.method == 'HEAD': + resp_iter = [] + else: + byteranges = calculate_byteranges( + req, segments, resp_attrs, part_num) + resp_iter = self._build_resp_iter(req, segments, byteranges) + return self._return_response(req, start_response, resp_iter, + replace_headers=headers) + + def _return_response(self, req, start_response, resp_iter, + replace_headers): + if req.method == 'HEAD' or self._get_status_int() in (412, 304): + # we should drain HEAD and unmet condition responses since they + # don't have bodies + friendly_close(resp_iter) + resp_iter = b'' + resp_headers = HeaderKeyDict(self._response_headers, **replace_headers) + resp = Response( + status=self._response_status, + headers=resp_headers, + app_iter=resp_iter, + request=req, + conditional_response=True, + conditional_etag=resolve_etag_is_at_header(req, resp_headers)) + return resp(req.environ, start_response) + + def _return_non_slo_response(self, req, start_response, resp_iter): + # our "pass-through" response may have been from a manifest refetch w/o + # range/conditional headers that turned out to be a real object, and + # now we want out. 
But if the original client request included Range + # or Conditional headers we can trust swob to do the right conversion + # back into a 206/416/304/412 (as long as the response we have is a + # normal successful response and we respect any forwarding middleware's + # etag-is-at header that we stripped off for the refetch!) + resp = Response( + status=self._response_status, + headers=self._response_headers, + app_iter=resp_iter, + request=req, + conditional_response=self._get_status_int() == 200, + conditional_etag=resolve_etag_is_at_header( + req, self._response_headers) + ) + return resp(req.environ, start_response) + + def handle_slo_get_or_head(self, req, start_response): + """ + Takes a request and a start_response callable and does the normal WSGI + thing with them. Returns an iterator suitable for sending up the WSGI + chain. + + :param req: :class:`~swift.common.swob.Request` object; is a ``GET`` or + ``HEAD`` request aimed at what may (or may not) be a static + large object manifest. + :param start_response: WSGI start_response callable + """ + is_manifest_get = get_param(req, 'multipart-manifest') == 'get' + is_format_raw = is_manifest_get and get_param(req, 'format') == 'raw' + + if not is_manifest_get: + # If this object is an SLO manifest, we may have saved off the + # large object etag during the original PUT. Send an + # X-Backend-Etag-Is-At header so that, if the SLO etag *was* saved, + # we can trust the object-server to respond appropriately to + # If-Match/If-None-Match requests. + update_etag_is_at_header(req, SYSMETA_SLO_ETAG) + # Tell the object server that if it's a manifest, + # we want the whole thing + update_ignore_range_header(req, 'X-Static-Large-Object') + + # process original request + orig_path_info = req.path_info + resp_iter = self._app_call(req.environ) + resp_attrs = RespAttrs.from_headers(self._response_headers) + if resp_attrs.is_slo and not is_manifest_get: + try: + # only validate part-number if the request is to an SLO + part_num = get_valid_part_num(req) + except HTTPException: + friendly_close(resp_iter) + raise + # the next two calls hide a couple side effects, sorry: + # + # 1) regardless of the return value the "need_to_refetch" check + # *may* also set self.segment_listing_needed = True (it's + # commented to help you wrap your head around that one, + # good luck) + # 2) if we refetch, we overwrite the current resp_iter and + # resp_attrs variables, partly because we *might* get back a NOT + # resp_attrs.is_slo response (even if we had one to start), but + # hopefully they're just the manifest resp we needed to refetch! + if self._need_to_refetch_manifest(req, resp_attrs, part_num): + # reset path in case it was modified during original request + # (e.g. 
object versioning might re-write the path) + req.path_info = orig_path_info + resp_attrs, resp_iter = self._refetch_manifest( + req, resp_iter, resp_attrs) + + if not resp_attrs.is_slo: + # even if the original resp_attrs may have been SLO we may have + # refetched, this also handles the server error case + return self._return_non_slo_response( + req, start_response, resp_iter) + + if is_manifest_get: + # manifest pass through doesn't require resp_attrs + return self._return_manifest_response(req, start_response, + resp_iter, is_format_raw) + + # this a GET/HEAD response for the SLO object (not the manifest) + return self._return_slo_response(req, start_response, resp_iter, + resp_attrs) + + def convert_segment_listing(self, resp_iter): + """ + Converts the manifest data to match with the format + that was put in through ?multipart-manifest=put + + :param resp_iter: a response iterable + + :raises HTTPServerError: + :returns: the json-serialized raw format (as bytes) + """ + segments = self._get_manifest_read(resp_iter) + + for seg_dict in segments: + if 'data' in seg_dict: + continue + seg_dict.pop('content_type', None) + seg_dict.pop('last_modified', None) + seg_dict.pop('sub_slo', None) + seg_dict['path'] = seg_dict.pop('name', None) + seg_dict['size_bytes'] = seg_dict.pop('bytes', None) + seg_dict['etag'] = seg_dict.pop('hash', None) + + json_data = json.dumps(segments, sort_keys=True) # convert to string + return json_data.encode('utf-8') + + def _get_manifest_read(self, resp_iter): + with closing_if_possible(resp_iter): + resp_body = b''.join(resp_iter) + try: + segments = json.loads(resp_body) + except ValueError as e: + msg = 'Unable to load SLO manifest' + self.slo.logger.error('%s: %s', msg, e) + raise HTTPServerError(msg) + return segments + + def _build_resp_iter(self, req, segments, byteranges): + """ + Build a response iterable for a GET request. + + :param req: the request object + :param segments: the list of seg_dicts + :param byteranges: a list of tuples representing byteranges + + :returns: a segmented iterable + """ + ver, account, _junk = req.split_path(3, 3, rest_with_last=True) + account = wsgi_to_str(account) + plain_listing_iter = self._segment_listing_iterator( + req, ver, account, segments, byteranges) + + def ratelimit_predicate(seg_dict): + if 'raw_data' in seg_dict: + return False # it's already in memory anyway + start = seg_dict.get('start_byte') or 0 + end = seg_dict.get('end_byte') + if end is None: + end = int(seg_dict['bytes']) - 1 + is_small = (end - start + 1) < self.slo.rate_limit_under_size + return is_small + + ratelimited_listing_iter = RateLimitedIterator( + plain_listing_iter, + self.slo.rate_limit_segments_per_sec, + limit_after=self.slo.rate_limit_after_segment, + ratelimit_if=ratelimit_predicate) + + # data segments are already in the correct format, but object-backed + # segments need a path key added + segment_listing_iter = ( + seg_dict if 'raw_data' in seg_dict else + dict(seg_dict, path=self._segment_path(ver, account, seg_dict)) + for seg_dict in ratelimited_listing_iter) + + segmented_iter = SegmentedIterable( + req, self.slo.app, segment_listing_iter, + name=req.path, logger=self.slo.logger, + ua_suffix="SLO MultipartGET", + swift_source="SLO", + max_get_time=self.slo.max_get_time) + + try: + segmented_iter.validate_first_segment() + except (ListingIterError, SegmentError): + # Copy from the SLO explanation in top of this file. 
+ # If any of the segments from the manifest are not found or + # their Etag/Content Length no longer match the connection + # will drop. In this case a 409 Conflict will be logged in + # the proxy logs and the user will receive incomplete results. + raise HTTPConflict(request=req) + return segmented_iter + + +class StaticLargeObject(object): + """ + StaticLargeObject Middleware + + See above for a full description. + + The proxy logs created for any subrequests made will have swift.source set + to "SLO". + + :param app: The next WSGI filter or app in the paste.deploy chain. + :param conf: The configuration dict for the middleware. + :param max_manifest_segments: The maximum number of segments allowed in + newly-created static large objects. + :param max_manifest_size: The maximum size (in bytes) of newly-created + static-large-object manifests. + :param yield_frequency: If the client included ``heartbeat=on`` in the + query parameters when creating a new static large + object, the period of time to wait between sending + whitespace to keep the connection alive. + """ + + def __init__(self, app, conf, + max_manifest_segments=DEFAULT_MAX_MANIFEST_SEGMENTS, + max_manifest_size=DEFAULT_MAX_MANIFEST_SIZE, + yield_frequency=DEFAULT_YIELD_FREQUENCY, + allow_async_delete=True): + self.conf = conf + self.app = app + self.logger = get_logger(conf, log_route='slo') + self.max_manifest_segments = max_manifest_segments + self.max_manifest_size = max_manifest_size + self.yield_frequency = yield_frequency + self.allow_async_delete = allow_async_delete + self.max_get_time = int(self.conf.get('max_get_time', 86400)) + self.rate_limit_under_size = int(self.conf.get( + 'rate_limit_under_size', DEFAULT_RATE_LIMIT_UNDER_SIZE)) + self.rate_limit_after_segment = int(self.conf.get( + 'rate_limit_after_segment', '10')) + self.rate_limit_segments_per_sec = int(self.conf.get( + 'rate_limit_segments_per_sec', '1')) + self.concurrency = min(1000, max(0, int(self.conf.get( + 'concurrency', '2')))) + delete_concurrency = int(self.conf.get( + 'delete_concurrency', self.concurrency)) + self.bulk_deleter = Bulk( + app, {}, + max_deletes_per_request=float('inf'), + delete_concurrency=delete_concurrency, + logger=self.logger) + + self.expirer_config = expirer.ExpirerConfig(conf, logger=self.logger) + + def handle_multipart_get_or_head(self, req, start_response): + """ + Handles the GET or HEAD of a SLO manifest. + + The response body (only on GET, of course) will consist of the + concatenation of the segments. + + :param req: a :class:`~swift.common.swob.Request` with a path + referencing an object + :param start_response: WSGI start_response callable + :raises HttpException: on errors + """ + return SloGetContext(self).handle_slo_get_or_head(req, start_response) + + def handle_multipart_put(self, req, start_response): + """ + Will handle the PUT of a SLO manifest. + Heads every object in manifest to check if is valid and if so will + save a manifest generated from the user input. Uses WSGIContext to + call self and start_response and returns a WSGI iterator. 
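+        When the client passes ``heartbeat=on``, a ``202 Accepted`` response
+        is started right away and whitespace is streamed while the segments
+        are HEADed, followed by a final body describing the result.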
+ + :param req: a :class:`~swift.common.swob.Request` with an obj in path + :param start_response: WSGI start_response callable + :raises HttpException: on errors + """ + vrs, account, container, obj = req.split_path(4, rest_with_last=True) + if req.headers.get('X-Copy-From'): + raise HTTPMethodNotAllowed( + 'Multipart Manifest PUTs cannot be COPY requests') + if req.content_length is None: + if req.headers.get('transfer-encoding', '').lower() != 'chunked': + raise HTTPLengthRequired(request=req) + else: + if req.content_length > self.max_manifest_size: + raise HTTPRequestEntityTooLarge( + "Manifest File > %d bytes" % self.max_manifest_size) + parsed_data = parse_and_validate_input( + req.body_file.read(self.max_manifest_size), + wsgi_to_str(req.path)) + problem_segments = [] + + object_segments = [seg for seg in parsed_data if 'path' in seg] + if len(object_segments) > self.max_manifest_segments: + raise HTTPRequestEntityTooLarge( + 'Number of object-backed segments must be <= %d' % + self.max_manifest_segments) + try: + out_content_type = req.accept.best_match(ACCEPTABLE_FORMATS) + except ValueError: + out_content_type = 'text/plain' # Ignore invalid header + if not out_content_type: + out_content_type = 'text/plain' + data_for_storage = [None] * len(parsed_data) + total_size = 0 + path2indices = defaultdict(list) + for index, seg_dict in enumerate(parsed_data): + if 'data' in seg_dict: + data_for_storage[index] = seg_dict + total_size += len(base64.b64decode(seg_dict['data'])) + else: + path2indices[seg_dict['path']].append(index) + + def do_head(obj_name): + obj_path = '/'.join(['', vrs, account, + str_to_wsgi(obj_name.lstrip('/'))]) + obj_path = wsgi_quote(obj_path) + + sub_req = make_subrequest( + req.environ, path=obj_path + '?', # kill the query string + method='HEAD', + headers={'x-auth-token': req.headers.get('x-auth-token')}, + agent='%(orig)s SLO MultipartPUT', swift_source='SLO') + return obj_name, sub_req.get_response(self) + + def validate_seg_dict(seg_dict, head_seg_resp, allow_empty_segment): + obj_name = seg_dict['path'] + if not head_seg_resp.is_success: + problem_segments.append([quote(obj_name), + head_seg_resp.status]) + return 0, None + + segment_length = head_seg_resp.content_length + if seg_dict.get('range'): + # Since we now know the length, we can normalize the + # range. We know that there is exactly one range + # requested since we checked that earlier in + # parse_and_validate_input(). + ranges = seg_dict['range'].ranges_for_length( + head_seg_resp.content_length) + + if not ranges: + problem_segments.append([quote(obj_name), + 'Unsatisfiable Range']) + elif ranges == [(0, head_seg_resp.content_length)]: + # Just one range, and it exactly matches the object. + # Why'd we do this again? 
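+                    # (Dropping the range here means the stored entry, and
+                    # the SLO Etag computed below, match a segment uploaded
+                    # without any range at all.)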
+ del seg_dict['range'] + segment_length = head_seg_resp.content_length + else: + rng = ranges[0] + seg_dict['range'] = '%d-%d' % (rng[0], rng[1] - 1) + segment_length = rng[1] - rng[0] + + if segment_length < 1 and not allow_empty_segment: + problem_segments.append( + [quote(obj_name), + 'Too small; each segment must be at least 1 byte.']) + + _size_bytes = seg_dict.get('size_bytes') + size_mismatch = ( + _size_bytes is not None and + _size_bytes != head_seg_resp.content_length + ) + if size_mismatch: + problem_segments.append([quote(obj_name), 'Size Mismatch']) + + _etag = seg_dict.get('etag') + etag_mismatch = ( + _etag is not None and + _etag != head_seg_resp.etag + ) + if etag_mismatch: + problem_segments.append([quote(obj_name), 'Etag Mismatch']) + + if head_seg_resp.last_modified: + last_modified = head_seg_resp.last_modified + else: + # shouldn't happen + last_modified = datetime.now() + + last_modified_formatted = last_modified.strftime( + '%Y-%m-%dT%H:%M:%S.%f' + ) + seg_data = { + 'name': '/' + seg_dict['path'].lstrip('/'), + 'bytes': head_seg_resp.content_length, + 'hash': head_seg_resp.etag, + 'content_type': head_seg_resp.content_type, + 'last_modified': last_modified_formatted + } + if seg_dict.get('range'): + seg_data['range'] = seg_dict['range'] + if config_true_value( + head_seg_resp.headers.get('X-Static-Large-Object')): + seg_data['sub_slo'] = True + + return segment_length, seg_data + + heartbeat = config_true_value(req.params.get('heartbeat')) + separator = b'' + if heartbeat: + # Apparently some ways of deploying require that this to happens + # *before* the return? Not sure why. + req.environ['eventlet.minimum_write_chunk_size'] = 0 + start_response('202 Accepted', [ # NB: not 201 ! + ('Content-Type', out_content_type), + ]) + separator = b'\r\n\r\n' + + def resp_iter(total_size=total_size): + # wsgi won't propagate start_response calls until some data has + # been yielded so make sure first heartbeat is sent immediately + if heartbeat: + yield b' ' + last_yield_time = time.time() + with StreamingPile(self.concurrency) as pile: + for obj_name, resp in pile.asyncstarmap(do_head, ( + (path, ) for path in path2indices)): + now = time.time() + if heartbeat and (now - last_yield_time > + self.yield_frequency): + # Make sure we've called start_response before + # sending data + yield b' ' + last_yield_time = now + for i in path2indices[obj_name]: + segment_length, seg_data = validate_seg_dict( + parsed_data[i], resp, + allow_empty_segment=(i == len(parsed_data) - 1)) + data_for_storage[i] = seg_data + total_size += segment_length + + # Middleware left of SLO can add a callback to the WSGI + # environment to perform additional validation and/or + # manipulation on the manifest that will be written. 
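+            # The hook receives the list of segment dicts about to be
+            # stored; any problems it returns are reported to the client
+            # just like ordinary segment validation failures.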
+ hook = req.environ.get('swift.callback.slo_manifest_hook') + if hook: + more_problems = hook(data_for_storage) + if more_problems: + problem_segments.extend(more_problems) + + if problem_segments: + err = HTTPBadRequest(content_type=out_content_type) + resp_dict = {} + if heartbeat: + resp_dict['Response Status'] = err.status + err_body = err.body.decode('utf-8') + resp_dict['Response Body'] = err_body or '\n'.join( + RESPONSE_REASONS.get(err.status_int, [''])) + else: + start_response(err.status, + [(h, v) for h, v in err.headers.items() + if h.lower() != 'content-length']) + yield separator + get_heartbeat_response_body( + out_content_type, resp_dict, problem_segments, 'upload') + return + + slo_etag = md5(usedforsecurity=False) + for seg_data in data_for_storage: + if 'data' in seg_data: + raw_data = base64.b64decode(seg_data['data']) + r = md5(raw_data, usedforsecurity=False).hexdigest() + elif seg_data.get('range'): + r = '%s:%s;' % (seg_data['hash'], seg_data['range']) + else: + r = seg_data['hash'] + slo_etag.update(r.encode('ascii')) + + slo_etag = slo_etag.hexdigest() + client_etag = normalize_etag(req.headers.get('Etag')) + if client_etag and client_etag != slo_etag: + err = HTTPUnprocessableEntity(request=req) + if heartbeat: + resp_dict = {} + resp_dict['Response Status'] = err.status + err_body = err.body + if isinstance(err_body, bytes): + err_body = err_body.decode('utf-8', errors='replace') + resp_dict['Response Body'] = err_body or '\n'.join( + RESPONSE_REASONS.get(err.status_int, [''])) + yield separator + get_heartbeat_response_body( + out_content_type, resp_dict, problem_segments, + 'upload') + else: + for chunk in err(req.environ, start_response): + yield chunk + return + + json_data = json.dumps(data_for_storage).encode('utf-8') + req.body = json_data + req.headers.update({ + SYSMETA_SLO_ETAG: slo_etag, + SYSMETA_SLO_SIZE: total_size, + 'X-Static-Large-Object': 'True', + 'Etag': md5(json_data, usedforsecurity=False).hexdigest(), + }) + + # Ensure container listings have both etags. However, if any + # middleware to the left of us touched the base value, trust them. + override_header = get_container_update_override_key('etag') + val, sep, params = req.headers.get( + override_header, '').partition(';') + req.headers[override_header] = '%s; slo_etag=%s' % ( + (val or req.headers['Etag']) + sep + params, slo_etag) + + env = req.environ + if not env.get('CONTENT_TYPE'): + guessed_type, _junk = mimetypes.guess_type( + wsgi_to_str(req.path_info)) + env['CONTENT_TYPE'] = (guessed_type or + 'application/octet-stream') + env['swift.content_type_overridden'] = True + env['CONTENT_TYPE'] += ";swift_bytes=%d" % total_size + + resp = req.get_response(self.app) + resp_dict = {'Response Status': resp.status} + if resp.is_success: + resp.etag = slo_etag + resp_dict['Etag'] = resp.headers['Etag'] + resp_dict['Last Modified'] = resp.headers['Last-Modified'] + + if heartbeat: + resp_body = resp.body + if isinstance(resp_body, bytes): + resp_body = resp_body.decode('utf-8') + resp_dict['Response Body'] = resp_body + yield separator + get_heartbeat_response_body( + out_content_type, resp_dict, [], 'upload') + else: + for chunk in resp(req.environ, start_response): + yield chunk + + return resp_iter() + + def get_segments_to_delete_iter(self, req): + """ + A generator function to be used to delete all the segments and + sub-segments referenced in a manifest. 
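+        Sub-SLO manifests are expanded as they are encountered, and each
+        sub-manifest is queued again behind its own segments so that it is
+        yielded (and so deleted) only after them.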
+ + :param req: a :class:`~swift.common.swob.Request` with an SLO manifest + in path + :raises HTTPPreconditionFailed: on invalid UTF8 in request path + :raises HTTPBadRequest: on too many buffered sub segments and + on invalid SLO manifest path + """ + if not check_utf8(wsgi_to_str(req.path_info)): + raise HTTPPreconditionFailed( + request=req, body='Invalid UTF8 or contains NULL') + vrs, account, container, obj = req.split_path(4, 4, True) + obj_path = '/%s/%s' % (wsgi_to_str(container), wsgi_to_str(obj)) + + segments = [{ + 'sub_slo': True, + 'name': obj_path}] + if 'version-id' in req.params: + segments[0]['version_id'] = req.params['version-id'] + + while segments: + # We chose not to set the limit at max_manifest_segments + # in the case this value was decreased by operators. + # Still it is important to set a limit to avoid this list + # growing too large and causing OOM failures. + # x10 is a best guess as to how much operators would change + # the value of max_manifest_segments. + if len(segments) > self.max_manifest_segments * 10: + raise HTTPBadRequest( + 'Too many buffered slo segments to delete.') + seg_data = segments.pop(0) + if 'data' in seg_data: + continue + if seg_data.get('sub_slo'): + try: + segments.extend( + self.get_slo_segments(seg_data['name'], req)) + except HTTPException as err: + # allow bulk delete response to report errors + err_body = err.body + if isinstance(err_body, bytes): + err_body = err_body.decode('utf-8', errors='replace') + seg_data['error'] = {'code': err.status_int, + 'message': err_body} + + # add manifest back to be deleted after segments + seg_data['sub_slo'] = False + segments.append(seg_data) + else: + yield seg_data + + def get_slo_segments(self, obj_name, req): + """ + Performs a :class:`~swift.common.swob.Request` and returns the SLO + manifest's segments. + + :param obj_name: the name of the object being deleted, + as ``/container/object`` + :param req: the base :class:`~swift.common.swob.Request` + :raises HTTPServerError: on unable to load obj_name or + on unable to load the SLO manifest data. 
+ :raises HTTPBadRequest: on not an SLO manifest + :raises HTTPNotFound: on SLO manifest not found + :returns: SLO manifest's segments + """ + vrs, account, _junk = req.split_path(2, 3, True) + new_env = req.environ.copy() + new_env['REQUEST_METHOD'] = 'GET' + del new_env['wsgi.input'] + new_env['QUERY_STRING'] = 'multipart-manifest=get' + if 'version-id' in req.params: + new_env['QUERY_STRING'] += \ + '&version-id=' + req.params['version-id'] + new_env['CONTENT_LENGTH'] = 0 + new_env['HTTP_USER_AGENT'] = \ + '%s MultipartDELETE' % new_env.get('HTTP_USER_AGENT') + new_env['swift.source'] = 'SLO' + new_env['PATH_INFO'] = ( + '/%s/%s/%s' % (vrs, account, str_to_wsgi(obj_name.lstrip('/'))) + ) + # Just request the last byte of non-SLO objects so we don't waste + # a resources in friendly_close() below + manifest_req = Request.blank('', new_env, range='bytes=-1') + update_ignore_range_header(manifest_req, 'X-Static-Large-Object') + resp = manifest_req.get_response(self.app) + + if resp.is_success and config_true_value(resp.headers.get( + 'X-Static-Large-Object')) and len(resp.body) == 1: + # pre-2.24.0 object-server + manifest_req = Request.blank('', new_env) + resp = manifest_req.get_response(self.app) + + if resp.is_success: + if config_true_value(resp.headers.get('X-Static-Large-Object')): + try: + return json.loads(resp.body) + except ValueError: + raise HTTPServerError('Unable to load SLO manifest') + else: + # Drain and close GET request (prevents socket leaks) + friendly_close(resp) + raise HTTPBadRequest('Not an SLO manifest') + elif resp.status_int == HTTP_NOT_FOUND: + raise HTTPNotFound('SLO manifest not found') + elif resp.status_int == HTTP_UNAUTHORIZED: + raise HTTPUnauthorized('401 Unauthorized') + else: + raise HTTPServerError('Unable to load SLO manifest or segment.') + + def handle_async_delete(self, req): + if not check_utf8(wsgi_to_str(req.path_info)): + raise HTTPPreconditionFailed( + request=req, body='Invalid UTF8 or contains NULL') + vrs, account, container, obj = req.split_path(4, 4, True) + obj_path = '/%s/%s' % (wsgi_to_str(container), wsgi_to_str(obj)) + segments = [seg for seg in self.get_slo_segments(obj_path, req) + if 'data' not in seg] + if not segments: + # Degenerate case: just delete the manifest + return self.app + + segment_containers, segment_objects = zip(*( + split_path(seg['name'], 2, 2, True) for seg in segments)) + segment_containers = set(segment_containers) + if len(segment_containers) > 1: + container_csv = ', '.join( + '"%s"' % quote(c) for c in segment_containers) + raise HTTPBadRequest('All segments must be in one container. 
' + 'Found segments in %s' % container_csv) + if any(seg.get('sub_slo') for seg in segments): + raise HTTPBadRequest('No segments may be large objects.') + + # Auth checks + segment_container = segment_containers.pop() + if 'swift.authorize' in req.environ: + container_info = get_container_info( + req.environ, self.app, swift_source='SLO') + req.acl = container_info.get('write_acl') + aresp = req.environ['swift.authorize'](req) + req.acl = None + if aresp: + return aresp + + if bytes_to_wsgi(segment_container.encode('utf-8')) != container: + path = '/%s/%s/%s' % (vrs, account, bytes_to_wsgi( + segment_container.encode('utf-8'))) + seg_container_info = get_container_info( + make_env(req.environ, path=path, swift_source='SLO'), + self.app, swift_source='SLO') + req.acl = seg_container_info.get('write_acl') + aresp = req.environ['swift.authorize'](req) + req.acl = None + if aresp: + return aresp + + # Did our sanity checks; schedule segments to be deleted + ts = req.ensure_x_timestamp() + expirer_jobs = make_delete_jobs( + wsgi_to_str(account), segment_container, segment_objects, ts) + expiring_objects_account, expirer_cont = \ + self.expirer_config.get_expirer_account_and_container( + ts, wsgi_to_str(account), wsgi_to_str(container), + wsgi_to_str(obj)) + enqueue_req = make_pre_authed_request( + req.environ, + method='UPDATE', + path="/v1/%s/%s" % (expiring_objects_account, expirer_cont), + body=json.dumps(expirer_jobs), + headers={'Content-Type': 'application/json', + 'X-Backend-Storage-Policy-Index': '0', + 'X-Backend-Allow-Private-Methods': 'True'}, + ) + resp = enqueue_req.get_response(self.app) + if not resp.is_success: + self.logger.error( + 'Failed to enqueue expiration entries: %s\n%s', + resp.status, resp.body) + return HTTPServiceUnavailable() + # consume the response (should be short) + friendly_close(resp) + + # Finally, delete the manifest + return self.app + + def handle_multipart_delete(self, req): + """ + Will delete all the segments in the SLO manifest and then, if + successful, will delete the manifest file. 
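+        If async deletes are allowed and the request carries a true ``async``
+        query parameter, the segments are instead queued for the object
+        expirer and only the manifest is deleted inline.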
+ + :param req: a :class:`~swift.common.swob.Request` with an obj in path + :returns: swob.Response whose app_iter set to Bulk.handle_delete_iter + """ + if self.allow_async_delete and config_true_value( + req.params.get('async')): + return self.handle_async_delete(req) + + req.headers['Content-Type'] = None # Ignore content-type from client + resp = HTTPOk(request=req) + try: + out_content_type = req.accept.best_match(ACCEPTABLE_FORMATS) + except ValueError: + out_content_type = None # Ignore invalid header + if out_content_type: + resp.content_type = out_content_type + resp.app_iter = self.bulk_deleter.handle_delete_iter( + req, objs_to_delete=self.get_segments_to_delete_iter(req), + user_agent='MultipartDELETE', swift_source='SLO', + out_content_type=out_content_type) + return resp + + def handle_container_listing(self, req, start_response): + resp = req.get_response(self.app) + if not resp.is_success or resp.content_type != 'application/json': + return resp(req.environ, start_response) + if resp.content_length is None or \ + resp.content_length > MAX_CONTAINER_LISTING_CONTENT_LENGTH: + return resp(req.environ, start_response) + try: + listing = json.loads(resp.body) + except ValueError: + return resp(req.environ, start_response) + + for item in listing: + if 'subdir' in item: + continue + etag, params = parse_header(item['hash']) + if 'slo_etag' in params: + item['slo_etag'] = '"%s"' % params.pop('slo_etag') + item['hash'] = etag + ''.join( + '; %s=%s' % kv for kv in params.items()) + + resp.body = json.dumps(listing).encode('ascii') + return resp(req.environ, start_response) + + def __call__(self, env, start_response): + """ + WSGI entry point + """ + if env.get('swift.slo_override'): + return self.app(env, start_response) + + req = Request(env) + try: + vrs, account, container, obj = req.split_path(3, 4, True) + is_cont_or_obj_req = True + except ValueError: + is_cont_or_obj_req = False + if not is_cont_or_obj_req: + return self.app(env, start_response) + + if not obj: + if req.method == 'GET': + return self.handle_container_listing(req, start_response) + return self.app(env, start_response) + + try: + if req.method == 'PUT' and \ + req.params.get('multipart-manifest') == 'put': + return self.handle_multipart_put(req, start_response) + if req.method == 'DELETE' and \ + req.params.get('multipart-manifest') == 'delete': + return self.handle_multipart_delete(req)(env, start_response) + if req.method == 'GET' or req.method == 'HEAD': + return self.handle_multipart_get_or_head(req, start_response) + if 'X-Static-Large-Object' in req.headers: + raise HTTPBadRequest( + request=req, + body='X-Static-Large-Object is a reserved header. 
' + 'To create a static large object add query param ' + 'multipart-manifest=put.') + except HTTPException as err_resp: + return err_resp(env, start_response) + + return self.app(env, start_response) + + +def filter_factory(global_conf, **local_conf): + conf = global_conf.copy() + conf.update(local_conf) + + max_manifest_segments = int(conf.get('max_manifest_segments', + DEFAULT_MAX_MANIFEST_SEGMENTS)) + max_manifest_size = int(conf.get('max_manifest_size', + DEFAULT_MAX_MANIFEST_SIZE)) + yield_frequency = int(conf.get('yield_frequency', + DEFAULT_YIELD_FREQUENCY)) + allow_async_delete = config_true_value(conf.get('allow_async_delete', + 'true')) + + register_swift_info('slo', + max_manifest_segments=max_manifest_segments, + max_manifest_size=max_manifest_size, + yield_frequency=yield_frequency, + # this used to be configurable; report it as 1 for + # clients that might still care + min_segment_size=1, + allow_async_delete=allow_async_delete) + + def slo_filter(app): + return StaticLargeObject( + app, conf, + max_manifest_segments=max_manifest_segments, + max_manifest_size=max_manifest_size, + yield_frequency=yield_frequency, + allow_async_delete=allow_async_delete) + return slo_filter diff --git a/swift/common/middleware/staticweb.py b/swift/common/middleware/staticweb.py index 6aaeaa0ee9..d63032ae69 100644 --- a/swift/common/middleware/staticweb.py +++ b/swift/common/middleware/staticweb.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack, LLC. +# Copyright (c) 2010-2016 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,11 @@ """ This StaticWeb WSGI middleware will serve container data as a static web site with index file and error file resolution and optional file listings. This mode -is normally only active for anonymous requests. If you want to use it with -authenticated requests, set the ``X-Web-Mode: true`` header on the request. +is normally only active for anonymous requests. When using keystone for +authentication set ``delay_auth_decision = true`` in the authtoken middleware +configuration in your ``/etc/swift/proxy-server.conf`` file. If you want to +use it with authenticated requests, set the ``X-Web-Mode: true`` header on the +request. The ``staticweb`` filter should be added to the pipeline in your ``/etc/swift/proxy-server.conf`` file just after any auth middleware. Also, the @@ -28,26 +31,16 @@ ... [pipeline:main] - pipeline = catch_errors healthcheck cache ratelimit tempauth staticweb - proxy-logging proxy-server + pipeline = catch_errors healthcheck proxy-logging cache ratelimit tempauth + staticweb proxy-logging proxy-server ... [filter:staticweb] use = egg:swift#staticweb - # Seconds to cache container x-container-meta-web-* header values. 
-    # cache_timeout = 300
-    # You can override the default log routing for this filter here:
-    # set log_name = staticweb
-    # set log_facility = LOG_LOCAL0
-    # set log_level = INFO
-    # set access_log_name = staticweb
-    # set access_log_facility = LOG_LOCAL0
-    # set access_log_level = INFO
-    # set log_headers = False
 
 Any publicly readable containers (for example, ``X-Container-Read: .r:*``, see
-`acls`_ for more information on this) will be checked for
+:ref:`acls` for more information on this) will be checked for
 X-Container-Meta-Web-Index and X-Container-Meta-Web-Error header values::
 
     X-Container-Meta-Web-Index  <index.name>
@@ -64,9 +57,10 @@
 ``X-Container-Meta-Web-Error: error.html`` will serve .../404error.html for
 requests for paths not found.
 
-For psuedo paths that have no <index.name>, this middleware can serve HTML file
+For pseudo paths that have no <index.name>, this middleware can serve HTML file
 listings if you set the ``X-Container-Meta-Web-Listings: true`` metadata item
-on the container.
+on the container. Note that the listing must be authorized; you may want a
+container ACL like ``X-Container-Read: .r:*,.rlistings``.
 
 If listings are enabled, the listings can have a custom style sheet by setting
 the X-Container-Meta-Web-Listings-CSS header. For instance, setting
@@ -75,6 +69,29 @@
 listing page, you will see the well defined document structure that can be
 styled.
 
+Additionally, prefix-based :ref:`tempurl` parameters may be used to authorize
+requests instead of making the whole container publicly readable. This gives
+clients dynamic discoverability of the objects available within that prefix.
+
+.. note::
+
+    ``temp_url_prefix`` values should typically end with a slash (``/``) when
+    used with StaticWeb. StaticWeb's redirects will not carry over any TempURL
+    parameters, as they likely indicate that the user created an overly-broad
+    TempURL.
+
+By default, the listings will be rendered with a label of
+"Listing of /v1/account/container/path". This can be altered by
+setting a ``X-Container-Meta-Web-Listings-Label: